# **Introduction**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Load the Adult income CSV from the Kaggle-style input directory.
DATA_PATH = '../input/adult-income-dataset/adult.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five raw rows.
df.head(5)

# **Cleaning Data**

In [4]:
# Treat the literal '?' as a missing-value marker, drop every row that
# contains one, and renumber the index from 0 so that later positional
# concatenation lines up.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
df = (
    df.replace('?', np.nan)
      .dropna(axis=0)
      .reset_index(drop=True)
)
df.head()

In [5]:

# Print the column dtypes and non-null counts of the cleaned frame.
# `info` is a method: without the call parentheses the cell only shows the
# bound-method repr instead of the summary.
df.info()

In [6]:
# Class balance of the target column before it is encoded.
income_col = df['income']
income_col.value_counts()

In [7]:
# Binary-encode the target: 1 where income is "<=50K", 0 otherwise.
# NOTE: this overwrites the column in place, so re-running the cell on the
# already-encoded column yields all zeros.
is_low_income = df['income'].eq("<=50K")
df['income'] = is_low_income.astype(int)

df['income'].head()

# **Splitting the DataFrame into numeric and string columns**

In [8]:
# Partition the columns by dtype: object (string) columns go to df_str,
# everything else (including the encoded target) goes to df_int.
df_int = df.select_dtypes(exclude=['object'])
df_str = df.select_dtypes(include=['object'])
df_str.describe(include='all')

In [9]:
# Preview the string-typed columns.
df_str.head(5)

In [10]:
# Preview the non-string columns.
df_int.head(5)

In [11]:
# One-hot encode every string column; the result replaces df_str.
df_str = pd.get_dummies(data=df_str)
df_str.head(5)

# **Normalizing numeric part**

In [12]:
# Separate the numeric features from the target and remember the feature
# names, which are needed again once the scaler has returned a bare array.
x = df_int.drop('income', axis=1)
y_last = df_int['income']
column_names = x.columns.values

In [13]:
from sklearn import preprocessing

# Standardize the numeric features with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test rows contribute to the scaling statistics.
scaler = preprocessing.StandardScaler()
scaler.fit(x)
x_stndrd = scaler.transform(x)
type(x_stndrd)

Convert `x_stndrd` from a NumPy array back to a DataFrame (restoring the column names) so that it can be concatenated with the one-hot-encoded string columns.

In [14]:
# Wrap the scaled values in a DataFrame and restore the feature names in one step.
x_stndrd = pd.DataFrame(x_stndrd, columns=column_names)
x_stndrd.head(5)

In [15]:
# Final feature matrix: scaled numeric columns side by side with the
# one-hot columns (rows are matched on the index).
feature_parts = [x_stndrd, df_str]
x_last = pd.concat(feature_parts, axis='columns')
x_last.head(5)

# **Train Test Split**

In [16]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible. `train_test_split` is already imported in the notebook's
# first cell, so the redundant re-import was dropped.
x_train, x_test, y_train, y_test = train_test_split(
    x_last,
    y_last,
    test_size=0.30,
    random_state=42,
)

In [17]:
from sklearn import tree 

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited Gini decision tree. Despite the variable name `neigh`,
# this is a tree, not a nearest-neighbours model.
neigh = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    min_samples_leaf=5,
    random_state=42,
)
neigh.fit(x_train, y_train)

In [19]:
# Unconstrained decision tree with otherwise default settings.
# A fixed random_state is passed because the tree's split search is
# randomized; without a seed the fitted tree, and therefore the reported
# accuracy, can differ between runs.
model = tree.DecisionTreeClassifier(random_state=42)
model = model.fit(x_train, y_train)

In [20]:
# Predictions of the unconstrained tree on the held-out rows.
dt = model.predict(X=x_test)

In [21]:
# Display the decision-tree predictions for the test rows.
dt

In [22]:
# Display the true test labels for comparison with the predictions.
y_test

In [23]:
# Test-set accuracy of the unconstrained decision tree.
metrics.accuracy_score(y_true=y_test, y_pred=dt)

In [24]:
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
confusion_matrix(y_test, dt)

In [25]:
plot_confusion_matrix(neigh,x_test,y_test)

# **Applying KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# First KNN attempt with K = 4; `neigh` is rebound to this classifier.
neigh = KNeighborsClassifier(n_neighbors=4)
neigh = neigh.fit(x_train, y_train)

In [None]:
# KNN predictions on the held-out rows.
y_hat = neigh.predict(X=x_test)

In [None]:
# Display the KNN predictions for the test rows.
y_hat

In [None]:
# Display the true test labels for comparison with the KNN predictions.
y_test

In [None]:

# Test-set accuracy of the current KNN model.
metrics.accuracy_score(y_true=y_test, y_pred=y_hat)

# **Selecting Best 'K' Value**

In [None]:
# Sweep K = 1 .. Ks-1: fit a KNN model for each K and record its test-set
# accuracy plus the standard error of the per-row hit/miss indicator.
# NOTE(review): K is being compared on the test split itself, so the
# accuracy of the chosen K is an optimistic estimate.
Ks = 30
k_values = range(1, Ks)
mean_acc = np.zeros(len(k_values))
std_acc = np.zeros(len(k_values))

for slot, n in enumerate(k_values):
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(x_train, y_train)
    y_hat = neigh.predict(x_test)
    hits = (y_hat == y_test)
    mean_acc[slot] = metrics.accuracy_score(y_test, y_hat)
    std_acc[slot] = np.std(hits) / np.sqrt(y_hat.shape[0])

mean_acc

In [None]:
# Accuracy versus K with a shaded band of one standard error either side.
# The legend entry now says 1xstd to match the band actually drawn (the
# original label claimed 3xstd), and the x-axis label typo is fixed.
k_axis = range(1, Ks)
plt.plot(k_axis, mean_acc, 'g', marker='o', label='Accuracy ')
plt.fill_between(
    k_axis,
    mean_acc - 1 * std_acc,
    mean_acc + 1 * std_acc,
    alpha=0.10,
    label='+/- 1xstd',
)
plt.legend()
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()

In [None]:
# Refit KNN with K = 13 and report its test accuracy.
# NOTE(review): 13 is hard-coded, presumably read off the sweep above;
# confirm it still matches the best K if the data or split changes.
neigh = KNeighborsClassifier(n_neighbors=13)
neigh.fit(x_train, y_train)
y_hat = neigh.predict(x_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_hat)

# **Confusion Matrix**

In [None]:
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
confusion_matrix(y_test, y_hat)

In [None]:
plot_confusion_matrix(neigh,x_test,y_test)