In [603]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
plt.style.use('ggplot')



Importing the dataset

In [604]:
dataset = pd.read_csv('./Adult_Census_Income_Binary_Classification_dataset.csv');

dataset.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [605]:
dataset.shape

(32561, 14)

Replace missing values(?) with NaN

In [606]:
#replace missing data values
data_missing_columns = [' workclass', ' occupation', ' native-country']

for column in data_missing_columns:
    dataset[column] = dataset[column].replace('[?]', np.NaN, regex = True)

Mean value is used for missing data in capital-gain and capital-loss columns 

In [607]:
zero_not_accepted = [' capital-gain', ' capital-loss']

for column in zero_not_accepted:
    dataset[column] = dataset[column].replace(0, np.NaN)
    mean = int(dataset[column].mean(skipna = True))
    dataset[column] = dataset[column].replace(np.NaN, mean)

Converting categorical data into numerical values

In [608]:
#converting categorical data
dataset = dataset.dropna()

dataset[' income'] = dataset[' income'].replace({'<=50K': 0, '>50K':1}, regex=True).astype(int)



dataset[' occupation'] = dataset[' occupation'].map({'?': 0, ' Farming-fishing': 1, ' Tech-support': 2, 
                                                       ' Adm-clerical': 3, ' Handlers-cleaners': 4, ' Prof-specialty': 5,
                                                       ' Machine-op-inspct': 6, ' Exec-managerial': 7, 
                                                       ' Priv-house-serv': 8, ' Craft-repair': 9, ' Sales': 10, 
                                                       ' Transport-moving': 11, ' Armed-Forces': 12, ' Other-service': 13, 
                                                       ' Protective-serv': 14}).astype(int)

dataset[' sex'] = dataset[' sex'].map({' Male': 0, ' Female': 1}).astype(int)


dataset[' race'] = dataset[' race'].map({' Black': 0, ' Asian-Pac-Islander': 1, ' Other': 2, ' White': 3, 
                                             ' Amer-Indian-Eskimo': 4}).astype(int)



dataset[' marital-status'] = dataset[' marital-status'].map({' Married-spouse-absent': 0, ' Widowed': 1, 
                                                             ' Married-civ-spouse': 2, ' Separated': 3, ' Divorced': 4, 
                                                             ' Never-married': 5, ' Married-AF-spouse': 6}).astype(int)



dataset[' native-country'] = dataset[' native-country'].map({' Outlying-US(Guam-USVI-etc)': 0, ' Italy': 1, 
                                                             ' Portugal': 2, ' Laos': 3, ' Thailand': 4, ' Iran': 5,
                                                             ' India': 6, ' France': 7, ' Columbia': 8, 
                                                             ' Yugoslavia': 9, ' Dominican-Republic': 10, 
                                                             ' United-States': 11, ' Hong': 12, ' Ecuador': 13, 
                                                             ' Germany': 14, ' Japan': 15, ' Poland': 16, ' South': 17, 
                                                             ' Canada': 18, ' Guatemala': 19, ' China': 20, 
                                                             ' Cambodia': 21, ' Philippines': 22, ' Peru': 23, 
                                                             ' Jamaica': 24, ' Holand-Netherlands': 25, 
                                                             ' Trinadad&Tobago': 26, ' England': 27, ' Haiti': 28, 
                                                             ' Taiwan': 29, ' Vietnam': 30, ' Honduras': 31, 
                                                             ' Puerto-Rico': 32, ' Scotland': 33, ' Greece': 34, 
                                                             ' Cuba': 35, ' Hungary': 36, ' Ireland': 37, ' Mexico': 38, 
                                                             ' Nicaragua': 39, ' El-Salvador': 40}).astype(int)



dataset[' relationship'] = dataset[' relationship'].map({' Unmarried': 0, ' Other-relative': 1, ' Not-in-family': 2, 
                                                             ' Own-child': 3, ' Wife': 4, ' Husband': 5}).astype(int)

dataset[' education'] = dataset[' education'].map({' Preschool': 0, ' 11th': 1, ' Masters': 2, ' Assoc-acdm': 3, 
                                                   ' Doctorate': 4, ' HS-grad': 4, ' 12th': 5, ' 9th': 6, ' 1st-4th': 7, 
                                                   ' Bachelors': 8, ' Assoc-voc': 9, ' Prof-school': 10, ' Some-college': 11, 
                                                   ' 10th': 12, ' 7th-8th': 13, ' 5th-6th': 14}).astype(int)


dataset[' workclass'] = dataset[' workclass'].map({' Self-emp-inc': 0, ' Self-emp-not-inc': 1, ' Without-pay': 2, 
                                                   ' State-gov': 3, ' Private': 4, ' Local-gov': 5, ' Federal-gov': 6}).astype(int)

In [609]:
#split dataset
X = dataset.iloc[:, 0:13]
Y = dataset.iloc[:, 13]
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=11, test_size=0.3)


Finding the best K value for the model

In [None]:
#finding the best k value for the model
neighbors = np.arange(3, 19)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

for i,k in enumerate(neighbors):
    classifier = KNeighborsClassifier(n_neighbors = k, p = 2)
    
    classifier.fit(x_train, y_train)
    
    train_accuracy[i] = classifier.score(x_train, y_train)
    test_accuracy[i] = classifier.score(x_test, y_test)

In [None]:
#generate plot
%matplotlib inline
plt.title('KNN number of neighbors vs Accuracy')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.show()



13 is chosen as the K value since it shows the highest testing accuracy and also to make sure we 
choose a value high enough to avoid over fitting

In [610]:
#K is chosen as 13 since the highest testing accuracy is at 13
classifier = KNeighborsClassifier(n_neighbors = 13, p = 2)

In [611]:
#fitting the model
classifier.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=13, p=2,
           weights='uniform')

Testing and evaluating the model

In [612]:
#testing with the test data set
y_pred = classifier.predict(x_test)

In [613]:
#confusion matrix
cm = confusion_matrix(y_test, y_pred)

print('Confusion Matrix\n')
print(cm)
print('\n')

#confusion matrix
print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

Confusion Matrix
[[6301  474]
 [ 845 1429]]


Predicted     0     1   All
True                       
0          6301   474  6775
1           845  1429  2274
All        7146  1903  9049


In [614]:
#full classification report
print('Classification Report')
print(classification_report(y_test, y_pred))

Classification Report
             precision    recall  f1-score   support

          0       0.88      0.93      0.91      6775
          1       0.75      0.63      0.68      2274

avg / total       0.85      0.85      0.85      9049



In [615]:
#accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

print('Accuracy of the model: ' + str(accuracy))

Accuracy of the model: 0.8542380373521936
