In [None]:
from google.colab import drive
 
# Accessing My Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import sklearn
import sklearn.model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import scipy
from scipy.stats import spearmanr , pointbiserialr
from collections import Counter
import math

In [None]:
data_file_name ="drive/My Drive/Colab Notebooks/adult.csv"
features = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"] 
dataset = pd.read_csv(data_file_name,names=features,
        sep=r'\s*,\s*',
        engine='python',
      )

In [None]:
print("Data : ")
print(dataset.shape)
dataset.head()

Data : 
(48842, 15)


Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Martial Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
n_greater_50k = dataset[dataset['Target'] == '>50K'].shape[0]
n_under_50k = dataset[dataset['Target'] == '<=50K'].shape[0]
percentage_greater = ((n_greater_50k)/(n_under_50k))*100
print("Number of all records : {}".format(dataset.shape[0]))
print("Number of records that earn more than 50K : {}".format(n_greater_50k))
print("Number of records that earn less than 50K : {}".format(n_under_50k))

Number of all records : 48842
Number of records that earn more than 50K : 11687
Number of records that earn less than 50K : 37155


# **Preprocess data** 

**Preprocess missing data**

In [None]:
# nan fields
for c in dataset.columns:
  num_nan = dataset[c].isin(["?"]).sum()
  if num_nan > 0:
    print("Column with nan field : {}".format(c))
    print("number of nan fields : {}".format(num_nan))
    print("percentage of records with nan fields : {0:.2f}%".format(float(num_nan)/dataset.shape[0]*100))

Column with nan field : Workclass
number of nan fields : 2799
percentage of records with nan fields : 5.73%
Column with nan field : Occupation
number of nan fields : 2809
percentage of records with nan fields : 5.75%
Column with nan field : Country
number of nan fields : 857
percentage of records with nan fields : 1.75%


Since the missing data percentages are quite small i remove records with missing fields

In [None]:
# omit missing data
dataset=dataset[dataset["Workclass"] != "?"]
dataset=dataset[dataset["Occupation"] != "?"]
dataset=dataset[dataset["Country"] != "?"]
print("Number of all records after removing records with missing fields : {}".format(dataset.shape[0]))

Number of all records after removing records with missing fields : 45222


**Preprocess Numerical Features**

**Binning**

In [None]:
dataset['Age'].unique()

array([39, 50, 38, 53, 28, 37, 49, 52, 31, 42, 30, 23, 32, 34, 25, 43, 40,
       54, 35, 59, 56, 19, 20, 45, 22, 48, 21, 24, 57, 44, 41, 29, 47, 46,
       36, 79, 27, 18, 33, 76, 55, 61, 70, 64, 71, 66, 51, 58, 26, 17, 60,
       90, 75, 65, 77, 62, 63, 67, 74, 72, 69, 68, 73, 81, 78, 88, 80, 84,
       83, 85, 82, 86, 89, 87])

In [None]:
dataset['Age'] = pd.cut(x=dataset['Age'], bins=[10,19, 29, 39, 49,59,69,79,89,99])
# , labels=['10s','20s', '30s', '40s','50s','60s','70s','80s','90s']
dataset.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Martial Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,Target
0,"(29, 39]",State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,"(49, 59]",Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,"(29, 39]",Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,"(49, 59]",Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,"(19, 29]",Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# find numeric features
dataset_numerics = dataset.select_dtypes(include=np.number)
dataset_numerics

Unnamed: 0,fnlwgt,Education-Num,Capital Gain,Capital Loss,Hours per week
0,77516,13,2174,0,40
1,83311,13,0,0,13
2,215646,9,0,0,40
3,234721,7,0,0,40
4,338409,13,0,0,40
...,...,...,...,...,...
48836,245211,13,0,0,40
48837,215419,13,0,0,36
48839,374983,13,0,0,50
48840,83891,13,5455,0,40


In [None]:
# normalize numeric features
for c in dataset_numerics.columns:
        max_value = dataset[c].max()
        min_value = dataset[c].min()
        dataset[c] = (dataset[c] - min_value) / (max_value - min_value)
dataset_numerics = dataset.select_dtypes(include=np.number)
dataset_numerics

Unnamed: 0,fnlwgt,Education-Num,Capital Gain,Capital Loss,Hours per week
0,0.043350,0.800000,0.021740,0.0,0.397959
1,0.047274,0.800000,0.000000,0.0,0.122449
2,0.136877,0.533333,0.000000,0.0,0.397959
3,0.149792,0.400000,0.000000,0.0,0.397959
4,0.219998,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...
48836,0.156895,0.800000,0.000000,0.0,0.397959
48837,0.136723,0.800000,0.000000,0.0,0.357143
48839,0.244762,0.800000,0.000000,0.0,0.500000
48840,0.047666,0.800000,0.054551,0.0,0.397959


**Preprocess Categorical Features**

In [None]:
# find Categorical features
dataset_Categorical = dataset.select_dtypes(exclude=np.number)
dataset_Categorical.head()

Unnamed: 0,Age,Workclass,Education,Martial Status,Occupation,Relationship,Race,Sex,Country,Target
0,"(29, 39]",State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,"(49, 59]",Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,"(29, 39]",Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,"(49, 59]",Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,"(19, 29]",Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


First i label categorical fields so that they become numeric and then i perform feature selection by comparing their corrolation then i will use one hot encoding on data

In [None]:
dataset['Target'] = dataset['Target'].replace('<=50K', 0).replace('>50K', 1)
featureset = dataset
dataset_Categorical = dataset_Categorical.drop('Target',axis=1)
for c in dataset_Categorical.columns:
  featureset[c] = dataset[c].astype('category').cat.codes
featureset.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Martial Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,Target
0,2,5,0.04335,9,0.8,4,0,1,4,1,0.02174,0.0,0.397959,38,0
1,4,4,0.047274,9,0.8,2,3,0,4,1,0.0,0.0,0.122449,38,0
2,2,2,0.136877,11,0.533333,0,5,1,4,1,0.0,0.0,0.397959,38,0
3,4,2,0.149792,1,0.4,2,5,0,2,1,0.0,0.0,0.397959,38,0
4,1,2,0.219998,9,0.8,2,9,5,2,0,0.0,0.0,0.397959,4,0


**Feature selection**

In [None]:
corr=[]
feature=[]
for c in featureset.columns:
  if c != 'Target':
    c_tmp = pointbiserialr(featureset['Target'],featureset[c])[0]
    feature.append(c)
    corr.append(abs(c_tmp))
pd.DataFrame({'correlation':corr , 'feature':feature}).sort_values(by=['correlation'], ascending=False).set_index('feature')

Unnamed: 0_level_0,correlation
feature,Unnamed: 1_level_1
Education-Num,0.3328
Relationship,0.253402
Age,0.230161
Hours per week,0.227199
Capital Gain,0.221034
Sex,0.21576
Martial Status,0.192711
Capital Loss,0.148687
Education,0.081196
Race,0.070844


Since fnlwgt has the least corrolation with target , therfore it's an irrelevant feature and we should drop this feature so that the model won't learn based on irrelevant feature

In [None]:
dataset = dataset.drop('fnlwgt',axis=1)

In [None]:
# one hot encoding for rest of categorical features
dataset_Categorical = dataset_Categorical.drop('Age',axis=1)
dataset = pd.get_dummies(dataset, columns=dataset_Categorical.columns)
dataset.head()

Unnamed: 0,Age,Education-Num,Capital Gain,Capital Loss,Hours per week,Target,Workclass_0,Workclass_1,Workclass_2,Workclass_3,Workclass_4,Workclass_5,Workclass_6,Education_0,Education_1,Education_2,Education_3,Education_4,Education_5,Education_6,Education_7,Education_8,Education_9,Education_10,Education_11,Education_12,Education_13,Education_14,Education_15,Martial Status_0,Martial Status_1,Martial Status_2,Martial Status_3,Martial Status_4,Martial Status_5,Martial Status_6,Occupation_0,Occupation_1,Occupation_2,Occupation_3,...,Country_1,Country_2,Country_3,Country_4,Country_5,Country_6,Country_7,Country_8,Country_9,Country_10,Country_11,Country_12,Country_13,Country_14,Country_15,Country_16,Country_17,Country_18,Country_19,Country_20,Country_21,Country_22,Country_23,Country_24,Country_25,Country_26,Country_27,Country_28,Country_29,Country_30,Country_31,Country_32,Country_33,Country_34,Country_35,Country_36,Country_37,Country_38,Country_39,Country_40
0,2,0.8,0.02174,0.0,0.397959,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,4,0.8,0.0,0.0,0.122449,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,2,0.533333,0.0,0.0,0.397959,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0.4,0.0,0.0,0.397959,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1,0.8,0.0,0.0,0.397959,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Split data**

In [None]:
dataset_features=dataset.drop('Target',axis=1)
labels = dataset['Target']
# train 0.6 , test = 0.2 , val = 0.2
x_train, x_test, y_train, y_test  = sklearn.model_selection.train_test_split(dataset_features, labels, test_size=0.2)
x_train, x_val, y_train, y_val = sklearn.model_selection.train_test_split(x_train, y_train, test_size=0.25) # 0.25 x 0.8 = 0.2

## **SVM**

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

svclassifier = SVC(kernel='linear')
svclassifier.fit(x_train, y_train)
y_pred = svclassifier.predict(x_test)
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.87      0.94      0.90      6811
           1       0.75      0.57      0.65      2234

    accuracy                           0.85      9045
   macro avg       0.81      0.75      0.78      9045
weighted avg       0.84      0.85      0.84      9045



# **KNN without Library**

In [26]:
def predict(x_train,y_train, x_test,y_test,k):
  pred = []
  for i in range(len(x_test)):
        dist=np.sqrt(np.sum(np.power(np.array(x_train) - np.array(x_test.iloc[i]),2),axis=1))  # euclidian_dist    
        q1, q2 =0 , 0
        for j in  np.array(dist).argsort()[:k]:
          if y_train[j] == 1:
              q1 += 1
          else:
              q2 += 1
            
        if q1 > q2:
            ans = 1
        else:
          ans = 0
        if(i%2000 == 0):
          print("Actual : "+str(y_test[i]) + " Predicted : "+str(ans))
        pred.append(ans)
  return pred

In [27]:
def score(pred, y_test):
  accuracy = ((pred == y_test).sum() / len(y_test) * 100)
  print("Accuracy: %.2f%%" % accuracy)
  return accuracy

In [28]:
x_test=x_test.reset_index(drop=True)
x_train=x_train.reset_index(drop=True)
x_val=x_val.reset_index(drop=True)
y_val=y_val.reset_index(drop=True)
y_train=y_train.reset_index(drop=True)
y_test=y_test.reset_index(drop=True)
result = []
best_accuracy = 0
best_k = 5
for k in range(5,25,3):
  print("For k : {}".format(k))
  predictions = predict(x_train,y_train,x_val,y_val,k)
  accuracy = score(predictions,y_val)
  if(accuracy>best_accuracy):
    best_accuracy = accuracy
    best_k = k

For k : 5
Actual : 0 Predicted : 1
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 1 Predicted : 0
Accuracy: 81.30%
For k : 8
Actual : 0 Predicted : 1
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 1 Predicted : 0
Accuracy: 82.01%
For k : 11
Actual : 0 Predicted : 1
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 1 Predicted : 0
Accuracy: 82.08%
For k : 14
Actual : 0 Predicted : 1
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 1 Predicted : 0
Accuracy: 82.26%
For k : 17
Actual : 0 Predicted : 1
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 1 Predicted : 0
Accuracy: 82.52%
For k : 20
Actual : 0 Predicted : 1
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 1 Predicted : 0
Accuracy: 82.28%
For k : 23
Actual : 0 Predicted : 1
Actual : 0 Predicted : 0
Actual : 0 Predicted : 

In [29]:
print("The best k is : {}".format(best_k))
print("The best accuracy is : %.2f%%" % best_accuracy)

The best k is : 17
The best accuracy is : 82.52%


In [30]:
# predict on test set 
predictions = predict(x_train,y_train,x_test,y_test,best_k)
accuracy = score(predictions,y_test)

Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 0 Predicted : 0
Actual : 0 Predicted : 1
Actual : 1 Predicted : 0
Accuracy: 83.15%


# **KNN with Library**

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

classifier = KNeighborsClassifier(n_neighbors=best_k)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89      6811
           1       0.70      0.56      0.63      2234

    accuracy                           0.83      9045
   macro avg       0.78      0.74      0.76      9045
weighted avg       0.83      0.83      0.83      9045

