In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Column labels for the data file; they are passed to read_csv explicitly,
# so the first line of the file is read as a data row rather than a header.
cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'Salary',
]
DATA_URL = 'https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/adult.data.txt'
adult_data = pd.read_csv(DATA_URL, names=cols)

In [2]:
# Preview the first five rows of the freshly loaded frame.
adult_data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Handling missing values

In [3]:
# Print a frequency table for every object-dtype column so that placeholder
# categories (such as "?") become visible.
object_columns = [name for name in adult_data if adult_data[name].dtype == "object"]
for name in object_columns:
    print("------------", name, "------------")
    print(adult_data[name].value_counts())

------------ workclass ------------
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64
------------ education ------------
 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: education, dtype: int64
------------ marital-status ------------
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse      

The features workclass, occupation and native-country have missing values, which are encoded as "?".

Because these variables are categorical, the missing values are replaced with the mode (most frequent value) of each feature.

In [4]:
# Impute the " ?" placeholder (the raw values carry a leading space) with the
# most frequent *known* category of each affected column.
for column in ("workclass", "occupation", "native-country"):
    known_values = adult_data.loc[adult_data[column] != " ?", column]
    if known_values.empty:
        # Every entry is the placeholder: there is nothing to impute from.
        continue
    # The mode is taken over known values only, so the placeholder itself can
    # never be selected as the replacement, even if it is the most common entry.
    mode = known_values.mode()[0]
    adult_data[column] = adult_data[column].replace(" ?", mode)

## Scaling
All numerical variables (except "education-num") are scaled to the [0, 1] range using a min-max scaler.

In [5]:
def MinMaxScaler(series):
    """Rescale a numeric pandas Series linearly onto the [0, 1] interval.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)``, carrying the same index as the
        input. When every value is identical the range is zero; a Series of
        zeros is returned in that case instead of the NaNs that 0/0 would give.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # Constant column: avoid 0/0 and map every value to 0.0.
        return (series - lowest).astype(float)
    return (series - lowest) / value_range
# Numeric columns to rescale; "education-num" is deliberately left as-is.
columns_to_scale = ["age", "fnlwgt", "hours-per-week", "capital-gain", "capital-loss"]
for column in columns_to_scale:
    # Each column is scaled on its own, using its own minimum and maximum.
    scaled_column = MinMaxScaler(adult_data[column])
    adult_data[column] = scaled_column
adult_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Salary
0,0.301370,State-gov,0.044302,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,0.021740,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,0.048238,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.000000,0.0,0.122449,United-States,<=50K
2,0.287671,Private,0.138113,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.000000,0.0,0.397959,United-States,<=50K
3,0.493151,Private,0.151068,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.000000,0.0,0.397959,United-States,<=50K
4,0.150685,Private,0.221488,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.000000,0.0,0.397959,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,Private,0.166404,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0.000000,0.0,0.377551,United-States,<=50K
32557,0.315068,Private,0.096500,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.000000,0.0,0.397959,United-States,>50K
32558,0.561644,Private,0.094827,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0.000000,0.0,0.397959,United-States,<=50K
32559,0.068493,Private,0.128499,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0.000000,0.0,0.193878,United-States,<=50K


In [6]:
# Cross-tabulate the textual education level against its numeric code.
pd.crosstab(index=adult_data["education"], columns=adult_data["education-num"])

education-num,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10th,0,0,0,0,0,933,0,0,0,0,0,0,0,0,0,0
11th,0,0,0,0,0,0,1175,0,0,0,0,0,0,0,0,0
12th,0,0,0,0,0,0,0,433,0,0,0,0,0,0,0,0
1st-4th,0,168,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5th-6th,0,0,333,0,0,0,0,0,0,0,0,0,0,0,0,0
7th-8th,0,0,0,646,0,0,0,0,0,0,0,0,0,0,0,0
9th,0,0,0,0,514,0,0,0,0,0,0,0,0,0,0,0
Assoc-acdm,0,0,0,0,0,0,0,0,0,0,0,1067,0,0,0,0
Assoc-voc,0,0,0,0,0,0,0,0,0,0,1382,0,0,0,0,0
Bachelors,0,0,0,0,0,0,0,0,0,0,0,0,5355,0,0,0


The table above shows that "education-num" is just an encoded version of the ordinal categorical column "education",
so the "education" column is dropped to avoid redundant information.

In [7]:
# "education" duplicates the information in "education-num"; remove it.
adult_data = adult_data.drop(columns=["education"])

### encoding categorical variables

In [8]:
# Collect every object-dtype column; these are the ones encoded later on.
list_of_categorical = [
    column for column in adult_data if adult_data[column].dtype == "object"
]
print("Categorical variables are:")
for column in list_of_categorical:
    print(column)

Categorical variables are:
workclass
marital-status
occupation
relationship
race
sex
native-country
Salary


All remaining categorical features are nominal, so they are encoded with scikit-learn's OneHotEncoder.

In [9]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Fit the encoder on the categorical columns, then show the categories it
# found (one array per input column, in the order of list_of_categorical).
categorical_frame = adult_data[list_of_categorical]
ohe = OneHotEncoder().fit(categorical_frame)
ohe.categories_

[array([' Federal-gov', ' Local-gov', ' Never-worked', ' Private',
        ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'],
       dtype=object),
 array([' Divorced', ' Married-AF-spouse', ' Married-civ-spouse',
        ' Married-spouse-absent', ' Never-married', ' Separated',
        ' Widowed'], dtype=object),
 array([' Adm-clerical', ' Armed-Forces', ' Craft-repair',
        ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners',
        ' Machine-op-inspct', ' Other-service', ' Priv-house-serv',
        ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support',
        ' Transport-moving'], dtype=object),
 array([' Husband', ' Not-in-family', ' Other-relative', ' Own-child',
        ' Unmarried', ' Wife'], dtype=object),
 array([' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other',
        ' White'], dtype=object),
 array([' Female', ' Male'], dtype=object),
 array([' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba',
        ' Domini

In [11]:
# transform() output is converted to a dense array via .toarray(), then
# copied into an integer array so the indicator columns hold 0/1 integers.
encoded_sparse = ohe.transform(adult_data[list_of_categorical])
array = encoded_sparse.toarray()
one_hot_encoded_array = np.array(array, dtype=int)


In [12]:
# (rows, encoded columns) of the dense one-hot matrix.
n_rows, n_encoded_columns = one_hot_encoded_array.shape
(n_rows, n_encoded_columns)

(32561, 85)

In [13]:
# Flatten the per-feature category arrays into one list whose order matches
# the column order of the one-hot matrix, then attach each indicator column
# to the frame under its category label (labels keep their leading space).
encoded_labels = [label for labels in ohe.categories_ for label in labels]
for position, label in enumerate(encoded_labels):
    adult_data[label] = one_hot_encoded_array[:, position]
adult_data.head(30)

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,...,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,<=50K,>50K
0,0.30137,State-gov,0.044302,13,Never-married,Adm-clerical,Not-in-family,White,Male,0.02174,...,0,0,0,0,0,1,0,0,1,0
1,0.452055,Self-emp-not-inc,0.048238,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,...,0,0,0,0,0,1,0,0,1,0
2,0.287671,Private,0.138113,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,...,0,0,0,0,0,1,0,0,1,0
3,0.493151,Private,0.151068,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,...,0,0,0,0,0,1,0,0,1,0
4,0.150685,Private,0.221488,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,...,0,0,0,0,0,0,0,0,1,0
5,0.273973,Private,0.184932,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,...,0,0,0,0,0,1,0,0,1,0
6,0.438356,Private,0.100448,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,...,0,0,0,0,0,0,0,0,1,0
7,0.479452,Self-emp-not-inc,0.134036,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,...,0,0,0,0,0,1,0,0,0,1
8,0.191781,Private,0.022749,14,Never-married,Prof-specialty,Not-in-family,White,Female,0.140841,...,0,0,0,0,0,1,0,0,0,1
9,0.342466,Private,0.099947,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.051781,...,0,0,0,0,0,1,0,0,0,1


Dropping the original categorical columns, which have now been replaced by their one-hot encoded columns.

In [14]:
# The raw text columns are no longer needed once their indicator columns exist.
adult_data = adult_data.drop(columns=list_of_categorical)

In [15]:
# Check the frame after the original categorical columns were removed.
adult_data.head(n=5)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,Federal-gov,Local-gov,Never-worked,Private,...,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,<=50K,>50K
0,0.30137,0.044302,13,0.02174,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0.452055,0.048238,13,0.0,0.0,0.122449,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,0.287671,0.138113,9,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
3,0.493151,0.151068,7,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
4,0.150685,0.221488,13,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


Dropping the columns "Female" and "<=50K": each belongs to a categorical variable with only two classes, so the remaining column ("Male" and ">50K" respectively) already carries the full information.

In [16]:
# " Male" alone encodes the two-class "sex" variable.
adult_data = adult_data.drop(columns=[" Female"])

In [17]:
# " >50K" alone encodes the two-class "Salary" variable.
adult_data = adult_data.drop(columns=[" <=50K"])

#### Feature Engineering

The columns "capital-gain" and "capital-loss" can be combined into a single column, "net-capital(gain/loss)", computed as gain minus loss.

In [18]:
# NOTE(review): both inputs were min-max scaled independently in the scaling
# cell, so this difference combines two separately normalized quantities
# rather than raw amounts — confirm that this is intended.
scaled_gain = adult_data["capital-gain"]
scaled_loss = adult_data["capital-loss"]
adult_data["net-capital(gain/loss)"] = scaled_gain - scaled_loss

In [19]:
# Confirm that the new "net-capital(gain/loss)" column was appended.
adult_data.head(n=5)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,Federal-gov,Local-gov,Never-worked,Private,...,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,>50K,net-capital(gain/loss)
0,0.30137,0.044302,13,0.02174,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.02174
1,0.452055,0.048238,13,0.0,0.0,0.122449,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.0
2,0.287671,0.138113,9,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0.0
3,0.493151,0.151068,7,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0.0
4,0.150685,0.221488,13,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.0


Dropping "capital-gain" and "capital-loss"

In [20]:
# Both source columns are now represented by "net-capital(gain/loss)".
adult_data = adult_data.drop(columns=["capital-gain", "capital-loss"])

In [21]:
# Check the frame after "capital-gain" and "capital-loss" were removed.
adult_data.head(n=5)

Unnamed: 0,age,fnlwgt,education-num,hours-per-week,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,...,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,>50K,net-capital(gain/loss)
0,0.30137,0.044302,13,0.397959,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.02174
1,0.452055,0.048238,13,0.122449,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0.0
2,0.287671,0.138113,9,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0.0
3,0.493151,0.151068,7,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0.0
4,0.150685,0.221488,13,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0


Splitting the data into features and target.

In [22]:
# Every column except the " >50K" indicator is a model input.
target_column = " >50K"
features_columns = [*adult_data.columns]
features_columns.remove(target_column)
print(features_columns)
# Sanity check: the target must no longer be listed (prints False).
print(target_column in features_columns)

['age', 'fnlwgt', 'education-num', 'hours-per-week', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private', ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay', ' Divorced', ' Married-AF-spouse', ' Married-civ-spouse', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed', ' Adm-clerical', ' Armed-Forces', ' Craft-repair', ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners', ' Machine-op-inspct', ' Other-service', ' Priv-house-serv', ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support', ' Transport-moving', ' Husband', ' Not-in-family', ' Other-relative', ' Own-child', ' Unmarried', ' Wife', ' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White', ' Male', ' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba', ' Dominican-Republic', ' Ecuador', ' El-Salvador', ' England', ' France', ' Germany', ' Greece', ' Guatemala', ' Haiti', ' Holand-Netherlands', ' Honduras', ' Hong', ' Hungary', ' India', ' Iran', ' Irelan

In [23]:
# Feature matrix: all rows, restricted to the input columns.
features = adult_data.loc[:, features_columns]
features

Unnamed: 0,age,fnlwgt,education-num,hours-per-week,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,...,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,net-capital(gain/loss)
0,0.301370,0.044302,13,0.397959,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.021740
1,0.452055,0.048238,13,0.122449,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0.000000
2,0.287671,0.138113,9,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0.000000
3,0.493151,0.151068,7,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0.000000
4,0.150685,0.221488,13,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.166404,12,0.377551,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0.000000
32557,0.315068,0.096500,9,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0.000000
32558,0.561644,0.094827,9,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0.000000
32559,0.068493,0.128499,9,0.193878,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0.000000


In [24]:
# Target vector: 1 where the salary is above 50K, 0 otherwise.
target = adult_data.loc[:, " >50K"]
target

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name:  >50K, Length: 32561, dtype: int32

### Splitting data into train and test split

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Fix the shuffle seed so that the train/test partition, and therefore the
# score reported at the end of the notebook, is reproducible on every
# Restart & Run All. The split proportions are left at the library default.
SPLIT_SEED = 42
trainX, testX, trainY, testY = train_test_split(
    features, target, random_state=SPLIT_SEED
)

### Using kNN

In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [35]:
# k-nearest-neighbours classifier; n_neighbors=5 spells out the library default.
knnc = KNeighborsClassifier(n_neighbors=5)

In [36]:
# Fit the classifier on the training partition.
knnc.fit(X=trainX, y=trainY)

KNeighborsClassifier()

In [37]:
# Mean accuracy on the held-out test partition.
knnc.score(X=testX, y=testY)

0.8213978626704336