In [45]:
# impport libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [46]:
# load datset
df = pd.read_csv('adult.csv')
df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32560,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


# Data Exploration

In [49]:
# check dataset information, descriptive statistics etc

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,32561.0,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0
fnlwgt,32561.0,189778.366512,105549.977697,12285.0,117827.0,178356.0,237051.0,1484705.0
education.num,32561.0,10.080679,2.57272,1.0,9.0,10.0,12.0,16.0
capital.gain,32561.0,1077.648844,7385.292085,0.0,0.0,0.0,0.0,99999.0
capital.loss,32561.0,87.30383,402.960219,0.0,0.0,0.0,0.0,4356.0
hours.per.week,32561.0,40.437456,12.347429,1.0,40.0,40.0,45.0,99.0


In [5]:
df.describe(include='O').transpose()

Unnamed: 0,count,unique,top,freq
workclass,32561,9,Private,22696
education,32561,16,HS-grad,10501
marital.status,32561,7,Married-civ-spouse,14976
occupation,32561,15,Prof-specialty,4140
relationship,32561,6,Husband,13193
race,32561,5,White,27816
sex,32561,2,Male,21790
native.country,32561,42,United-States,29170
income,32561,2,<=50K,24720


In [50]:
# check for the cardinality of each categorical column
cat_col = df.select_dtypes(include='O')
for cols in cat_col.columns:
    print(f'The column {cols} has {cat_col[cols].nunique()} variables which are: {cat_col[cols].unique()}' + '\n')

The column workclass has 9 variables which are: ['?' 'Private' 'State-gov' 'Federal-gov' 'Self-emp-not-inc' 'Self-emp-inc'
 'Local-gov' 'Without-pay' 'Never-worked']

The column education has 16 variables which are: ['HS-grad' 'Some-college' '7th-8th' '10th' 'Doctorate' 'Prof-school'
 'Bachelors' 'Masters' '11th' 'Assoc-acdm' 'Assoc-voc' '1st-4th' '5th-6th'
 '12th' '9th' 'Preschool']

The column marital.status has 7 variables which are: ['Widowed' 'Divorced' 'Separated' 'Never-married' 'Married-civ-spouse'
 'Married-spouse-absent' 'Married-AF-spouse']

The column occupation has 15 variables which are: ['?' 'Exec-managerial' 'Machine-op-inspct' 'Prof-specialty'
 'Other-service' 'Adm-clerical' 'Craft-repair' 'Transport-moving'
 'Handlers-cleaners' 'Sales' 'Farming-fishing' 'Tech-support'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']

The column relationship has 6 variables which are: ['Not-in-family' 'Unmarried' 'Own-child' 'Other-relative' 'Husband' 'Wife']

The column race has 5

#### The columns: relationship and race looks good as prediction class for multiclass task while income would be great for binary classification. I would use one of those.\
#### work class and occupation have '?' as entry which is not a vlaid data point\ 
#### Income column also needs some cleaning up

In [51]:
# check for and handle missing values

In [52]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

No explicitly missing values

In [53]:
df[(df['workclass'] == '?')].count()['age']

1836

In [54]:
df[(df['occupation'] == '?')].count()['age']

1843

In [55]:
df[(df['workclass'] == '?') & (df['occupation'] == '?')].count()['age']

1836

In [56]:
(1836/len(df)) * 100, ((1843 - 1836)/len(df)) * 100

(5.638647461687294, 0.021498111237369857)

I will drop rows where both are missing since that's just about 5 % of the data set and fill the rest missing columns which is less than .5% of dat set

# Data preprocessing

In [12]:
df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32560,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [57]:
# replace ? with nan
df.replace({'?':np.nan}, inplace=True)

In [58]:
# drop nan values since they are just about 5% of the data set
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 1 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30162 non-null  int64 
 1   workclass       30162 non-null  object
 2   fnlwgt          30162 non-null  int64 
 3   education       30162 non-null  object
 4   education.num   30162 non-null  int64 
 5   marital.status  30162 non-null  object
 6   occupation      30162 non-null  object
 7   relationship    30162 non-null  object
 8   race            30162 non-null  object
 9   sex             30162 non-null  object
 10  capital.gain    30162 non-null  int64 
 11  capital.loss    30162 non-null  int64 
 12  hours.per.week  30162 non-null  int64 
 13  native.country  30162 non-null  object
 14  income          30162 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [15]:
df.isna().sum()['age']

0

In [16]:
X = df.drop('income', axis=1)
y = df['income']

In [59]:
# encoding
X = pd.get_dummies(X, drop_first=True)
y = pd.get_dummies(y, drop_first=True)

In [60]:
# train test split
from sklearn.model_selection import train_test_split

In [61]:
seed = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=seed)
X_val, X_test, y_val, y_test = train_test_split(X, y, test_size=0.4, random_state=seed)

In [63]:
# scaling
from sklearn.preprocessing import RobustScaler

In [64]:
scaler = RobustScaler()

In [65]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# Testing various algorithms for the classification

In [66]:
# decision tree
from sklearn.tree import DecisionTreeClassifier

In [67]:
dtr_model = DecisionTreeClassifier()

In [68]:
dtr_model.fit(X_train,y_train)

DecisionTreeClassifier()

In [69]:
dtr_pred = dtr_model.predict(X_test)

In [70]:
from sklearn.metrics import classification_report

In [71]:
print(classification_report(y_test, dtr_pred))

              precision    recall  f1-score   support

           0       0.88      0.87      0.87      9069
           1       0.62      0.63      0.63      2996

    accuracy                           0.81     12065
   macro avg       0.75      0.75      0.75     12065
weighted avg       0.81      0.81      0.81     12065



In [73]:
# random forest
from sklearn.ensemble import RandomForestClassifier

In [74]:
rfc = RandomForestClassifier()

In [75]:
rfc.fit(X_train, y_train)

  rfc.fit(X_train, y_train)


RandomForestClassifier()

In [76]:
rfc_pred = rfc.predict(X_test)

In [77]:
print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      9069
           1       0.74      0.63      0.68      2996

    accuracy                           0.85     12065
   macro avg       0.81      0.78      0.79     12065
weighted avg       0.85      0.85      0.85     12065



In [78]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

In [79]:
knn = KNeighborsClassifier()

In [80]:
knn.fit(X_train, y_train)

  knn.fit(X_train, y_train)


KNeighborsClassifier()

In [81]:
knn_pred = knn.predict(X_test)

In [82]:
print(classification_report(y_test, knn_pred))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      9069
           1       0.71      0.68      0.69      2996

    accuracy                           0.85     12065
   macro avg       0.80      0.79      0.80     12065
weighted avg       0.85      0.85      0.85     12065



In [84]:
# SVM
from sklearn.svm import SVC

In [85]:
svc = SVC()

In [None]:
svc.fit(X_train, y_train)

  return f(**kwargs)


In [None]:
svc_pred = svc.predict(X_test)

In [None]:
print(classification_report(y_test, svc_pred))

### The random forest and KNN did better than the other two