In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pointbiserialr, spearmanr
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
data=pd.read_csv("adult-training.csv")

In [3]:
data.head()

Unnamed: 0,age,workclass,fniwgt,education,educational-num,marital-status,occupation,relationship,race,gender,gain,loss,hours,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
data.describe()

Unnamed: 0,age,fniwgt,educational-num,gain,loss,hours
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [5]:
data.shape

(32561, 15)

In [6]:
num_data=data.shape[0]
for c in data.columns:
    num_non=data[c].isin([" ?"]).sum()
    if num_non > 0:
        print(c)
        print(num_non)
        print("{0:.2f}%".format(float(num_non)/num_data*100))

workclass
1836
5.64%
occupation
1843
5.66%
native-country
583
1.79%


In [7]:
data=data[data["workclass"] != " ?"]
data=data[data["occupation"] != " ?"]
data=data[data["native-country"] != " ?"]

In [8]:
data.shape

(30162, 15)

In [9]:
scaler=MinMaxScaler()
numerical=['age','educational-num','gain','loss','hours','fniwgt']
features_minmax_transform=pd.DataFrame(data)
features_minmax_transform[numerical]=scaler.fit_transform(data[numerical])
display(features_minmax_transform.head(n=5))

Unnamed: 0,age,workclass,fniwgt,education,educational-num,marital-status,occupation,relationship,race,gender,gain,loss,hours,native-country,income
0,0.30137,State-gov,0.043338,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,Male,0.02174,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,0.047277,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,0.122449,United-States,<=50K
2,0.287671,Private,0.137244,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,0.397959,United-States,<=50K
3,0.493151,Private,0.150212,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,0.397959,United-States,<=50K
4,0.150685,Private,0.220703,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,0.397959,Cuba,<=50K


In [10]:
data['income']=data['income'].map({' <=50K':0,' >50K':1})
data['income'].head()

0    0
1    0
2    0
3    0
4    0
Name: income, dtype: int64

In [11]:
col_names=data.columns
for column in col_names:
    if data[column].dtype==object:
        data[column]=data[column].str.lstrip()
features_final=pd.get_dummies(features_minmax_transform)
encoded=list(features_final.columns)
print("{} total features after one hot encoding".format(len(encoded)))
encoded

105 total features after one hot encoding


['age',
 'fniwgt',
 'educational-num',
 'gain',
 'loss',
 'hours',
 'income',
 'workclass_Federal-gov',
 'workclass_Local-gov',
 'workclass_Private',
 'workclass_Self-emp-inc',
 'workclass_Self-emp-not-inc',
 'workclass_State-gov',
 'workclass_Without-pay',
 'education_10th',
 'education_11th',
 'education_12th',
 'education_1st-4th',
 'education_5th-6th',
 'education_7th-8th',
 'education_9th',
 'education_Assoc-acdm',
 'education_Assoc-voc',
 'education_Bachelors',
 'education_Doctorate',
 'education_HS-grad',
 'education_Masters',
 'education_Preschool',
 'education_Prof-school',
 'education_Some-college',
 'marital-status_Divorced',
 'marital-status_Married-AF-spouse',
 'marital-status_Married-civ-spouse',
 'marital-status_Married-spouse-absent',
 'marital-status_Never-married',
 'marital-status_Separated',
 'marital-status_Widowed',
 'occupation_Adm-clerical',
 'occupation_Armed-Forces',
 'occupation_Craft-repair',
 'occupation_Exec-managerial',
 'occupation_Farming-fishing',
 'oc

In [12]:
col_names=features_final.columns
param=[]
correlation=[]
abs_corr=[]
for c in col_names:
    if c != "income":
        if len(features_final[c].unique()) > 2:
            corr=spearmanr(features_final['income'],features_final[c])[0]
        else:
            corr=pointbiserialr(features_final['income'],features_final[c])[0]
        param.append(c)
        correlation.append(corr)
        abs_corr.append(abs(corr))
param_df=pd.DataFrame({'correlation':correlation,'parameter':param,'abs_corr':abs_corr})
param_df=param_df.sort_values(by=['abs_corr'], ascending=False)
param_df=param_df.set_index('parameter')
param_df

Unnamed: 0_level_0,abs_corr,correlation
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1
marital-status_Married-civ-spouse,0.445418,0.445418
relationship_Husband,0.401236,0.401236
educational-num,0.329910,0.329910
marital-status_Never-married,0.320053,-0.320053
gain,0.277917,0.277917
age,0.276778,0.276778
hours,0.267245,0.267245
relationship_Own-child,0.226186,-0.226186
gender_Female,0.216699,-0.216699
gender_Male,0.216699,0.216699


In [13]:
features_final_income=features_final['income']
features_final=features_final.drop(['income'],axis=1)

In [14]:
best_features=param_df.index[0:35].values
features_final=features_final[best_features]
features_final.columns

Index(['marital-status_Married-civ-spouse', 'relationship_Husband',
       'educational-num', 'marital-status_Never-married', 'gain', 'age',
       'hours', 'relationship_Own-child', 'gender_Female', 'gender_Male',
       'occupation_Exec-managerial', 'relationship_Not-in-family',
       'occupation_Prof-specialty', 'education_Bachelors', 'education_Masters',
       'occupation_Other-service', 'education_Prof-school',
       'relationship_Unmarried', 'loss', 'workclass_Self-emp-inc',
       'education_HS-grad', 'marital-status_Divorced', 'education_Doctorate',
       'relationship_Wife', 'workclass_Private', 'occupation_Adm-clerical',
       'occupation_Handlers-cleaners', 'race _Black', 'race _White',
       'education_11th', 'relationship_Other-relative',
       'occupation_Machine-op-inspct', 'marital-status_Separated',
       'education_10th', 'native-country_Mexico'],
      dtype='object')

In [15]:
features_final=features_final.sample(frac=1)
X_train,X_test,y_train,y_test=train_test_split(features_final,features_final_income,test_size=0.2,random_state=0)
print("Training set has {} samples".format(X_train.shape[0]))
print("Test set has {} samples".format(X_test.shape[0]))

Training set has 24129 samples
Test set has 6033 samples


In [16]:
clf_A=tree.DecisionTreeClassifier(max_depth=10)
clf_A.fit(X_train,y_train)
y_pred_dt=clf_A.predict(X_test)
y_train_score_dt=clf_A.predict(X_train)
print("Accuracy of the test model is:\nTest",accuracy_score(y_test,y_pred_dt,sample_weight=None))
print('Train',accuracy_score(y_train,y_train_score_dt,sample_weight=None))

Accuracy of the test model is:
Test 0.747555113542
Train 0.755978283393


In [17]:
clf_B=RandomForestClassifier(n_estimators=200, max_depth=10)
clf_B.fit(X_train,y_train)
y_pred_RF=clf_B.predict(X_test)
y_train_score_RF=clf_B.predict(X_train)
print("Accuracy for RandomForestClassifierndomForestClassifier: \nTest ",accuracy_score(y_test,y_pred_RF))
print("Train",accuracy_score(y_train,y_train_score_RF))

Accuracy for RandomForestClassifierndomForestClassifier: 
Test  0.751201723852
Train 0.751668117203


In [18]:
clf_C=GaussianNB()
clf_C.fit(X_train,y_train)
y_pred_GNB=clf_C.predict(X_test)
y_train_score_GNB=clf_C.predict(X_train)
print("Accuracy for Gaussian Naive Bayes Classifier : \nTest",accuracy_score(y_test,y_pred_GNB))
print('Train',accuracy_score(y_train,y_train_score_GNB))

Accuracy for Gaussian Naive Bayes Classifier : 
Test 0.732637162274
Train 0.73206514982


In [19]:
clf_D=LogisticRegression(max_iter=10000)
clf_D.fit(X_train,y_train)
y_pred_logistic=clf_D.predict(X_test)
y_train_score_logistic=clf_D.predict(X_train)
print("Accuracy of the test model is:\nTest",accuracy_score(y_test,y_pred_logistic,sample_weight=None))
print('Train',accuracy_score(y_train,y_train_score_logistic,sample_weight=None))

Accuracy of the test model is:
Test 0.751201723852
Train 0.751046458618


In [20]:
clf_E=GradientBoostingClassifier(learning_rate=0.1)
clf_E.fit(X_train,y_train)
y_pred_gdc=clf_E.predict(X_test)
y_train_score_gdc=clf_E.predict(X_train)
print("Accuracy of the test model is:\nTest",accuracy_score(y_test,y_pred_gdc,sample_weight=None))
print('Train',accuracy_score(y_train,y_train_score_gdc,sample_weight=None))

Accuracy of the test model is:
Test 0.750870213824
Train 0.752041112354


In [21]:
clf_F=AdaBoostRegressor(learning_rate=0.1)
clf_F.fit(X_train,y_train)
y_pred_abg=clf_F.predict(X_test)
y_train_score_abg=clf_F.predict(X_train)
print("Accuracy of the test model is:\nTest",accuracy_score(y_test,y_pred_abg.round(),sample_weight=None))
print('Train',accuracy_score(y_train,y_train_score_abg.round(),sample_weight=None))

Accuracy of the test model is:
Test 0.751201723852
Train 0.751046458618
