In [1]:
# importing different libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# models
import lightgbm as lgb
import xgboost as xgb 

# Evaluation
from datetime import datetime
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_roc_curve


In [16]:
# loading dataset
data = pd.read_csv('adultData.csv', header = None)

In [17]:
# Assigning names to the columns:
data.columns = ['Age', 'WorkClass', 'asdf', 'Education','EducatNum', 
                'MaritalStatus', 'Occupation', 'relation', 'Race', 'Gender',
               'CapitalGain', 'CapitalLoss', 'HrsPrWeek', 'Country', 'Income']
data.head(10)

Unnamed: 0,Age,WorkClass,asdf,Education,EducatNum,MaritalStatus,Occupation,relation,Race,Gender,CapitalGain,CapitalLoss,HrsPrWeek,Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [90]:
data.isnull().sum()

Age                0
asdf               0
EducatNum          0
CapitalGain        0
CapitalLoss        0
                  ..
 Not-in-family     0
 Other-relative    0
 Own-child         0
 Unmarried         0
 Wife              0
Length: 100, dtype: int64

In [None]:
data.

In [19]:
data.describe(include = 'all')

Unnamed: 0,Age,WorkClass,asdf,Education,EducatNum,MaritalStatus,Occupation,relation,Race,Gender,CapitalGain,CapitalLoss,HrsPrWeek,Country,Income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


In [20]:
le = LabelEncoder()
le.fit(data.Income)
data['Income']

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: Income, Length: 32561, dtype: object

In [21]:
le.classes_

array([' <=50K', ' >50K'], dtype=object)

In [22]:
data.Income = pd.Series(le.transform(data.Income))
data.Income.value_counts()

0    24720
1     7841
Name: Income, dtype: int64

In [23]:
data['Income']

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: Income, Length: 32561, dtype: int32

In [24]:
# Convert Categorical features into Numerical features:
cat_wc = pd.get_dummies(data.WorkClass,columns = ['WC'],drop_first=True)
cat_ed = pd.get_dummies(data.Education,columns = ['Ed'],drop_first = True)
cat_Mr = pd.get_dummies(data.MaritalStatus, columns =['Mrtl'],drop_first= True)
cat_ocup = pd.get_dummies(data.Occupation, columns = ['Occ'], drop_first= True)
cat_rel = pd.get_dummies(data.relation,columns = ['rel'], drop_first = True)
cat_race = pd.get_dummies(data.Race, columns= ['Race'], drop_first = True)
cat_contry = pd.get_dummies(data.Country,columns=['Cnt'], drop_first=True)


In [25]:
data.columns

Index(['Age', 'WorkClass', 'asdf', 'Education', 'EducatNum', 'MaritalStatus',
       'Occupation', 'relation', 'Race', 'Gender', 'CapitalGain',
       'CapitalLoss', 'HrsPrWeek', 'Country', 'Income'],
      dtype='object')

In [26]:
Cat_feat = ['WorkClass', 'Education', 'MaritalStatus',
       'Occupation', 'relation', 'Race', 'Gender', 'Country']
def drop_col(df,x): 
    for i in x:
        df.drop(i,axis =1, inplace = True) 
        
drop_col(data,Cat_feat)
data.head()

Unnamed: 0,Age,asdf,EducatNum,CapitalGain,CapitalLoss,HrsPrWeek,Income
0,39,77516,13,2174,0,40,0
1,50,83311,13,0,0,13,0
2,38,215646,9,0,0,40,0
3,53,234721,7,0,0,40,0
4,28,338409,13,0,0,40,0


In [29]:
# Merging categorical tables with data table
data = pd.concat([data,cat_wc,cat_contry,cat_ed,cat_Mr,cat_ocup,cat_race,cat_rel],axis = 1)
data.head()

Unnamed: 0,Age,asdf,EducatNum,CapitalGain,CapitalLoss,HrsPrWeek,Income,Federal-gov,Local-gov,Never-worked,...,Transport-moving,Asian-Pac-Islander,Black,Other,White,Not-in-family,Other-relative,Own-child,Unmarried,Wife
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [30]:
data.columns

Index(['Age', 'asdf', 'EducatNum', 'CapitalGain', 'CapitalLoss', 'HrsPrWeek',
       'Income', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private',
       ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay',
       ' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba',
       ' Dominican-Republic', ' Ecuador', ' El-Salvador', ' England',
       ' France', ' Germany', ' Greece', ' Guatemala', ' Haiti',
       ' Holand-Netherlands', ' Honduras', ' Hong', ' Hungary', ' India',
       ' Iran', ' Ireland', ' Italy', ' Jamaica', ' Japan', ' Laos', ' Mexico',
       ' Nicaragua', ' Outlying-US(Guam-USVI-etc)', ' Peru', ' Philippines',
       ' Poland', ' Portugal', ' Puerto-Rico', ' Scotland', ' South',
       ' Taiwan', ' Thailand', ' Trinadad&Tobago', ' United-States',
       ' Vietnam', ' Yugoslavia', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th',
       ' 7th-8th', ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors',
       ' Doctorate', ' HS-grad', ' Masters', ' Preschool'

In [35]:
# splitting dependent and independent variables:
x = data.drop('Income',axis = 1)
y = data['Income']

In [39]:
# Splitting the Dataset into test and Train set

x_train, x_test, y_train, y_test = train_test_split(x,y,random_state =101, test_size = 0.3, stratify = y)

In [40]:
# Applying Boosing models:
# Data is stored in DMatrix object

dtrain = xgb.DMatrix(x_train, label = y_train)
dtest = xgb.DMatrix(x_test)

In [41]:
# Parameters for xgboost:

parameters = {'max_depth': 8, 'eta': 0.5, 'silent':1, 'objective':'binary:logistic',
              'eval_matric' : 'auc', 'learning_rate': 0.1 }

In [45]:
# train our model:

start = datetime.now()
xgm = xgb.train(parameters, dtrain, num_boost_round= 50)

stop = datetime.now()

Parameters: { eval_matric, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [46]:
exec_time = stop - start
exec_time

datetime.timedelta(seconds=3, microseconds=676911)

In [61]:
# predicting the test set:

y_pred = xgm.predict(dtest)
y_pred = np.array(y_pred)
y_pred

array([0.73768884, 0.40273297, 0.03558855, ..., 0.2948648 , 0.1333308 ,
       0.02035492], dtype=float32)

In [62]:
# converting the probabilities into categorical taking cutoff as 0.5
for i in range(len(x_test)):
    if y_pred[i] > .5:
        y_pred[i] = 1
    else:
        y_pred[i] = 0

In [63]:
y_pred

array([1., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [66]:
# Evaluation Matrics

print('Accuracy score: ',accuracy_score(y_pred,y_test))
print(classification_report(y_test,y_pred), confusion_matrix(y_test,y_pred))

Accuracy score:  0.870713481420821
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      7417
           1       0.79      0.63      0.70      2352

    accuracy                           0.87      9769
   macro avg       0.84      0.79      0.81      9769
weighted avg       0.87      0.87      0.87      9769
 [[7028  389]
 [ 874 1478]]


In [74]:
from sklearn.metrics import roc_auc_score

In [76]:
auc_xgb = roc_auc_score(y_test, y_pred)
auc_xgb

0.7879771397570758

# LGB

In [77]:
train_data = lgb.Dataset(x_train,label = y_train)

In [78]:
# Parameters
param = {'num_leaves': 150, 'objective': 'binary', 'max_depth':7, 'learning_rate':.1, 'max_bin' : 200}
param['metric'] = ['auc', 'binary_logloss']

In [83]:
start = datetime.now()
lgbm = lgb.train(param,train_data,num_boost_round = 50)
stop = datetime.now()

In [84]:
lgbm_time = stop-start
lgbm_time

datetime.timedelta(microseconds=178520)

In [85]:
y_lgb_pred = lgbm.predict(x_test)
y_lgb_pred

array([0.7398974 , 0.40662381, 0.03638935, ..., 0.3433135 , 0.14490757,
       0.0152712 ])

In [86]:
for i in range(len(x_test)):
    if y_lgb_pred[i] > 0.5:
        y_lgb_pred[i] = 1
    else:
        y_lgb_pred[i] = 0
y_lgb_pred

array([1., 0., 0., ..., 0., 0., 0.])

In [87]:
# Evaluation Matrics

print('Accuracy score: ',accuracy_score(y_lgb_pred,y_test))
print(classification_report(y_test,y_lgb_pred), confusion_matrix(y_test,y_lgb_pred))

Accuracy score:  0.8711229399119664
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      7417
           1       0.79      0.63      0.70      2352

    accuracy                           0.87      9769
   macro avg       0.84      0.79      0.81      9769
weighted avg       0.87      0.87      0.87      9769
 [[7030  387]
 [ 872 1480]]


In [88]:
auc_lgb = roc_auc_score(y_test,y_lgb_pred)

In [89]:
auc_lgb

0.7885371352262085