In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn import metrics
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Location of the census income CSV, held in one named constant so it is easy to repoint.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run
# elsewhere until it is changed.
CENSUS_CSV = 'C:/Users/Jay/Desktop/Python/EdvancerData/Data/census_income.csv'
df = pd.read_csv(CENSUS_CSV)

In [3]:
# Preview the first rows. The dependent variable to predict is Y: based on the other
# columns, decide whether an individual makes more than 50K or not.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Cleaning data

In [4]:
# Summarise column dtypes and non-null counts to see what needs cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    32561 non-null object
Y                 32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
# Check the relation between education and education.num by cross-tabulating them.
# The table is kept in a variable so it can be inspected; as a bare mid-cell expression
# it would be computed and then discarded without ever being displayed.
education_xtab = pd.crosstab(df['education'], df['education.num'])

# Make the conclusion explicit: every education label pairs with exactly one
# education.num value and vice versa, so one of the two columns is redundant.
observed_pairs = education_xtab > 0
assert observed_pairs.sum(axis=1).eq(1).all() and observed_pairs.sum(axis=0).eq(1).all(), (
    "education and education.num are not a one-to-one mapping"
)

# Both columns carry the same information; drop 'education' because it is the object-typed one.
del df['education']

In [6]:
# List the columns that are still object-typed (strings).
object_frame = df.select_dtypes(include='object')
cat_cols = object_frame.columns
cat_cols

Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country', 'Y'],
      dtype='object')

In [7]:
# Encode the target Y as 0/1 based on the 50K condition: 0 for "<=50K", 1 for anything else.
# The raw labels are matched with a leading space (" <=50K"), so strip whitespace before
# comparing instead of depending on the exact padded string.
# The dtype guard keeps the cell safe to re-run: once Y is numeric, comparing it with a
# string matches nothing and would silently relabel every row as 1.
if not pd.api.types.is_numeric_dtype(df['Y']):
    income_label = df['Y'].str.strip()
    df['Y'] = np.where(income_label == "<=50K", 0, 1)

# Class balance after encoding.
df['Y'].value_counts()

0    24720
1     7841
Name: Y, dtype: int64

In [8]:
# Count the distinct values in each object-typed column (ascending) to decide how
# each one can be converted to integers.
categorical_frame = df.select_dtypes(include=['object'])
distinct_per_column = categorical_frame.nunique()
distinct_per_column.sort_values()

sex                2
race               5
relationship       6
marital.status     7
workclass          9
occupation        15
native.country    42
dtype: int64

In [9]:
# native.country is the only problematic column: with 42 unique values it would turn into
# 41 dummy variables, so look at how often each value occurs first.
counts = df['native.country'].value_counts()

# Many of the values are very rare. Drop the rows whose country occurs fewer than 100
# times, i.e. keep only countries with a frequency of at least 100.
rare_countries = counts.loc[counts.lt(100)].index
from_rare_country = df['native.country'].isin(rare_countries)
df = df.loc[~from_rare_country]

df['native.country'].value_counts()

 United-States    29170
 Mexico             643
 ?                  583
 Philippines        198
 Germany            137
 Canada             121
 Puerto-Rico        114
 El-Salvador        106
 India              100
Name: native.country, dtype: int64

In [10]:
# With the number of unique values limited, collect the object-typed columns again;
# these are the ones to convert to numeric.
remaining_objects = df.select_dtypes('object')
cat_cols = remaining_objects.columns
cat_cols

Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')

In [11]:
# One-hot encode every categorical column in a single call. pd.get_dummies with
# `columns=` removes each original column and appends its dummies, prefixed with the
# column name, in the order the columns are listed. This matches the column names and
# order of encoding and concatenating one column at a time, without rebuilding the frame
# on every iteration. drop_first=True drops the first level of each column.
# There is no temporary frame to delete afterwards, so nothing fails when cat_cols is empty.
df = pd.get_dummies(df, columns=list(cat_cols), drop_first=True)

# Report which columns were encoded.
for col in cat_cols:
    print(col)

workclass
marital.status
occupation
relationship
race
sex
native.country


In [12]:
# Re-check the schema after encoding; the remaining columns should now all be numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31172 entries, 0 to 32560
Data columns (total 53 columns):
age                                      31172 non-null int64
fnlwgt                                   31172 non-null int64
education.num                            31172 non-null int64
capital.gain                             31172 non-null int64
capital.loss                             31172 non-null int64
hours.per.week                           31172 non-null int64
Y                                        31172 non-null int32
workclass_ Federal-gov                   31172 non-null uint8
workclass_ Local-gov                     31172 non-null uint8
workclass_ Never-worked                  31172 non-null uint8
workclass_ Private                       31172 non-null uint8
workclass_ Self-emp-inc                  31172 non-null uint8
workclass_ Self-emp-not-inc              31172 non-null uint8
workclass_ State-gov                     31172 non-null uint8
workclass_ Without-pay 

## Modelling

In [13]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
split_frames = train_test_split(df, random_state=2, test_size=0.2)
df_train, df_test = split_frames

In [14]:
# Separate the predictors from the target column for both partitions.
target_col = 'Y'
x_train, y_train = df_train.drop(columns=[target_col]), df_train[target_col]
x_test, y_test = df_test.drop(columns=[target_col]), df_test[target_col]

### Logistic Regression

In [15]:
# logr=LogisticRegression()

# logr.fit(x_train, y_train)

# pred=logr.predict_proba(x_test)[:,1]

# print(roc_auc_score(y_test,pred))

# pred1=logr.predict(x_test)

# logr.score(x_test, y_test)

# cnf_matrix = metrics.confusion_matrix(y_test, pred1)
# cnf_matrix

# accuracy=metrics.accuracy_score(y_test,pred1)
# precision=metrics.precision_score(y_test,pred1)
# recall=metrics.recall_score(y_test,pred1)
# print('acc: ' + str(accuracy) + ' ' + 'precision: ' + str(precision) + ' ' + 'recall: ' + str(recall) )

# rmse=np.sqrt(mean_squared_error(y_test,pred1))
# rmse

# #read: https://www.datacamp.com/community/tutorials/understanding-logistic-regression-python

### Logistic Regression w/ hyperparameter tuning

In [16]:
# logr_ht=LogisticRegression()
# logr_ht

# c_range=[0.001,0.01,0.1,1.0,10]

# for i in c_range:
#     logr_ht=LogisticRegression(C=i)
#     logr_ht.fit(x_train, y_train)
#     pred=logr_ht.predict_proba(x_test)[:,1]
#     print(i , ' ' ,  roc_auc_score(y_test, pred))

# #With grid search CV we can check multiple combinations at once
# penalty=['l1','l2']
# c_range=[0.001,0.01,0.1,1.0,10]
# class_weight = [None, 'balanced']

# param_grid = dict(penalty=penalty, C=c_range, class_weight=class_weight)

# grid=GridSearchCV(estimator=logr, param_grid=param_grid, scoring='roc_auc', verbose=1, n_jobs=-1)

# grid_result=grid.fit(x_train, y_train)

# grid_result.best_score_

# grid_result.best_params_

# #read: https://towardsdatascience.com/hyperparameter-tuning-c5619e7e6624

# #now, creating model with best params and checking the prediction
# logr_best = LogisticRegression(C=1.0, class_weight='balanced', penalty='l1')
# logr_best

# logr_best.fit(x_train, y_train)

# pred_bt=logr_best.predict_proba(x_test)[:,1]

# roc_auc_score(y_test, pred_bt)

### Logistic Regression w/ K Fold cross validation

In [17]:
# logr_k = LogisticRegression()

# cv_result = cross_val_score(logr_k, x_train, y_train,cv=5,scoring='roc_auc', n_jobs=-1)
# #cv_result

# cv_result = cross_val_score(logr_k, x_test, y_test,cv=5,scoring='roc_auc', n_jobs=-1)
# cv_result

### Logistic Regression w/ Hyperparameter and K fold

In [18]:
# logr_best = LogisticRegression(C=1.0, class_weight='balanced', penalty='l1')

# cv_result = cross_val_score(logr_best, x_train, y_train,cv=5,scoring='roc_auc', n_jobs=-1)
# #cv_result

# cv_result = cross_val_score(logr_best, x_test, y_test,cv=5,scoring='roc_auc', n_jobs=-1)
# cv_result

### Logistic Regression w/ Hyperparameter value without checking manually

In [19]:
# logr_ht=LogisticRegression()
# logr_ht
# roc_dict={}

# c_range=[0.001,0.01,0.1,1.0,10]

# for i in c_range:
#     logr_ht=LogisticRegression(C=i)
#     logr_ht.fit(x_train, y_train)
#     pred=logr_ht.predict_proba(x_test)[:,1]
#     r1=roc_auc_score(y_test, pred)
#     roc_dict[i]=r1

# roc_dict

# Keymax = max(roc_dict, key=roc_dict.get) 
# print(Keymax) 

### Decision tree

In [20]:
# dtree=DecisionTreeClassifier()

# dtree.fit(x_train,y_train)

# pred=dtree.predict(x_test)

# print(roc_auc_score(y_test, pred))

# cnf_matrix = metrics.confusion_matrix(y_test, pred)
# cnf_matrix

# accuracy=metrics.accuracy_score(y_test,pred)
# precision=metrics.precision_score(y_test,pred)
# recall=metrics.recall_score(y_test,pred)
# print('acc: ' + str(accuracy) + ' ' + 'precision: ' + str(precision) + ' ' + 'recall: ' + str(recall) )

# rmse=np.sqrt(mean_squared_error(y_test,pred))
# print(rmse)

### Decision Tree + K Fold

In [24]:
# dtree_k = DecisionTreeClassifier()

# cv_result = cross_val_score(dtree_k, x_train, y_train,cv=5,scoring='roc_auc', n_jobs=-1)
# #print(cv_result)

# cv_result = cross_val_score(dtree_k, x_test, y_test,cv=5,scoring='roc_auc', n_jobs=-1)
# cv_result

### Decision tree + Hyper [only max nodes]

In [53]:
# max_node_list = [5,7,10,16,20,30,45,50,70]

# roc_dict = {}

# for i in max_node_list:
#     dtree_m = DecisionTreeClassifier(max_leaf_nodes=i)
#     dtree_m.fit(x_train, y_train)
#     pred=dtree_m.predict(x_test)
#     r1=roc_auc_score(y_test,pred)
#     roc_dict[i]=r1
  
# Keymax = max(roc_dict, key=roc_dict.get) 
# print(Keymax) 
# print(roc_dict)

### Decision tree + Hyper parameters

In [70]:
# param_dist = { 'criterion' : ['gini','entropy'],
#                'max_depth' : [3,5,10,50,None],
#                'min_samples_split':[2,5,10,15],
#                'min_samples_leaf':[1,5,10,15],
#                'class_weight':['balanced',None],
#                 'max_leaf_nodes' :[5,7,10,16,20,30,45,50,70]
#                 }

# dtree_ht=DecisionTreeClassifier()

# grid=GridSearchCV(estimator=dtree_ht, param_grid=param_dist, scoring='roc_auc', verbose=1, n_jobs=-1)

# grid_result=grid.fit(x_train, y_train)

# # print(grid_result.best_score_)

# print(grid_result.best_params_)


# using best params on train dataset
# dtree_best = DecisionTreeClassifier(criterion='entropy', class_weight='balanced', max_depth=50, min_samples_leaf=45, min_samples_split=2)

# dtree_best.fit(x_train, y_train)

# pred_d_best=dtree_best.predict(x_test)
# pred_d_best

# roc_auc_score(y_test, pred_d_best)

### Decision tree + Hyper + K Fold

In [72]:
# dtree_best = DecisionTreeClassifier(criterion='entropy', class_weight='balanced', max_depth=50, min_samples_leaf=45, min_samples_split=2)

# cv_result = cross_val_score(dtree_best, x_train, y_train,cv=5,scoring='roc_auc', n_jobs=-1)
# cv_result

# cv_result = cross_val_score(dtree_best, x_test, y_test,cv=5,scoring='roc_auc', n_jobs=-1)
# cv_result