In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('C:/Users/Jay/Desktop/Python/EdvancerData/Data/census_income.csv')

In [3]:
df.head()
#The dependent variable that needs to be predicted is Y i.e. based on the data we need to check if an individual would make more than 50K or less than it

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Cleaning data

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    32561 non-null object
Y                 32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
#checking relation between education and education num
pd.crosstab(df['education'],df['education.num'])

#after checking the tabular format we conclude that both are related and one can be dropped, we choose to drop 'education' as it is object type
del df['education']

In [6]:
#Checking for other object type columns 
cat_cols=df.select_dtypes(include=['object']).columns
cat_cols

Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country', 'Y'],
      dtype='object')

In [7]:
#we can convert Y to 0,1 based on the 50K condition
df['Y'].value_counts()

df['Y'].dtype

df['Y']=np.where(df['Y']==" <=50K",0,1)

df['Y'].value_counts()

0    24720
1     7841
Name: Y, dtype: int64

In [8]:
#Among these, we'll check the number of unique values each column takes so that we can decide how they can be converted to int
df.select_dtypes(include=['object']).nunique().sort_values()

sex                2
race               5
relationship       6
marital.status     7
workclass          9
occupation        15
native.country    42
dtype: int64

In [9]:
#the issue might only be in native country, as it takes 42 unique values, we'd have 41 dummy variables which will increase the data size
#thus, checking the freq of those unique values
counts=df['native.country'].value_counts()

#a lot of the unique values have very low frequency and thus we choose to ignore them and instead consider values with freq>100
df=df[~df['native.country'].isin(counts[counts < 100].index)]

df['native.country'].value_counts()

 United-States    29170
 Mexico             643
 ?                  583
 Philippines        198
 Germany            137
 Canada             121
 Puerto-Rico        114
 El-Salvador        106
 India              100
Name: native.country, dtype: int64

In [10]:
#now that we have limited number of unique values, we will convert these cols to numeric
cat_cols=df.select_dtypes(include=['object']).columns
cat_cols

Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')

In [11]:
#converting all to dummies and deleting the original column
for col in cat_cols:
    dummy=pd.get_dummies(df[col],prefix=col,drop_first=True)
    df=pd.concat([df,dummy],axis=1)
    print(col)
    del df[col]
del dummy

workclass
marital.status
occupation
relationship
race
sex
native.country


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31172 entries, 0 to 32560
Data columns (total 53 columns):
age                                      31172 non-null int64
fnlwgt                                   31172 non-null int64
education.num                            31172 non-null int64
capital.gain                             31172 non-null int64
capital.loss                             31172 non-null int64
hours.per.week                           31172 non-null int64
Y                                        31172 non-null int32
workclass_ Federal-gov                   31172 non-null uint8
workclass_ Local-gov                     31172 non-null uint8
workclass_ Never-worked                  31172 non-null uint8
workclass_ Private                       31172 non-null uint8
workclass_ Self-emp-inc                  31172 non-null uint8
workclass_ Self-emp-not-inc              31172 non-null uint8
workclass_ State-gov                     31172 non-null uint8
workclass_ Without-pay 

## Modelling

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
df_train, df_test=train_test_split(df, test_size=0.2, random_state=2)

In [15]:
x_train = df_train.drop(['Y'],axis=1)
y_train = df_train['Y']

x_test = df_test.drop(['Y'],axis=1)
y_test = df_test['Y']

### Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix

In [17]:
logr=LogisticRegression()

In [18]:
logr.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
pred=logr.predict_proba(x_test)[:,1]

In [20]:
roc_auc_score(y_test,pred)

0.5819046162304036

In [21]:
pred1=logr.predict(x_test)

In [22]:
logr.score(x_test, y_test)

0.7932638331996792

In [23]:
from sklearn import metrics

In [24]:
cnf_matrix = metrics.confusion_matrix(y_test, pred1)
cnf_matrix

array([[4545,  154],
       [1135,  401]], dtype=int64)

In [25]:
accuracy=metrics.accuracy_score(y_test,pred1)
precision=metrics.precision_score(y_test,pred1)
recall=metrics.recall_score(y_test,pred1)

In [26]:
from sklearn.metrics import mean_squared_error
rmse=np.sqrt(mean_squared_error(y_test,pred1))
rmse

0.45468249009646367

In [27]:
#read: https://www.datacamp.com/community/tutorials/understanding-logistic-regression-python

### Logistic Regression w/ hyperparameter tuning

In [54]:
import warnings
warnings.filterwarnings('ignore')

In [55]:
logr_ht=LogisticRegression()
logr_ht

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [58]:
c_range=[0.001,0.01,0.1,1.0,10]

In [63]:
for i in c_range:
    logr_ht=LogisticRegression(C=i)
    logr_ht.fit(x_train, y_train)
    pred=logr_ht.predict_proba(x_test)[:,1]
    print(i , ' ' ,  roc_auc_score(y_test, pred))

0.001   0.5819489519046606
0.01   0.6022800451780521
0.1   0.6022803222760162
1.0   0.5819046162304036
10   0.6022803222760162


In [75]:
#With grid search CV we can check multiple combinations at once
penalty=['l1','l2']
c_range=[0.001,0.01,0.1,1.0,10]
class_weight = [None, 'balanced']

In [76]:
param_grid = dict(penalty=penalty, C=c_range, class_weight=class_weight)

In [77]:
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(estimator=logr, param_grid=param_grid, scoring='roc_auc', verbose=1, n_jobs=-1)

In [78]:
grid_result=grid.fit(x_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   18.0s finished


In [79]:
grid_result.best_score_

0.9075844560564764

In [81]:
grid_result.best_params_

{'C': 1.0, 'class_weight': 'balanced', 'penalty': 'l1'}

In [82]:
#read: https://towardsdatascience.com/hyperparameter-tuning-c5619e7e6624

In [84]:
#now, creating model with best params and checking the prediction
logr_best = LogisticRegression(C=1.0, class_weight='balanced', penalty='l1')
logr_best

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [85]:
logr_best.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [87]:
pred_bt=logr_best.predict_proba(x_test)[:,1]

In [88]:
roc_auc_score(y_test, pred_bt)

0.9054170435198978