In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from numpy import isnan
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score
import random
!pip install --upgrade category_encoders
import category_encoders as ce
%matplotlib inline

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation / future warnings emitted by pandas,
# NumPy and scikit-learn in the cells below, so API breakage can go unnoticed.
import warnings
warnings.filterwarnings('ignore')

Collecting category_encoders
  Downloading category_encoders-2.4.0-py2.py3-none-any.whl (86 kB)
[?25l[K     |███▉                            | 10 kB 23.8 MB/s eta 0:00:01[K     |███████▋                        | 20 kB 30.4 MB/s eta 0:00:01[K     |███████████▍                    | 30 kB 36.4 MB/s eta 0:00:01[K     |███████████████▏                | 40 kB 21.1 MB/s eta 0:00:01[K     |███████████████████             | 51 kB 6.7 MB/s eta 0:00:01[K     |██████████████████████▊         | 61 kB 7.9 MB/s eta 0:00:01[K     |██████████████████████████▌     | 71 kB 8.4 MB/s eta 0:00:01[K     |██████████████████████████████▎ | 81 kB 9.4 MB/s eta 0:00:01[K     |████████████████████████████████| 86 kB 4.1 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.0


  import pandas.util.testing as tm


In [29]:
# Location of the Adult census CSV.
# NOTE(review): this is an absolute path; adjust it when running outside the
# environment where the file lives under /content.
data = '/content/adult.csv'

# Seeds Python's built-in `random` module only (NumPy / scikit-learn randomness
# is controlled separately, e.g. through `random_state` arguments).
random.seed(10)

df = pd.read_csv(data, sep=',')

# Only the last expression of a cell is displayed automatically, so the shape
# and the preview have to be emitted explicitly to actually appear.
print(df.shape)
display(df.head(5))

# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


# Preprocessing

In [30]:
# Names of the string-typed (object) columns, including the `income` target.
categorical = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
    'income',
]

# Number of missing (NaN) entries per categorical column.
df[categorical].isna().sum()

workclass         0
education         0
marital-status    0
occupation        0
relationship      0
race              0
gender            0
native-country    0
income            0
dtype: int64

In [31]:
# Print the frequency table of every categorical column; each table is followed
# by two blank lines so consecutive tables are visually separated.
for var in categorical:
    frequency_table = df[var].value_counts()
    print(frequency_table, end='\n\n\n')

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64


HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: education, dtype: int64


Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: marital-status, dtype: int64


Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical      

In [32]:
# The dataset encodes missing values as the literal string '?' in these three
# columns; turn them into real NaN so pandas recognises them as missing.
#
# The replacement is assigned back to the frame instead of calling
# `df[col].replace(..., inplace=True)`: an in-place call on a column selection
# acts on an intermediate object, is deprecated in pandas 2.x and leaves `df`
# unchanged under Copy-on-Write. `np.nan` is used because the `np.NaN` alias
# no longer exists in NumPy 2.
missing_marker_columns = ['workclass', 'native-country', 'occupation']
df[missing_marker_columns] = df[missing_marker_columns].replace('?', np.nan)

# Missing-value count per categorical column after the replacement.
df[categorical].isnull().sum()

workclass         2799
education            0
marital-status       0
occupation        2809
relationship         0
race                 0
gender               0
native-country     857
income               0
dtype: int64

In [33]:
# Impute every missing value with the most frequent value of its column.
# `df.mode()` returns one row per mode; the first row is used for each column.
most_frequent_values = df.mode().iloc[0]
df = df.fillna(most_frequent_values)

# Confirm that no missing values remain in any column.
df.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [34]:
# Names of the integer-typed columns.
numerical = [
    'age',
    'fnlwgt',
    'educational-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Number of missing (NaN) entries per numerical column.
df[numerical].isna().sum()

age                0
fnlwgt             0
educational-num    0
capital-gain       0
capital-loss       0
hours-per-week     0
dtype: int64

# Gaussian approach + null score

In [68]:
# Train-test split: `income` is the target, every other column is a feature;
# 30% of the rows are held out for testing with a fixed random state.
X = df.drop(['income'], axis=1)
y = df['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# One-hot encoding of the categorical features. The encoder is fitted on the
# training split only and then applied unchanged to the test split.
encoder = ce.OneHotEncoder(cols=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

# Scaling: remember the column labels first, because the scaler returns plain
# NumPy arrays without them.
cols = X_train.columns

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Rebuild the frames with the original flat column labels. `cols` is passed
# directly: wrapping it in a list (`columns=[cols]`) would create a MultiIndex
# whose labels are one-element tuples instead of plain column names.
X_train = pd.DataFrame(X_train, columns=cols)
X_test = pd.DataFrame(X_test, columns=cols)

# Fitting the model
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predictions on the held-out split
y_pred = gnb.predict(X_test)

# Accuracies
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

# Null accuracy = share of the most frequent class in the test labels, i.e. the
# score of a model that always predicts that class. It is derived from `y_test`
# itself rather than from hand-copied counts, so it stays correct if the data
# or the split changes.
null_accuracy = y_test.value_counts(normalize=True).max()
print('Null accuracy score: {0:0.4f}'. format(null_accuracy))

# Discretization approach (EWD, EFD, PD, FFD and quantiles)

In [61]:
!pip install feature_engine
!pip install -U feature-engine
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.discretisation import EqualWidthDiscretiser

# Varying the number of bins (q) of the EqualFrequencyDiscretizer we can generate the different discretization values. More specifically:
# Quantiles -> q=4 (we divide by quantiles, i.e. four equal frequency bins)
# EFD -> q=10 (standard value for EFD)
# FFD -> q=1628 (total number of instances divided by 30, which is the recommended size of the bins for FFD)
# PD -> q=221 (square root of the number of instances)
# In order to execute the EWD, the function must be uncommented. The standard value are 10 bins
discretizer = EqualFrequencyDiscretiser(q = 221) 
#discretizer = EqualWidthDiscretiser(bins = 10)
discretizer.fit(df)
df_efd = discretizer.fit_transform(df)

cols = ['age', 'fnlwgt', 'educational-num', 'hours-per-week']

for column in cols:
  df_efd[column] = df_efd[column].astype('object')

for var in df_efd.columns: 
    print(df_efd[var].value_counts())
    print('\n')

df_efd.info()

0     1457
18    1348
17    1337
15    1335
5     1329
13    1325
16    1303
19    1280
10    1280
12    1278
21    1264
14    1253
24    1235
9     1232
11    1223
22    1206
6     1206
7     1195
23    1187
4     1178
25    1165
8     1153
2     1113
26    1104
29    1097
28    1096
3     1096
30    1081
27    1067
1     1053
34     877
33     866
32     847
31     845
35     738
36     711
38     621
37     613
39     564
41     555
40     551
42     523
44     450
43     449
45     394
47     340
46     335
48     284
55     257
50     238
54     238
49     237
57     216
51     178
56     157
52     149
53     133
Name: age, dtype: int64


Private             36705
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64


68    227
47    226
87    226
90    225
71    225
     ... 
91    217
88    217
72    217
48    216
69    2

In [62]:
# Split the discretised frame into the `income` target and the feature columns,
# then hold out 30% of the rows for testing with a fixed random state.
y = df_efd['income']
X = df_efd.drop(columns=['income'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# One-hot encode the listed columns; the encoder is fitted on the training
# split only and re-used as-is on the test split.
# NOTE(review): 'capital-gain' and 'capital-loss' are not listed here — confirm
# they are meant to reach the model without one-hot encoding.
encoder = ce.OneHotEncoder(
    cols=[
        'age',
        'workclass',
        'fnlwgt',
        'education',
        'educational-num',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'gender',
        'hours-per-week',
        'native-country',
    ]
)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the encoded training data.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred = mnb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy score: {0:0.4f}'.format(test_accuracy))

Model accuracy score: 0.8366
