# Import packages

In [118]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pointbiserialr, spearmanr
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [119]:
data = pd.read_csv("adult.csv")
data.columns = ["age","workclass","fnlwgt","education","educational-num","marital-status","occupation","relationship","race","gender","gain","loss","hours","native-country","income"]
data

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,gain,loss,hours,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [120]:
n_records = data.shape[0]
n_greater_50k = data[data['income'] == '>50K'].shape[0]
n_at_most_50k = data[data['income'] == '<=50K'].shape[0]
greater_percent = (n_greater_50k / n_records) * 100
print("Total number of records: {}".format(n_records))
print("Individuals making more than $50,000: {}".format(n_greater_50k))
print("Individuals making at most $50,000: {}".format (n_at_most_50k))
print("Percentage of individuals making more than $50,000: {}%".format(greater_percent))

Total number of records: 48842
Individuals making more than $50,000: 11687
Individuals making at most $50,000: 37155
Percentage of individuals making more than $50,000: 23.928176569346054%


# Missing Value Imputation

In [121]:
col_names = data.columns
num_data = data.shape[0]
for c in col_names:
    num_non = data[c].isin(["?"]).sum()
    if num_non > 0:
        print(c)
        print(num_non)
        print("{0: .2f}%".format(float(num_non) / num_data * 100))

workclass
2799
 5.73%
occupation
2809
 5.75%
native-country
857
 1.75%


In [122]:
data = data[data["workclass"] != "?"]
data = data[data["occupation"] != "?"]
data = data[data["native-country"] != "?"]
data.shape

(45222, 15)

# One Hot Encoding

In [123]:
data.describe()

Unnamed: 0,age,fnlwgt,educational-num,gain,loss,hours
count,45222.0,45222.0,45222.0,45222.0,45222.0,45222.0
mean,38.547941,189734.7,10.11846,1101.430344,88.595418,40.938017
std,13.21787,105639.2,2.552881,7506.430084,404.956092,12.007508
min,17.0,13492.0,1.0,0.0,0.0,1.0
25%,28.0,117388.2,9.0,0.0,0.0,40.0
50%,37.0,178316.0,10.0,0.0,0.0,40.0
75%,47.0,237926.0,13.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [124]:
category_col = ['workclass','race','education','marital-status','occupation','relationship','gender','native-country','income']

for c in category_col:
    print(c)
    print(data[c].value_counts())

workclass
workclass
Private             33307
Self-emp-not-inc     3796
Local-gov            3100
State-gov            1946
Self-emp-inc         1646
Federal-gov          1406
Without-pay            21
Name: count, dtype: int64
race
race
White                 38903
Black                  4228
Asian-Pac-Islander     1303
Amer-Indian-Eskimo      435
Other                   353
Name: count, dtype: int64
education
education
HS-grad         14783
Some-college     9899
Bachelors        7570
Masters          2514
Assoc-voc        1959
11th             1619
Assoc-acdm       1507
10th             1223
7th-8th           823
Prof-school       785
9th               676
12th              577
Doctorate         544
5th-6th           449
1st-4th           222
Preschool          72
Name: count, dtype: int64
marital-status
marital-status
Married-civ-spouse       21055
Never-married            14598
Divorced                  6297
Separated                 1411
Widowed                   1277
Married-spous

In [125]:
data["income"].value_counts()[0] / data.shape[0]

0.7521560302507629

In [126]:
data["income"].value_counts()[1] / data.shape[0]

0.2478439697492371

# Normalization

In [127]:
# Split the data into features and target Label
income_raw = data['income']
features_raw = data.drop('income', axis = 1)

In [128]:
# Log-transform the skewed features
skewed = ['gain', 'loss']
features_log_transformed = pd.DataFrame(data = features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))

In [129]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical = ['age', 'educational-num', 'gain', 'loss', 'hours']
features_log_minmax_transform = pd.DataFrame(data = features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])
display(features_log_minmax_transform.head(n = 5))

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,gain,loss,hours,native-country
0,0.109589,Private,226802,11th,0.4,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,0.397959,United-States
1,0.287671,Private,89814,HS-grad,0.533333,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,0.5,United-States
2,0.150685,Local-gov,336951,Assoc-acdm,0.733333,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,0.397959,United-States
3,0.369863,Private,160323,Some-college,0.6,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0.777174,0.0,0.397959,United-States
5,0.232877,Private,198693,10th,0.333333,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,0.295918,United-States


In [130]:
features_final = pd.get_dummies(features_log_minmax_transform)
income = income_raw.map({'<=50k' :0, '>50k' :1})
encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))
encoded

104 total features after one-hot encoding.


['age',
 'fnlwgt',
 'educational-num',
 'gain',
 'loss',
 'hours',
 'workclass_Federal-gov',
 'workclass_Local-gov',
 'workclass_Private',
 'workclass_Self-emp-inc',
 'workclass_Self-emp-not-inc',
 'workclass_State-gov',
 'workclass_Without-pay',
 'education_10th',
 'education_11th',
 'education_12th',
 'education_1st-4th',
 'education_5th-6th',
 'education_7th-8th',
 'education_9th',
 'education_Assoc-acdm',
 'education_Assoc-voc',
 'education_Bachelors',
 'education_Doctorate',
 'education_HS-grad',
 'education_Masters',
 'education_Preschool',
 'education_Prof-school',
 'education_Some-college',
 'marital-status_Divorced',
 'marital-status_Married-AF-spouse',
 'marital-status_Married-civ-spouse',
 'marital-status_Married-spouse-absent',
 'marital-status_Never-married',
 'marital-status_Separated',
 'marital-status_Widowed',
 'occupation_Adm-clerical',
 'occupation_Armed-Forces',
 'occupation_Craft-repair',
 'occupation_Exec-managerial',
 'occupation_Farming-fishing',
 'occupation_Ha

In [131]:
data.replace(['Divorced', 'Married-AF-spouse',
                'Married-civ-spouse', 'Married-spouse-absent',
                'Never-married', 'Separated', 'Widowed'],
            ['not married', 'married', 'married', 'married',
                'not married', 'not married', 'not married'], inplace = True)

In [132]:
for col in category_col:
    b, c = np.unique(data[col], return_inverse=True)
    data[col] = c

data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,gain,loss,hours,native-country,income
0,25,2,226802,1,7,1,6,3,2,1,0,0,40,38,0
1,38,2,89814,11,9,0,4,0,4,1,0,0,50,38,0
2,28,1,336951,7,12,0,10,0,4,1,0,0,40,38,1
3,44,2,160323,15,10,0,6,0,2,1,7688,0,40,38,1
5,34,2,198693,0,6,1,7,1,4,1,0,0,30,38,0


# Feature Selection

In [133]:
col_names = data.columns
param = []
correlation = []
abs_corr = []

for c in col_names:
    if c != 'income':
        if len(data[c].unique()) <= 2:
            corr = spearmanr(data['income'],data[c])[0]
        else:
            corr = pointbiserialr(data['income'],data[c])[0]
        param.append(c)
        correlation.append(corr)
        abs_corr.append(abs(corr))

param_df = pd.DataFrame({'correlation':correlation,'parameter':param,'abs_corr':abs_corr})
param_df = param_df.sort_values(by=['abs_corr'], ascending=False)
param_df = param_df.set_index('parameter')
param_df

Unnamed: 0_level_0,correlation,abs_corr
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1
marital-status,-0.437678,0.437678
educational-num,0.3328,0.3328
relationship,-0.253402,0.253402
age,0.23704,0.23704
hours,0.227199,0.227199
gain,0.221034,0.221034
gender,0.21576,0.21576
loss,0.148687,0.148687
education,0.081196,0.081196
race,0.070844,0.070844


In [134]:
best_features = param_df.index[0:4].values
print('Best features:\t',best_features)

Best features:	 ['marital-status' 'educational-num' 'relationship' 'age']


# Prediction

In [135]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_final, data['income'], test_size=0.2, random_state= 0)
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 36177 samples.
Testing set has 9045 samples.


# Evaluation Metrics

In [136]:
TP = np.sum(data['income'])
FP = data['income'].count() - TP
TN = 0
FN = 0
accuracy = TP / (TP + FP + TN + FN)
recall = TP / (TP + FN)
precision = TP / (TP + FP)
beta = 0.5
fscore = (1 + beta**2) * ((precision * recall) / ((beta**2) * precision + recall))
print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))

Naive Predictor: [Accuracy score: 0.2478, F-score: 0.2917]


# Navie Bayes Prediction and Random Forest Classifier

In [137]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    results = {}
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    
    results['acc_train'] = accuracy_score (y_train[:300], predictions_train)
    results['acc_test'] = accuracy_score (y_test, predictions_test)
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5)
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
    return results


random_state = 42
clf_A = RandomForestClassifier(random_state=random_state)
clf_B = GaussianNB()

samples_100 = len(y_train)
samples_10 = int(len(y_train)/10)
samples_1 = int(len(y_train)/100)


results = {}
for clf in [clf_A, clf_B]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
        train_predict(clf, samples, X_train, y_train, X_test, y_test)

RandomForestClassifier trained on 361 samples.
RandomForestClassifier trained on 3617 samples.
RandomForestClassifier trained on 36177 samples.
GaussianNB trained on 361 samples.
GaussianNB trained on 3617 samples.
GaussianNB trained on 36177 samples.


In [138]:
# Unoptimized model results
print("Unoptimized Model")
print("----------------")
print("Accuracy score on testing data: {:.4f}".format(results['RandomForestClassifier'][2]['acc_test']))
print("F-score on testing data: {:.4f}".format(results['RandomForestClassifier'][2]['f_test']))
print("")
# Optimized model results
print("Optimized Model")
print("--------------")
best_clf = clf_A  # Assuming clf_A is the optimized model
best_predictions = best_clf.predict(X_test)
best_accuracy = accuracy_score(y_test, best_predictions)
best_fscore = fbeta_score(y_test, best_predictions, beta=0.5)
print("Final accuracy score on the testing data: {:.4f}".format(best_accuracy))
print("Final F-score on the testing data: {:.4f}".format(best_fscore))


Unoptimized Model
----------------
Accuracy score on testing data: 0.8451
F-score on testing data: 0.7007

Optimized Model
--------------
Final accuracy score on the testing data: 0.8451
Final F-score on the testing data: 0.7007


# Logistic Regression

In [139]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(solver='newton-cg')
logistic.fit(X_train, y_train)

y_pred_logistic = logistic.predict(X_test)
y_train_score_logistic = logistic.predict(X_train)

print("Accuracy of the model is: \nTest ", accuracy_score(y_test, y_pred_logistic, normalize=False, sample_weight=None))
print("Train",accuracy_score(y_train, y_train_score_logistic, normalize=False, sample_weight=None))



Accuracy of the model is: 
Test  7541
Train 30581




# Decision Trees Classfier

In [140]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=10)
clf = clf.fit(X_train, y_train)

In [141]:
y_pred_dt = clf.predict(X_test)
y_train_score_dt = clf.predict(X_train)

In [142]:
print("Accuracy of the model is:\nTest ", accuracy_score(y_test, y_pred_dt, normalize=True, sample_weight=None))
print("Train", accuracy_score(y_train, y_train_score_dt, normalize=True, sample_weight=None))

Accuracy of the model is:
Test  0.844776119402985
Train 0.8697238576996434


# GDC

In [143]:
from sklearn. ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier
gdc_model = GradientBoostingClassifier(learning_rate=0.01, random_state=41)
gdc_model.fit(X_train, y_train)
y_pred_gdc = gdc_model.predict(X_test)
y_train_score_gdc = gdc_model.predict(X_train)
print("Accuracy of the model is: \nTest", accuracy_score (y_test, y_pred_gdc, normalize=True, sample_weight=None))
print('Train', accuracy_score (y_train, y_train_score_gdc, normalize=True, sample_weight=None))

Accuracy of the model is: 
Test 0.8326147042564953
Train 0.8439063493379771
