In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn import metrics
import keras




In [2]:
df = pd.read_csv('cleaned_census.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head(5)

Unnamed: 0,nchild,nchild_under_5,nsibs,eldch,sex,age,occ1950,ed_group_h
0,0,0,0,0,Male,43,Operative and kindred workers (nec),1
1,0,0,0,0,Male,28,"Managers, officials, and proprietors (nec)",2
2,3,0,0,15,Male,36,Foremen (nec),1
3,0,0,0,0,Male,48,"Managers, officials, and proprietors (nec)",1
4,2,2,0,2,Male,27,Salesmen and sales clerks (nec),1


In [3]:
df['occ1950'] = np.where((df['occ1950'] == 'N/A (blank)'),
                           float('nan'), df['occ1950'])

df['occ1950'] = np.where((df['occ1950'] == 'Occupation missing/unknown'),
                           float('nan'), df['occ1950'])
df['sex'] = np.where((df['sex'] == 'Male'),
                           1, df['sex'])
df['sex'] = np.where((df['sex'] == 'Female'),
                           1, df['sex'])
df = df.dropna()

In [4]:
dummies =  pd.get_dummies(df.occ1950)

In [5]:
census = pd.concat([df, dummies], axis='columns')
census = census.drop(labels=['occ1950' ], axis='columns')
census

Unnamed: 0,nchild,nchild_under_5,nsibs,eldch,sex,age,ed_group_h,Accountants and auditors,Actors and actresses,Advertising agents and salesmen,...,"Tinsmiths, coppersmiths, and sheet metal workers","Tool makers, and die makers and setters",Truck and tractor drivers,Upholsterers,"Ushers, recreation and amusement",Veterinarians,Waiters and waitresses,Watchmen (crossing) and bridge tenders,"Weavers, textile",Welders and flame cutters
0,0,0,0,0,1,43,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,28,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,15,1,36,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,48,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,2,0,2,1,27,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188046,0,0,0,0,1,53,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188047,0,0,0,0,1,67,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188048,2,0,0,7,1,37,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188049,6,3,0,11,1,29,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
Y = census['ed_group_h']
X = census.drop(labels=['ed_group_h'], axis=1)


In [37]:
#SPLITTING THE DATA into TRAIN and TEST data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.25, random_state=0)

In [38]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(activation='relu', solver='adam', max_iter=500, verbose=1, random_state=1, learning_rate_init=0.005)
mlp.fit(X_train,y_train)

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)

Iteration 1, loss = 0.92000146
Iteration 2, loss = 0.88766208
Iteration 3, loss = 0.88031504
Iteration 4, loss = 0.87815429
Iteration 5, loss = 0.87592198
Iteration 6, loss = 0.87605665
Iteration 7, loss = 0.87419520
Iteration 8, loss = 0.87323320
Iteration 9, loss = 0.87344031
Iteration 10, loss = 0.87222570
Iteration 11, loss = 0.87199689
Iteration 12, loss = 0.87109810
Iteration 13, loss = 0.87024526
Iteration 14, loss = 0.87060786
Iteration 15, loss = 0.86880100
Iteration 16, loss = 0.86895339
Iteration 17, loss = 0.86894518
Iteration 18, loss = 0.86812756
Iteration 19, loss = 0.86776340
Iteration 20, loss = 0.86803942
Iteration 21, loss = 0.86736630
Iteration 22, loss = 0.86730558
Iteration 23, loss = 0.86713742
Iteration 24, loss = 0.86613069
Iteration 25, loss = 0.86555271
Iteration 26, loss = 0.86653655
Iteration 27, loss = 0.86702459
Iteration 28, loss = 0.86558913
Iteration 29, loss = 0.86545635
Iteration 30, loss = 0.86587672
Iteration 31, loss = 0.86460842
Iteration 32, los

In [39]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_train,predict_train))
print(classification_report(y_train,predict_train))

[[67022  8886   205  1587]
 [17880 13427   465  2479]
 [ 5258  5183   558  2809]
 [ 2313  3170   203  9593]]
              precision    recall  f1-score   support

           1       0.72      0.86      0.79     77700
           2       0.44      0.39      0.41     34251
           3       0.39      0.04      0.07     13808
           4       0.58      0.63      0.60     15279

    accuracy                           0.64    141038
   macro avg       0.53      0.48      0.47    141038
weighted avg       0.61      0.64      0.61    141038



In [40]:
print(confusion_matrix(y_test,predict_test))
print(classification_report(y_test,predict_test))

[[22538  2955    71   543]
 [ 5928  4190   180   837]
 [ 1786  1727   175   923]
 [  773  1092    59  3236]]
              precision    recall  f1-score   support

           1       0.73      0.86      0.79     26107
           2       0.42      0.38      0.40     11135
           3       0.36      0.04      0.07      4611
           4       0.58      0.63      0.60      5160

    accuracy                           0.64     47013
   macro avg       0.52      0.48      0.46     47013
weighted avg       0.60      0.64      0.61     47013



In [41]:
score = metrics.accuracy_score(y_test, predict_test)


In [42]:
print("Accuracy: " + str(score))

Accuracy: 0.64107799970221
