In [36]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn import metrics

In [37]:
df = pd.read_csv('cleaned_census.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head(5)

Unnamed: 0,nchild,nchild_under_5,nsibs,eldch,sex,age,occ1950,ed_group_h
0,0,0,0,0,Male,43,Operative and kindred workers (nec),1
1,0,0,0,0,Male,28,"Managers, officials, and proprietors (nec)",2
2,3,0,0,15,Male,36,Foremen (nec),1
3,0,0,0,0,Male,48,"Managers, officials, and proprietors (nec)",1
4,2,2,0,2,Male,27,Salesmen and sales clerks (nec),1


In [38]:
df['occ1950'] = np.where((df['occ1950'] == 'N/A (blank)'),
                           float('nan'), df['occ1950'])

df['occ1950'] = np.where((df['occ1950'] == 'Occupation missing/unknown'),
                           float('nan'), df['occ1950'])
df['sex'] = np.where((df['sex'] == 'Male'),
                           1, df['sex'])
df['sex'] = np.where((df['sex'] == 'Female'),
                           1, df['sex'])
df = df.dropna()

In [39]:
dummies =  pd.get_dummies(df.occ1950)

In [40]:
census = pd.concat([df, dummies], axis='columns')
census = census.drop(labels=['occ1950' ], axis='columns')
census

Unnamed: 0,nchild,nchild_under_5,nsibs,eldch,sex,age,ed_group_h,Accountants and auditors,Actors and actresses,Advertising agents and salesmen,...,"Tinsmiths, coppersmiths, and sheet metal workers","Tool makers, and die makers and setters",Truck and tractor drivers,Upholsterers,"Ushers, recreation and amusement",Veterinarians,Waiters and waitresses,Watchmen (crossing) and bridge tenders,"Weavers, textile",Welders and flame cutters
0,0,0,0,0,1,43,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,28,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,15,1,36,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,48,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,2,0,2,1,27,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188046,0,0,0,0,1,53,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188047,0,0,0,0,1,67,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188048,2,0,0,7,1,37,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188049,6,3,0,11,1,29,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
Y = census['ed_group_h']
X = census.drop(labels=['ed_group_h'], axis=1)


In [50]:
#SPLITTING THE DATA into TRAIN and TEST data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.20, random_state=0)

In [43]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [51]:
#Fitting Random Forest Classification to the Training Set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0, verbose=1)
classifier.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    8.5s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=1,
                       warm_start=False)

In [56]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [57]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [58]:
score = metrics.accuracy_score(y_test, y_pred)


In [59]:
print("Accuracy: " + str(score))

Accuracy: 0.5893488607056446
