In [35]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

In [36]:
import warnings
warnings.filterwarnings('ignore')

In [39]:
data = pd.read_csv("data/train.csv")


In [40]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0


In [41]:
feat_data = data.drop(["fnlwgt","marital-status","relationship"], axis=1).dropna()
feat_test_data =  test_data.drop(["fnlwgt","marital-status","relationship"], axis=1)

In [42]:
le = preprocessing.LabelEncoder()

In [43]:
le.fit(data["gender"])

LabelEncoder()

In [44]:
list(le.classes_)

['Female', 'Male']

In [45]:
feat_data["gender"] = le.fit_transform(feat_data["gender"])
feat_data["education"] = le.fit_transform(feat_data["education"])
feat_data["workclass"] = le.fit_transform(feat_data["workclass"])
feat_data["occupation"] = le.fit_transform(feat_data["occupation"])
feat_data["race"] = le.fit_transform(feat_data["race"])
feat_data["native-country"] = le.fit_transform(feat_data["native-country"])

In [46]:
feat_data.head()

Unnamed: 0,age,workclass,education,educational-num,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,2,10,16,3,4,1,99999,0,60,38,1
1,17,2,2,8,7,4,1,0,0,15,38,0
2,31,2,9,13,3,4,1,0,0,40,38,1
3,58,5,5,4,13,4,1,0,0,40,38,0
4,25,5,15,10,7,2,1,0,0,40,38,0


In [47]:
plt.figure(figsize=(15,12))
cor_map = feat_data.corr()
sns.heatmap(cor_map, annot=True, fmt='.3f', cmap='YlGnBu')
#plt.show()

<matplotlib.axes._subplots.AxesSubplot at 0x7f52224b4510>

In [48]:
from sklearn.model_selection import train_test_split
X = feat_data.drop(["income_>50K","race","native-country","education","workclass"], axis=1)
y = feat_data["income_>50K"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)

In [49]:
clf = RandomForestClassifier(n_estimators =1000, max_depth=20)

In [51]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=20, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [52]:
y_pred = clf.predict(X_test)

In [53]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
confusion_matrix(y_pred,y_test)

array([[8570, 1395],
       [ 624, 1630]])

In [54]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.93      0.86      0.89      9965
           1       0.54      0.72      0.62      2254

    accuracy                           0.83     12219
   macro avg       0.74      0.79      0.76     12219
weighted avg       0.86      0.83      0.84     12219



In [63]:
test_data = pd.read_csv("data/test.csv")

In [64]:
test_data["gender"] = le.fit_transform(test_data["gender"])
test_data["education"] = le.fit_transform(test_data["education"])
test_data["workclass"] = le.fit_transform(test_data["workclass"])
test_data["occupation"] = le.fit_transform(test_data["occupation"])
test_data["race"] = le.fit_transform(test_data["race"])
test_data["native-country"] = le.fit_transform(test_data["native-country"])
test_set = test_data.drop(["fnlwgt","marital-status","relationship","race","native-country","education","workclass"], axis=1).values

In [66]:
test_set

array([[39,  9,  2, ...,  0,  0, 40],
       [32, 12,  2, ...,  0,  0, 42],
       [47,  9,  2, ...,  0,  0, 40],
       ...,
       [24, 10,  0, ...,  0,  0, 40],
       [35, 10, 12, ...,  0,  0, 40],
       [37, 14,  9, ...,  0,  0, 45]])

In [67]:
y_pred = clf.predict(test_set)

In [68]:
s = pd.DataFrame(y_pred).reset_index()

In [69]:
s.rename(columns={0:"outcome"})

Unnamed: 0,index,outcome
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
894,894,0
895,895,0
896,896,0
897,897,0


In [70]:
feature_list = list(feat_data.columns)

# Get numerical feature importances
importances = list(clf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Variable: age                  Importance: 0.26
Variable: occupation           Importance: 0.19
Variable: workclass            Importance: 0.17
Variable: gender               Importance: 0.15
Variable: education            Importance: 0.1
Variable: race                 Importance: 0.07
Variable: educational-num      Importance: 0.05


In [72]:
np.savetxt('np.txt', s.values, fmt='%d')
