<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Good Fast Cheap

_Authors: Jordan Gates, Joomart Achekeev, Brian Kim_

---

In [1]:
import pandas as pd

In [2]:
# Load the training sample (includes the `wage` target) and the test set from ./data.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/large_train_sample.csv', './data/test_data.csv')
)

In [3]:
# Show the first two raw training rows as a sanity check on the load.
df_train.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K


In [4]:
# Show the first two raw test rows as a sanity check on the load.
df_test.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,Male,0,0,50,United-States


### Data cleaning

In [5]:
# Normalise the column names of both frames in one pass: hyphens become
# underscores, spaces are removed, and everything is lower-cased.
for frame in (df_train, df_test):
    frame.columns = [
        name.replace('-', '_').replace(' ', '').lower()
        for name in frame.columns
    ]

In [6]:
def _normalize_text_cells(frame):
    """Strip, lower-case and de-hyphenate every string cell of `frame`.

    Numeric columns are skipped outright. In the remaining columns only
    actual `str` values are rewritten; any other value (for example NaN)
    passes through untouched. This replaces a bare `except: pass`, which
    silently skipped an entire column as soon as one cell was not a string
    and also hid any unrelated error.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.

    Returns
    -------
    pandas.DataFrame
        The same frame, with its text cells normalised.
    """
    for col in frame.columns:
        if pd.api.types.is_numeric_dtype(frame[col]):
            continue
        frame[col] = frame[col].map(
            lambda cell: cell.strip().lower().replace('-', '_')
            if isinstance(cell, str) else cell
        )
    return frame


# Trim surrounding whitespace and standardise the text values in both data sets.
df_train = _normalize_text_cells(df_train)
df_test = _normalize_text_cells(df_test)

In [7]:
# Relabel the "?" placeholder entries in these two columns as 'unknown'.
for frame in (df_train, df_test):
    for col in ('occupation', 'workclass'):
        frame[col] = frame[col].mask(frame[col] == "?", 'unknown')

In [8]:
# Binary target: 0 when wage is "<=50k", 1 for every other value (i.e. over 50 thousand).
df_train['wage'] = (df_train['wage'] != "<=50k").astype('int64')
df_train['wage'].value_counts()

0    24720
1     7841
Name: wage, dtype: int64

In [9]:
# Inspect the first five training rows after the cleaning steps.
df_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,sex,capital_gain,capital_loss,hours_per_week,native_country,wage
0,39,state_gov,77516,bachelors,13,never_married,adm_clerical,not_in_family,male,2174,0,40,united_states,0
1,50,self_emp_not_inc,83311,bachelors,13,married_civ_spouse,exec_managerial,husband,male,0,0,13,united_states,0
2,38,private,215646,hs_grad,9,divorced,handlers_cleaners,not_in_family,male,0,0,40,united_states,0
3,53,private,234721,11th,7,married_civ_spouse,handlers_cleaners,husband,male,0,0,40,united_states,0
4,28,private,338409,bachelors,13,married_civ_spouse,prof_specialty,wife,female,0,0,40,cuba,0


In [10]:
# List the dtype of each training column.
df_train.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
wage               int64
dtype: object

In [11]:
# Count rows per target class (0 vs 1) to see the class balance.
df_train['wage'].value_counts()

0    24720
1     7841
Name: wage, dtype: int64

In [12]:
# Remove the columns that are not used as model features from both frames.
unused_columns = ['fnlwgt', 'education', 'native_country']
for frame in (df_train, df_test):
    frame.drop(columns=unused_columns, inplace=True)

In [13]:
# Show the first two test rows to confirm the remaining columns.
df_test.head(2)

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,sex,capital_gain,capital_loss,hours_per_week
0,25,private,7,never_married,machine_op_inspct,own_child,male,0,0,40
1,38,private,9,married_civ_spouse,farming_fishing,husband,male,0,0,50


In [14]:
# Frequency of each workclass category in the training data.
df_train['workclass'].value_counts()

private             22696
self_emp_not_inc     2541
local_gov            2093
unknown              1836
state_gov            1298
self_emp_inc         1116
federal_gov           960
without_pay            14
never_worked            7
Name: workclass, dtype: int64

### Feature engineering

In [21]:
# One-hot encode the categorical columns of both frames.
categorical_cols = ['workclass', 'marital_status', 'occupation', 'relationship', 'sex']
df_train = pd.get_dummies(df_train, columns=categorical_cols)
df_test = pd.get_dummies(df_test, columns=categorical_cols)

# The two frames are encoded independently, so a category that occurs in only
# one of them would give train and test different dummy columns (or a different
# column order). Align the test frame to the training feature columns: dummies
# missing from test are added as all-zero columns, and dummies that the
# training data never produced are dropped.
feature_cols = [col for col in df_train.columns if col != 'wage']
df_test = df_test.reindex(columns=feature_cols, fill_value=0)

### Modeling

In [57]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict

In [24]:
# Train and test data come from two separate files, so df_train alone
# supplies both the target (wage) and the feature matrix (all other columns).
y_train = df_train['wage']
X_train = df_train.drop(columns=['wage'])

In [26]:
# Standardise the training features; fit_transform fits the scaler on X_train
# and returns the transformed values in a single call.
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)

In [74]:
# Candidate estimators; only `ada` is handed to the grid search in this cell.
ada = AdaBoostClassifier()
dt = DecisionTreeClassifier()
et = ExtraTreeClassifier()

# Values of n_estimators to try: 65 through 95 in steps of 5.
params = {'n_estimators': list(range(65, 100, 5))}
grid = GridSearchCV(estimator=ada, param_grid=params)

In [75]:
# Run the grid search on the standardized training features.
grid.fit(Z_train, y_train)

GridSearchCV(estimator=AdaBoostClassifier(),
             param_grid={'n_estimators': [65, 70, 75, 80, 85, 90, 95]})

In [76]:
# Parameter setting that the grid search selected as best.
grid.best_params_

{'n_estimators': 90}

In [77]:
# Best cross-validation score found by the grid search.
grid.best_score_

0.8645005915215496

In [78]:
# The grid search was fitted on standardized features (Z_train), so the test
# features must pass through the same fitted scaler before prediction; feeding
# the raw df_test values would put them on a different scale from the one the
# model was trained on. Reindexing to X_train's columns guarantees the same
# feature set and order as in training (absent dummy columns become zeros).
test_features = df_test.reindex(columns=X_train.columns, fill_value=0)
Z_test = ss.transform(test_features)
y_pred = grid.predict(Z_test)

In [79]:
# Wrap the predictions in a one-column frame named 'wage' that shares df_test's index.
y_pred = pd.DataFrame(y_pred, index=df_test.index, columns=['wage'])

In [80]:
# Write the predictions to disk as CSV.
y_pred.to_csv('./data/result_final.csv')