<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Hackathon - Predicting Income

_Authors: Jordan Gates, Joomart Achekeev, Brian Kim_

---

Data descriptions can be found here: https://archive.ics.uci.edu/ml/datasets/adult

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict

In [2]:
df_train = pd.read_csv('./data/large_train_sample.csv')
df_test = pd.read_csv('./data/test_data.csv')

In [3]:
df_train.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K


In [4]:
df_test.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,Male,0,0,50,United-States


### Data cleaning & Preprocessing

In [5]:
def clean_df(cur_df):
    
    # Remove spaces and snake case column names
    cur_df.columns = [col.strip().lower().replace('-', '_').lower() for col in cur_df.columns]
    
    for column in list(cur_df.columns):
        try:
            # Get rid of extra spaces in the cells
            cur_df[column] = [cell.strip().lower().replace('-', '_') for cell in cur_df[column]]
            
            # Get rid of the ? marks in the data
            cur_df[column] = ['unknown' if val == "?" else val for val in cur_df[column]]
            
        except: # If the column contains ints or floats
            pass
        
    return cur_df

In [6]:
df_train = clean_df(df_train)
df_test = clean_df(df_test)

In [7]:
df_train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
wage              0
dtype: int64

In [8]:
df_test.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
dtype: int64

In [9]:
# Categorize the wage column: less then 50 thousand is 0, over 50 thousand is 1
df_train['wage'] = [0 if i == "<=50k" else 1 for i in df_train['wage']]
df_train['wage'].value_counts()

0    24720
1     7841
Name: wage, dtype: int64

In [10]:
df_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,sex,capital_gain,capital_loss,hours_per_week,native_country,wage
0,39,state_gov,77516,bachelors,13,never_married,adm_clerical,not_in_family,male,2174,0,40,united_states,0
1,50,self_emp_not_inc,83311,bachelors,13,married_civ_spouse,exec_managerial,husband,male,0,0,13,united_states,0
2,38,private,215646,hs_grad,9,divorced,handlers_cleaners,not_in_family,male,0,0,40,united_states,0
3,53,private,234721,11th,7,married_civ_spouse,handlers_cleaners,husband,male,0,0,40,united_states,0
4,28,private,338409,bachelors,13,married_civ_spouse,prof_specialty,wife,female,0,0,40,cuba,0


In [11]:
df_train.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
wage               int64
dtype: object

In [12]:
# Drop unnecesary columns
df_train.drop(columns=['fnlwgt', 'education', 'native_country'], axis=1, inplace=True)
df_test.drop(columns=['fnlwgt', 'education', 'native_country'], axis=1, inplace=True)

### Feature engineering

In [13]:
# Dummify 'workclass', 'marital_status', 'occupation', 'relationship', 'sex'
df_train = pd.get_dummies(df_train, columns=['workclass', 'marital_status', 'occupation', 'relationship', 'sex'])
df_test = pd.get_dummies(df_test, columns=['workclass', 'marital_status', 'occupation', 'relationship', 'sex'])

In [14]:
df_train.shape

(32561, 45)

### Modeling

In [15]:
# Null Model
# If we predict that everyone makes less than $50k per year, we would be right 75% of the time
df_train['wage'].value_counts(normalize= True)

0    0.75919
1    0.24081
Name: wage, dtype: float64

In [16]:
# As we have train data separated from test data in two different databases we will use df_train
# to identify X_train and y_train
X_train = df_train.drop('wage', axis=1)
y_train = df_train['wage']

In [17]:
ss = StandardScaler()
ss.fit(X_train)
Z_train = ss.transform(X_train)

In [18]:
ada = AdaBoostClassifier()
params = {
    'n_estimators': [75, 80, 85, 90, 95],
    'learning_rate': [0.8, 0.9, 1.0]
}
grid = GridSearchCV(ada, param_grid=params)

In [19]:
grid.fit(Z_train, y_train)

GridSearchCV(estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.8, 0.9, 1.0],
                         'n_estimators': [75, 80, 85, 90, 95]})

In [20]:
grid.best_params_

{'learning_rate': 1.0, 'n_estimators': 90}

In [21]:
grid.best_score_

0.8645005915215496

In [22]:
# Save predictions to y_pred
y_pred = grid.predict(df_test)

In [23]:
y_pred = pd.DataFrame(y_pred, index=df_test.index, columns=['wage'])

In [24]:
# Save predictions to csv
y_pred.to_csv('./data/results/result_final.csv')

In [25]:
# Score the predictions
y_test = pd.read_csv('./data/y_test.csv')

correct = 0
for i in range(len(y_test)):
    if y_test.iloc[i,0] == y_pred.iloc[i,0]:
        correct += 1
correct/len(y_test)

0.8466310423192679