In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, r2_score, mean_squared_error, explained_variance_score
from sklearn.ensemble import RandomForestClassifier
from warnings import simplefilter
# Optionally turn off warnings once the models are producing good results
#   non convergence may mean the result is not perfect, but it might be good enough
# Credit to Jamie Shaffer
simplefilter("ignore", category=0)

In [2]:
train = pd.read_csv('./data/large_train_sample.csv')
test = pd.read_csv('./data/test_data.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   sex             32561 non-null  object
 9   capital-gain    32561 non-null  int64 
 10  capital-loss    32561 non-null  int64 
 11  hours-per-week  32561 non-null  int64 
 12  native-country  32561 non-null  object
 13  wage            32561 non-null  object
dtypes: int64(6), object(8)
memory usage: 3.5+ MB


In [4]:
train['marital-status'].value_counts()

 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital-status, dtype: int64

In [5]:
def proc(df):
    for column in df.columns:
        if df[column].dtype == 'O':    # strip spaces from beginning and end of columns with 'object' dtype
            df[column] = df[column].str.strip()
    df = pd.get_dummies(df, columns=['workclass'])
    df['workclass-govt'] = df[['workclass_Federal-gov', 'workclass_Local-gov', 'workclass_State-gov']].sum(1)    # group government work and binarize
    df.drop(columns=['workclass_Federal-gov', 'workclass_Local-gov', 'workclass_State-gov'], inplace=True)
#     df['govt'] = np.where((df.workclass == 'State-gov') | (df.workclass == 'Federal-gov') | (df.workclass == 'Local-gov'), 1, 0)    # group government work and binarize
#     df['prvt'] = np.where((df.workclass == 'Self-emp-not-inc') | (df.workclass == 'Private') | (df.workclass == 'Self-emp-inc'), 1, 0)    # group private work and binarize
#     df['other'] = np.where((df.workclass == 'Without-pay') | (df.workclass == 'Never-worked') | (df.workclass == '?'), 1, 0)    # group other work and binarize
    df['married'] = np.where((df['marital-status'] == 'Married-AF-spouse') | (df['marital-status'] == 'Married-civ-spouse'), 1, 0)    # group other work and binarize
    df['single'] = np.where((df['marital-status'] == 'Never-married') | (df['marital-status'] == 'Widowed') | (df['marital-status'] == 'Divorced'), 1, 0)    # group other work and binarize
    df['separated'] = np.where((df['marital-status'] == 'Married-spouse-absent') | (df['marital-status'] == 'Separated'), 1, 0)    # group other work and binarize
    df['US'] = np.where(df['native-country'] == 'United-States', 1, 0)    # group other work and binarize
    df['male'] = np.where(df['sex'] == 'Male', 1, 0)
    df['fnlwgt^2'] = df.fnlwgt ** 2
    df['hours-per-week^2'] = df['hours-per-week'] **2
    
    return df


In [6]:
train = proc(train)
test = proc(test)

In [7]:
# binarize wage column
train['wage'] = np.where(train['wage'] == '>50K', 0, 1)

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   age                         32561 non-null  int64 
 1   fnlwgt                      32561 non-null  int64 
 2   education                   32561 non-null  object
 3   education-num               32561 non-null  int64 
 4   marital-status              32561 non-null  object
 5   occupation                  32561 non-null  object
 6   relationship                32561 non-null  object
 7   sex                         32561 non-null  object
 8   capital-gain                32561 non-null  int64 
 9   capital-loss                32561 non-null  int64 
 10  hours-per-week              32561 non-null  int64 
 11  native-country              32561 non-null  object
 12  wage                        32561 non-null  int64 
 13  workclass_?                 32561 non-null  ui

In [9]:
features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'workclass_?', 'workclass_Never-worked', 'workclass_Private', 'workclass_Self-emp-inc',
            'workclass_Self-emp-not-inc', 'workclass_Without-pay', 'workclass-govt', 'married', 'single', 'separated', 'US', 'male', 'fnlwgt^2', 'hours-per-week^2']

In [10]:
# X = train.drop(columns=['wage'])
X = train[features]
y = train['wage']

In [11]:
y.value_counts(normalize=True)

1    0.75919
0    0.24081
Name: wage, dtype: float64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [15]:
sc = StandardScaler()

In [16]:
Z_train = sc.fit_transform(X)

In [17]:
rf = RandomForestClassifier(n_jobs=-1)
params = {
    'n_estimators':[110,115,120],
    'max_features':[None, 'auto'],
    'max_depth':[5,10,15]
}

gs = GridSearchCV(rf, param_grid=params, cv=5)
gs.fit(Z_train,y)

print(gs.best_score_)
gs.best_params_

0.8605389061227384


{'max_depth': 10, 'max_features': None, 'n_estimators': 110}

In [18]:
pred = gs.predict(test[features])

In [19]:
pred = pd.DataFrame(pred, columns=['wage'])

In [20]:
pred

Unnamed: 0,wage
0,1
1,0
2,0
3,0
4,1
...,...
16276,1
16277,1
16278,0
16279,0


In [21]:
pred.to_csv('./data/submission.csv', index=False)