In [1]:
import json 
import numpy as np
import pandas as pd 
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
# Load the adult dataset; skipinitialspace=True ignores whitespace that
# follows each delimiter in the CSV.
df = pd.read_csv(
    'https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv',
    skipinitialspace=True,
)

# Every column except the 'income' label is used as a model input.
x_cols = [name for name in df.columns if name != 'income']

# Input matrix and target column
X = df[x_cols]
y = df['income']

# Preview the first rows of the raw frame
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

## Pre-processing

In [4]:
# Fill missing values with the most frequent value of each training column.
# The first row of mode() holds one mode per column; it is kept as a plain
# dict (column name -> fill value) in `train_mode`.
most_frequent = X_train.mode().iloc[0]
train_mode = dict(most_frequent)
X_train = X_train.fillna(value=train_mode)
print(X_train)

       age  workclass  fnlwgt     education  education-num  \
29700   37    Private   34146       HS-grad              9   
1529    37    Private   26898       HS-grad              9   
27477   26    Private  190762  Some-college             10   
31950   35    Private  189092     Bachelors             13   
4732    23    Private  260019       7th-8th              4   
...    ...        ...     ...           ...            ...   
27852   24    Private  223367          11th              7   
23605   20    Private  127185  Some-college             10   
1318    41  State-gov  144928     Bachelors             13   
25299   40    Private  199303       HS-grad              9   
27439   38    Private   38312     Bachelors             13   

           marital-status       occupation relationship   race     sex  \
29700  Married-civ-spouse     Craft-repair      Husband  White    Male   
1529             Divorced  Exec-managerial    Unmarried  White  Female   
27477  Married-civ-spouse        

In [5]:
# convert categoricals
# Each categorical column gets its own LabelEncoder fitted on the training
# data; the fitted encoders are collected in `encoders`, keyed by column name.
categorical_columns = ['workclass', 'education', 'marital-status',
                       'occupation', 'relationship', 'race',
                       'sex','native-country']

# Guard against running this cell twice. The loop below overwrites the
# columns of X_train with integer codes, so a second run would fit every
# encoder on those codes instead of the original category strings and
# silently replace the real mapping. Check before `encoders` is reset so an
# accidental re-run leaves the existing encoders untouched.
already_encoded = [name for name in categorical_columns
                   if pd.api.types.is_numeric_dtype(X_train[name])]
if already_encoded:
    raise RuntimeError(
        "Columns already numeric: " + ", ".join(already_encoded) + ". "
        "Re-run the notebook from the train/test split before encoding again."
    )

encoders = {}
for column in categorical_columns:
    categorical_convert = LabelEncoder()
    X_train[column] = categorical_convert.fit_transform(X_train[column])
    encoders[column] = categorical_convert

## Algorithm training

In [6]:
# Train the Random Forest algorithm on the pre-processed training data.
# random_state is fixed (same seed as the train/test split) so repeated runs
# build the same forest; without it every run yields a different model.
rf = RandomForestClassifier(n_estimators = 100, random_state=1234)
rf = rf.fit(X_train, y_train)

In [7]:
# Train the Extra Trees algorithm on the pre-processed training data.
# random_state is fixed (same seed as the train/test split) so repeated runs
# build the same ensemble; without it every run yields a different model.
et = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et = et.fit(X_train, y_train)

In [8]:
# Persist the preprocessing objects (fill values and label encoders) and both
# trained models (Random Forest and Extra Trees) as compressed joblib files
# in the current directory.
joblib.dump(value=train_mode, filename="./train_mode.joblib", compress=True)
joblib.dump(value=encoders, filename="./encoders.joblib", compress=True)
joblib.dump(value=rf, filename="./random_forest.joblib", compress=True)
# Left as the last expression so the cell still displays the written path.
joblib.dump(value=et, filename="./extra_trees.joblib", compress=True)

['./extra_trees.joblib']