In [1]:
import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects

## First Project: Adult Income Data

In [21]:
# Load the Adult dataset from GitHub.
# skipinitialspace=True tells pandas to skip spaces that follow a delimiter.
main = pd.read_csv(
    'https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv',
    skipinitialspace=True,
)

# Separate the target from the features.
# `x` is the same object as `main`; pop() removes the 'income' column from it
# in place and returns that column, so `x` is left with the feature columns only.
x = main
y = x.pop('income')


In [22]:
# Hold out 25% of the rows for testing.
# train_test_split shuffles the rows before splitting, so random_state is pinned
# to get the same partition on every Restart & Run All. Everything derived from
# x_train below (fill values, encoders, fitted models) depends on this partition.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1234,
)

## Data Preprocessing

In [23]:
# Missing-value handling.
# DataFrame.mode() lists the most frequent value(s) of each column; its first
# row is turned into a {column: fill value} mapping computed from x_train only.
train_mode = dict(
    x_train.mode().iloc[0]
)

# Replace NaNs column by column with the mapped value (returns a new frame).
x_train = x_train.fillna(value=train_mode)

# Display the fill values that were used.
train_mode

{'age': 34,
 'workclass': 'Private',
 'fnlwgt': 113364,
 'education': 'HS-grad',
 'education-num': 9,
 'marital-status': 'Married-civ-spouse',
 'occupation': 'Prof-specialty',
 'relationship': 'Husband',
 'race': 'White',
 'sex': 'Male',
 'capital-gain': 0,
 'capital-loss': 0,
 'hours-per-week': 40,
 'native-country': 'United-States'}

In [24]:
# Show the column labels of the training features.
x_train.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [25]:
# Encode the string-valued columns as integer codes, one LabelEncoder per column.
categorical_cols = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# One unfitted encoder per column, keyed by column name.
encoders = {col: LabelEncoder() for col in categorical_cols}

# Fit each encoder on its training column and overwrite that column with the codes.
# NOTE: x_train is modified in place here, so running this cell a second time
# would fit the encoders on the integer codes rather than the original labels.
for col, encoder in encoders.items():
    x_train[col] = encoder.fit_transform(x_train[col])

# Display the fitted encoders.
encoders

{'workclass': LabelEncoder(),
 'education': LabelEncoder(),
 'marital-status': LabelEncoder(),
 'occupation': LabelEncoder(),
 'relationship': LabelEncoder(),
 'race': LabelEncoder(),
 'sex': LabelEncoder(),
 'native-country': LabelEncoder()}

In [26]:
# Preview the first rows of x_train as it stands after the encoding cell.
x_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
9316,47,3,161558,0,6,3,13,1,2,1,0,0,45,38
7731,37,3,210945,11,9,2,5,0,4,1,0,0,24,38
14671,37,3,186434,11,9,2,7,0,2,1,0,0,40,38
30736,39,3,176335,12,14,2,3,0,4,1,7688,0,65,38
848,39,3,138192,9,13,2,2,0,4,1,0,0,40,38


In [27]:
# Train a random forest with 100 trees on the preprocessed training data.
# Forest construction is randomized, so random_state is fixed to make the
# fitted model the same on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=1234)
rf.fit(x_train, y_train)

RandomForestClassifier()

In [28]:
# Train an extra-trees classifier with 100 trees on the same training data.
# Tree construction is randomized, so random_state is fixed to make the
# fitted model the same on every run.
et = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et.fit(x_train, y_train)

ExtraTreesClassifier()

In [29]:
# Persist the preprocessing objects and both fitted models to the current
# working directory as compressed joblib files.

# Preprocessing: per-column fill values and the fitted label encoders.
joblib.dump(value=train_mode, filename="./train_mode.joblib", compress=True)
joblib.dump(value=encoders, filename="./encoders.joblib", compress=True)

# Models: the last dump stays the cell's final expression, so the list of
# written file names it returns is shown as the cell output.
joblib.dump(value=rf, filename="./RandomForest.joblib", compress=True)
joblib.dump(value=et, filename="./ExtraTrees.joblib", compress=True)


['./ExtraTrees.joblib']