In [3]:
# Script to train machine learning model.
import pandas as pd
import numpy as np
import joblib
import os
from scipy.fft import rfft
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Make sure the directory used for the saved encoder classes exists.
# exist_ok=True avoids the check-then-create race of exists() + mkdir().
os.makedirs('../encodings', exist_ok=True)

# Add the necessary imports for the starter code.

# Add code to load in the data.

# Process the test data with the process_data function.

def fit_encoders(data, cats, label):
    """Fit a LabelEncoder per column and persist its classes to disk.

    Every column named in ``cats`` plus the ``label`` column is fitted
    independently; the resulting ``classes_`` array is saved to
    ``../encodings/<column>.npy``.

    Parameters
    ----------
    data : mapping of column name -> values (indexed as ``data[column]``)
        Source of the values the encoders are fitted on.
    cats : list of str
        Names of the categorical feature columns.
    label : str
        Name of the target column, encoded the same way as the features.
    """
    columns_to_encode = cats + [label]
    for column in columns_to_encode:
        fitted = LabelEncoder().fit(data[column])
        np.save(f'../encodings/{column}.npy', fitted.classes_)


def process_data(data, categorical_features, label, training=True):
    """Label-encode the categorical columns and split features from the label.

    The encoder classes are loaded from ``../encodings/<column>.npy`` (the
    files written by ``fit_encoders``), so those files must exist before this
    function is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the ``label`` column. It is
        copied internally and left unmodified.
    categorical_features : list of str
        Columns to label-encode.
    label : str
        Name of the target column; it is encoded and removed from the features.
    training : bool, default True
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    X : pandas.DataFrame
        Copy of ``data`` with encoded categorical columns and without ``label``.
    y : pandas.Series
        The encoded ``label`` column.
    encoder_dict : dict
        Maps each encoded column name (including ``label``) to its LabelEncoder.
    label : str
        The ``label`` argument, returned unchanged.
    """
    # Work on a copy: overwriting columns and popping the label on the
    # caller's frame silently corrupts it for any later cell that reuses it
    # (and triggers SettingWithCopyWarning on frames that are slices).
    data = data.copy()

    encoder_dict = {}
    for cat in categorical_features + [label]:
        encoder = LabelEncoder()
        # NOTE(review): allow_pickle=True unpickles object arrays; only load
        # encodings files produced by this project.
        encoder.classes_ = np.load(f'../encodings/{cat}.npy', allow_pickle=True)
        data[cat] = encoder.transform(data[cat])
        encoder_dict[cat] = encoder

    y = data.pop(label)
    X = data

    return X, y, encoder_dict, label
    

# Categorical columns that are label-encoded by fit_encoders / process_data.
cat_features = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country",
]

def log_slice_performance(data, cats, model):
    """Write the model's accuracy on per-category slices of ``data`` to a log.

    For every column in ``cats`` the two most frequent values are taken; the
    rows holding each value are passed through ``process_data`` and
    ``model.predict`` and the accuracy is written to
    ``../performance_logs/slice_acc.txt``. The file is overwritten on every
    call and its directory is created if missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns in ``cats``, the module-level
        ``cat_features`` and the ``salary`` label. It is not modified.
    cats : list of str
        Columns to slice on.
    model : object
        Fitted estimator exposing ``predict``.
    """
    os.makedirs('../performance_logs', exist_ok=True)
    with open('../performance_logs/slice_acc.txt', 'w') as outfile:
        for cat in cats:
            outfile.write(f'{cat}\n')
            outfile.write('-'*20 + '\n')
            # value_counts() is indexed by the category values (most frequent
            # first); iterating the Series itself would yield the counts.
            top_values = data[cat].value_counts().index[:2]
            for val in top_values:
                subset = data[data[cat] == val]
                if len(subset) > 1:
                    # Hand over a copy so encoding / popping the label cannot
                    # touch the caller's frame. All of cat_features must be
                    # encoded for the model, not just the sliced column.
                    X, y, *_ = process_data(
                        subset.copy(), cat_features, label='salary', training=False
                    )
                    acc = accuracy_score(y, model.predict(X))
                    outfile.write(f'-> {val}: {100*acc:.2f} %\n')

In [16]:
# Load the cleaned census data.
# NOTE(review): the frame outputs in this notebook list 'Unnamed: 0.1' and
# 'Unnamed: 0' columns, which look like saved row indices and are currently
# fed to the model as features — confirm whether they should be dropped.
data = pd.read_csv('../data/census_clean.csv')

# Encoders are fitted on the full frame before splitting, presumably so that
# every category present in either split has a known class.
fit_encoders(data, cat_features, 'salary')

# Fixed seed so the split and the forest are reproducible across runs.
SEED = 42
train, test = train_test_split(data, test_size=0.33, random_state=SEED)

# Pass copies to process_data: it overwrites columns and removes the label,
# and `train` / `test` should stay usable as raw frames afterwards.
X_train, y_train, encoder, lb = process_data(
    train.copy(), categorical_features=cat_features, label="salary", training=True
)

rf = RandomForestClassifier(random_state=SEED)
rf.fit(X_train, y_train)
# joblib.dump does not create missing directories.
os.makedirs('../model', exist_ok=True)
joblib.dump(rf, '../model/random_forest.joblib')

X_test, y_test, encoder, lb = process_data(test.copy(), categorical_features=cat_features,
    label='salary', training=False)

preds = rf.predict(X_test)
print(f'ACCURACY: {100*accuracy_score(y_test, preds):.2f} %')

#log_slice_performance(test, cat_features, rf)


ACCURACY: 85.80 %


In [17]:
# Display the `test` split produced by train_test_split.
# NOTE(review): the rendered output has integer-coded categorical columns and
# no `salary` column, which suggests it shows a frame that had already been
# through process_data — confirm against a fresh Restart & Run All.
test

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
13037,13037,50,4,283281,5,4,2,7,0,2,1,0,0,40,39
12574,12574,56,4,219762,11,9,2,12,5,4,0,0,0,35,39
27708,27708,48,4,278039,5,4,2,7,0,4,1,0,0,40,39
3702,3702,26,4,290286,11,9,4,3,1,2,1,0,0,40,39
15560,15560,53,4,268545,11,9,2,3,0,2,1,0,0,40,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30781,30781,66,0,186061,15,10,6,0,4,2,0,0,4356,40,39
21099,21099,22,1,146538,11,9,4,1,3,4,1,0,0,40,39
14365,14365,19,4,188008,15,10,4,12,3,2,0,0,0,20,39
488,488,45,2,164427,9,13,0,10,4,4,0,0,0,40,39


In [13]:
# Re-read the census CSV, rebinding the module-level `data`.
data = pd.read_csv('../data/census_clean.csv')


In [15]:
# Display the current `test` frame.
test

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
5151,5151,51,6,218311,15,10,0,12,4,4,0,0,0,50,39
9362,9362,27,2,273929,11,9,4,5,3,4,1,0,0,40,39
30061,30061,33,0,393376,1,7,4,0,1,4,0,0,0,40,39
16366,16366,42,4,190767,8,11,0,13,4,4,0,0,0,40,39
18738,18738,58,6,140729,9,13,2,5,0,4,1,0,0,35,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17729,17729,38,4,137707,8,11,2,13,5,4,0,0,0,40,39
24972,24972,41,5,32016,11,9,2,12,0,4,1,0,0,62,39
448,448,20,4,146538,11,9,2,7,0,4,1,0,0,40,39
25628,25628,53,5,188067,15,10,6,12,4,4,0,0,0,40,39


In [10]:
def log_slice_performance(data, cats, model):
    """Debugging variant of the slice logger that returns on the first slice.

    Redefines the ``log_slice_performance`` declared earlier in the notebook.
    The log file ``../performance_logs/slice_acc.txt`` is still opened for
    writing (and therefore truncated) and the header lines are written for
    each column reached, but the function returns ``(data, slice, cat, val)``
    as soon as the first value is iterated. If no value is ever iterated it
    falls off the end and returns ``None``.
    """
    with open('../performance_logs/slice_acc.txt', 'w') as outfile:
        for cat in cats:
            outfile.writelines(f'{cat}\n')
            outfile.writelines('-'*20 + '\n')
            # NOTE(review): iterating a value_counts() Series yields the
            # counts, not the category values (those are in its index).
            class_values = data[cat].value_counts()[:2]
            for val in class_values:
                # `slice` shadows the builtin of the same name inside this function.
                slice = data[data[cat]==val]
                # Early exit for inspection: hands the intermediate state back to the caller.
                return data, slice, cat, val
                # Unreachable: nothing below the return above is ever executed.
                if len(slice)>1:
                    print(slice)
                    X, y, *_ = process_data(slice, cat_features, label='salary', training=False)
                    acc = accuracy_score(model.predict(X), y)
                    outfile.writelines(f'-> {val}: {100*acc:.} %\n')

In [11]:
# Capture the intermediate state returned by the debugging log_slice_performance.
# Side effects on the notebook namespace: `data` is rebound to the frame passed
# in (`test`), and the builtin `slice` is shadowed by a DataFrame.
data, slice, cat, val = log_slice_performance(test, cat_features, rf)

In [12]:
# Display `data`; when run after the debugging call cell, this is the frame
# that was passed to log_slice_performance (`test`), not the full CSV.
data

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
5151,5151,51,6,218311,15,10,0,12,4,4,0,0,0,50,39
9362,9362,27,2,273929,11,9,4,5,3,4,1,0,0,40,39
30061,30061,33,0,393376,1,7,4,0,1,4,0,0,0,40,39
16366,16366,42,4,190767,8,11,0,13,4,4,0,0,0,40,39
18738,18738,58,6,140729,9,13,2,5,0,4,1,0,0,35,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17729,17729,38,4,137707,8,11,2,13,5,4,0,0,0,40,39
24972,24972,41,5,32016,11,9,2,12,0,4,1,0,0,62,39
448,448,20,4,146538,11,9,2,7,0,4,1,0,0,40,39
25628,25628,53,5,188067,15,10,6,12,4,4,0,0,0,40,39
