In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder

from make_classification import evaluate
%load_ext autoreload
%autoreload 2

# Improving metrics by cleaning data

### Loading data

In [2]:
train_data = pd.read_csv('data/adult_train_data.csv')
test_features = pd.read_csv('data/adult_test_features.csv')
test_features.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States


In [3]:
train_data['is_money_gainer'].value_counts()

0    24720
1     7841
Name: is_money_gainer, dtype: int64

In [4]:
test_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             16281 non-null  int64 
 1   workclass       15318 non-null  object
 2   fnlwgt          16281 non-null  int64 
 3   education       16281 non-null  object
 4   education_num   16281 non-null  int64 
 5   marital_status  16281 non-null  object
 6   occupation      15315 non-null  object
 7   relationship    16281 non-null  object
 8   race            16281 non-null  object
 9   gender          16281 non-null  object
 10  capital_gain    16281 non-null  int64 
 11  capital_loss    16281 non-null  int64 
 12  hours_per_week  16281 non-null  int64 
 13  native_country  16007 non-null  object
dtypes: int64(6), object(8)
memory usage: 1.7+ MB


### Cleaning begins here

### Remove the education column

'Cause it is correlated with education_num 1:1

In [5]:
train_data.drop('education', axis=1, inplace=True)
test_features.drop('education', axis=1, inplace=True)

In [6]:
test_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             16281 non-null  int64 
 1   workclass       15318 non-null  object
 2   fnlwgt          16281 non-null  int64 
 3   education_num   16281 non-null  int64 
 4   marital_status  16281 non-null  object
 5   occupation      15315 non-null  object
 6   relationship    16281 non-null  object
 7   race            16281 non-null  object
 8   gender          16281 non-null  object
 9   capital_gain    16281 non-null  int64 
 10  capital_loss    16281 non-null  int64 
 11  hours_per_week  16281 non-null  int64 
 12  native_country  16007 non-null  object
dtypes: int64(6), object(7)
memory usage: 1.6+ MB


### Apply one hot encoding

In [7]:
features_to_ohe = ["workclass", "occupation", "marital_status", "relationship", "race", "gender", "native_country"]

ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(train_data[features_to_ohe])


cat_train = pd.DataFrame(ohe.transform(train_data[features_to_ohe]).toarray(), index=train_data.index)
cat_test = pd.DataFrame(ohe.transform(test_features[features_to_ohe]).toarray(), index=test_features.index)

cat_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,85
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16278,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16279,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Apply scaling

In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1   workclass        30725 non-null  object
 2   fnlwgt           32561 non-null  int64 
 3   education_num    32561 non-null  int64 
 4   marital_status   32561 non-null  object
 5   occupation       30718 non-null  object
 6   relationship     32561 non-null  object
 7   race             32561 non-null  object
 8   gender           32561 non-null  object
 9   capital_gain     32561 non-null  int64 
 10  capital_loss     32561 non-null  int64 
 11  hours_per_week   32561 non-null  int64 
 12  native_country   31978 non-null  object
 13  is_money_gainer  32561 non-null  int64 
dtypes: int64(7), object(7)
memory usage: 3.5+ MB


In [9]:
columns_to_scale = ['age', 'education_num', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']

scaler = MinMaxScaler()
scaler.fit(train_data[columns_to_scale])

scaled_features_train = pd.DataFrame(
    scaler.transform(train_data[columns_to_scale]),
    columns=columns_to_scale,
    index=train_data.index
)

scaled_features_test = pd.DataFrame(
    scaler.transform(test_features[columns_to_scale]),
    columns=columns_to_scale,
    index=test_features.index
)

### Putting it all together

In [10]:
train_X_final = pd.concat([scaled_features_train, cat_train], axis=1)
test_X_final = pd.concat([scaled_features_test, cat_test], axis=1)

### Evaluate

In [22]:
train_X = train_X_final
train_y = train_data.loc[train_X.index, 'is_money_gainer']
test_X = test_X_final
train_y.value_counts()

0    24720
1     7841
Name: is_money_gainer, dtype: int64

In [23]:
train_acc, test_acc, clf = evaluate(train_X, train_y, test_X)

In [24]:
print(f"Train accuracy: {round(100 * train_acc, 3)}%")
print(f"Test accuracy: {round(100 * test_acc, 3)}%")

Train accuracy: 85.738%
Test accuracy: 86.242%
