# Encoding

In [3]:
import os
import re
import sys
import warnings

import copy
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

warnings.filterwarnings('ignore')

# define functions

In [4]:
def ordinal_encode(df, df_test, testing = False):
    """Ordinal-encode the object-dtype columns of a train/test pair.

    The category list of every column is collected from the row-wise
    concatenation of both frames, so a value that appears in only one of
    them still receives a code. The encoder is fitted on ``df`` with those
    explicit category lists and then applied to both frames.

    Args:
        df: Training DataFrame. Its object-dtype columns are overwritten
            in place with their ordinal codes.
        df_test: Test DataFrame. The same columns are overwritten in place.
        testing: When True, print the number of categories per column, the
            list of encoded columns and the full category mapping.

    Returns:
        The tuple ``(df, df_test)``; these are the same objects that were
        passed in, after the in-place encoding.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent row-wise stacking.
    temp_merge = pd.concat([df, df_test])

    categories_dict = {}
    for cat in temp_merge.columns:
        if temp_merge[cat].dtypes == 'object':
            categories_dict[cat] = list(temp_merge[cat].unique())
            if testing:
                print("Numero de categorias para variavel '{}': {} ".format(cat, len(categories_dict[cat])))

    cat_cols = list(categories_dict.keys())

    if testing:
        print()
        print(cat_cols)

    # With no categorical column there is nothing to fit or transform;
    # skipping avoids handing the encoder a zero-column selection.
    if cat_cols:
        enc = OrdinalEncoder(categories=list(categories_dict.values()))
        trained_encoder = enc.fit(df[cat_cols])

        # transform train and test with the encoder fitted on train
        df[cat_cols] = trained_encoder.transform(df[cat_cols])
        df_test[cat_cols] = trained_encoder.transform(df_test[cat_cols])

    if testing:
        print(categories_dict)

    return df, df_test

def one_hot_encode(df):
    """One-hot encode ``df`` and tag the generated columns.

    Columns are expanded with ``pd.get_dummies`` (first level of each
    variable dropped). Every column that did not exist before the expansion
    is renamed to ``dummy_<name>``, with hyphens replaced by underscores.
    The column count is printed before and after the encoding.

    Args:
        df: DataFrame to encode.

    Returns:
        The encoded DataFrame with the renamed dummy columns.
    """
    original_names = set(df.columns.to_list())
    print('Quantity of columns before one-hot encoding:', len(df.columns))

    encoded = pd.get_dummies(df, prefix_sep='_', drop_first=True)
    print('Quantity of columns after one-hot encoding:', len(encoded.columns))

    # Only columns created by get_dummies get the 'dummy_' marker.
    dummy_names = {}
    for name in encoded.columns:
        if name not in original_names:
            dummy_names[name] = 'dummy_' + name.replace('-', '_')

    return encoded.rename(columns=dummy_names)

# Define paths and capture data

In [5]:
# Data folders, relative to the notebook location.
data_root = os.path.join('..', 'data')
inputs = os.path.join(data_root, '02_intermediate')
outputs = os.path.join(data_root, '03_processed')
reports = os.path.join(data_root, '06_reporting')


def _read_indexed(file_name):
    """Read a CSV from the intermediate folder, using 'id' as the index."""
    return pd.read_csv(os.path.join(inputs, file_name), index_col='id')


# Feature matrices destined for ordinal encoding.
ord_dict = {
    'X_train': _read_indexed('X_train.csv'),
    'X_test': _read_indexed('X_test.csv'),
}

# Independent copy of the same features, destined for one-hot encoding.
onehot_dict = copy.deepcopy(ord_dict)

# Targets are loaded as-is.
y_train = _read_indexed('y_train.csv')
y_test = _read_indexed('y_test.csv')

# Count categorical data unique values
Check both train and test. Any inconsistency between them should be addressed.

In [6]:
# Report, for each split, how many distinct values every object-dtype column has.
for data in ('X_train', 'X_test'):
    categories_dict = {}
    frame = ord_dict[data]
    print('\r\nchecking number of categories for {}'. format(data))
    for cat in frame.columns:
        # Only object-dtype columns are treated as categorical.
        if frame[cat].dtypes != 'object':
            continue
        levels = frame[cat].unique()
        categories_dict[cat] = list(levels)
        print("Numero de categorias para variavel '{}': {} ".format(cat, levels.size))


checking number of categories for X_train

checking number of categories for X_test


# Ordinal Encoding

In [7]:
# Encode both splits together, then store the results back in ord_dict.
encoded_train, encoded_test = ordinal_encode(
    ord_dict['X_train'],
    ord_dict['X_test'],
    testing=False,
)
ord_dict['X_train'] = encoded_train
ord_dict['X_test'] = encoded_test

# Preview the encoded training features.
ord_dict['X_train'].head()

Unnamed: 0_level_0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,if_anomaly
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,1
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,1
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,1
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,1
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,1


# One-Hot Encoding

In [8]:
# One-hot encode each split independently, replacing the entry in onehot_dict.
for df in ('X_train', 'X_test'):
    encoded_split = one_hot_encode(onehot_dict[df])
    onehot_dict[df] = encoded_split

print('\r\nColumns of the new database:')
# Column listing intentionally left disabled:
# print(onehot_dict[df].columns.to_list())

Quantity of columns before one-hot encoding: 14
Quantity of columns after one-hot encoding: 14
Quantity of columns before one-hot encoding: 14
Quantity of columns after one-hot encoding: 14

Columns of the new database:


# report new data types

### data alignment
If a category is missing from the test set (or from the train set), its dummy column will not exist there, so we need to add the corresponding column filled with zeros.

In [9]:
def fill_missing_cols(smaller, greater):
    """Add to ``smaller`` every column of ``greater`` it lacks, filled with 0.

    Missing columns are appended in the order in which they appear in
    ``greater``, so the resulting layout is the same on every run (iterating
    a set difference would make the order depend on string hashing).

    Args:
        smaller: DataFrame to complete. It is modified in place.
        greater: DataFrame whose columns must all be present in ``smaller``.

    Returns:
        ``smaller`` itself, after the missing columns have been added.
    """
    present = set(smaller.columns)
    for col in greater.columns:
        if col not in present:
            smaller[col] = 0

    return smaller

In [10]:
# Make sure each split carries every dummy column found in the other one.
train_onehot = fill_missing_cols(onehot_dict['X_train'], onehot_dict['X_test'])
test_onehot = fill_missing_cols(onehot_dict['X_test'], train_onehot)

# Put the columns of both splits in the same positions.
# (No data leakage here: only the column ordering is altered.)
aligned_train, aligned_test = train_onehot.align(test_onehot, axis=1)
onehot_dict['X_train'] = aligned_train
onehot_dict['X_test'] = aligned_test

# Save processed data

In [11]:
# to_csv does not create missing folders, so make sure the target exists
# before writing anything (no-op when it is already there).
os.makedirs(outputs, exist_ok=True)

# Write both encodings of each feature split: '<split>.csv' holds the
# ordinal-encoded frame and '<split>_onehot.csv' the one-hot-encoded one.
for df in ['X_train', 'X_test']:
    ord_dict[df].to_csv(os.path.join(outputs, df+'.csv'))
    onehot_dict[df].to_csv(os.path.join(outputs, df+'_onehot.csv'))

# Show the shapes that were written, as a quick sanity check.
for df in ['X_train', 'X_test']:
    print(ord_dict[df].shape)
    print(onehot_dict[df].shape)

# Targets are saved unchanged next to the processed features.
y_train.to_csv(os.path.join(outputs, 'y_train.csv'))
y_test.to_csv(os.path.join(outputs, 'y_test.csv'))

(354, 14)
(354, 14)
(152, 14)
(152, 14)


# Save a report of the data types