# Feature engineering on raw data

This notebook is the first feature-engineering step. Thanks to the [previous analysis](../03_analyse/01_analyse_raw_data.html), I now have some insight into how the data should be prepared.

## Load packages

In [1]:
import pandas as pd
import numpy as np
from pickle import dump, load

In [3]:
PROJECT_PATH = '..'

## Load data

In [132]:
# Column metadata file: one row per (dataset, column) with its dtype and an
# optional replacement name in `new_name`.
df_details = pd.read_csv(f'{PROJECT_PATH}/01_collect/columns_informations.csv')

dataset_detail = df_details[df_details['dataset'] == 'adult']

dtypes, columns = dataset_detail['dtype'], dataset_detail['column']
dtypes = pd.Series(dtypes.values,index=columns)

# Columns declared as datetime are handed to `parse_dates` rather than `dtype`.
# `Series.items()` is used because `Series.iteritems()` was removed in pandas 2.0.
parse_dates = [var for var, dtype in dtypes.items() if 'datetime' in dtype]

# The pattern is a regular expression; `regex=True` must be explicit because
# pandas 2.0 changed the default of `Series.str.replace` to literal matching.
dtypes = dtypes.str.replace(r'datetime.*', 'str', regex=True)

adult = pd.read_csv(f'{PROJECT_PATH}/_data/adult.csv',
                           dtype=dtypes.to_dict(),
                           parse_dates=parse_dates
                          )

# Use `new_name` where the metadata provides one, otherwise keep the original name.
# NOTE(review): the positional assignment assumes the metadata rows are in the
# same order as the CSV columns - confirm.
columns = np.where(dataset_detail['new_name'].isna(), columns, dataset_detail['new_name'])
adult.columns = columns

In [133]:
# Keep an untouched copy of the loaded frame; the pipeline section below starts again from it.
adult_orig = adult.copy()

In [134]:
# Separate the label column from the features so the steps below only see features.
target = 'income'
y = adult[target]
adult = adult.drop(columns=target)

In [135]:
# Preview the first rows of the feature frame (target already removed).
adult.head()

Unnamed: 0,age,workclass,final_weight,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States


## Handling missing values

Even though we don't have any missing values, I prefer to be cautious and handle the cases where the data may contain some.

In [136]:
from time import time

def get_missing_values(df):
    """
    Compute one default fill value per column of `df`.

    Object, category, datetime, timedelta and tz-aware datetime columns get
    their most frequent value; numeric columns get their median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame from which the fill values are learnt.

    Returns
    -------
    dict
        Maps column name to fill value. A column with no observed value
        (empty or entirely missing) maps to NaN, which is also what the
        median returns for an all-missing numeric column.
    """
    missing_val = {}

    vars_for_most_freq = df.select_dtypes(['object','category','datetime','timedelta','datetimetz']).columns
    for var in vars_for_most_freq:
        counts = df[var].value_counts()
        # `value_counts` ignores missing values, so it is empty when nothing
        # was observed; indexing it would raise IndexError.
        missing_val[var] = counts.index[0] if len(counts) > 0 else float('nan')

    num_vars = df.select_dtypes('number').columns
    for var in num_vars:
        missing_val[var] = df[var].median()

    return missing_val

# Learn the fill values on the feature frame and persist them as JSON;
# the pipeline section reads 'missing_values.json' back.
missing_val = get_missing_values(df=adult)
pd.Series(missing_val).to_json('missing_values.json')

In [137]:
def handle_missing_values(df, missing_values):
    """
    Fill the missing values of `df` with per-column defaults.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is not modified; a filled copy is returned.
    missing_values : dict or pandas.Series
        Maps column name to the value used in place of missing entries.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` in which every column containing missing values has been
        filled with its default. Columns whose default is itself missing
        (None/NaN) are left untouched, since there is nothing to fill with.

    Raises
    ------
    KeyError
        If a column containing missing values has no entry in `missing_values`.
    """
    # Copy first so the caller's frame is never written to.
    df = df.copy()

    missing_val_cols = df.columns[df.isna().any()]
    for col in missing_val_cols:
        fill_value = missing_values[col]
        if pd.isna(fill_value):
            continue
        # `fillna` returns a new Series; it has to be assigned back to take effect.
        df[col] = df[col].fillna(fill_value)

    return df

## Remove useless columns

From the previous analysis, I found that only the `final_weight` column is useless.

In [138]:
# Persist the list of dropped columns under the header 'columns';
# the pipeline section reads 'cols_to_drop.csv' back.
cols_to_drop = ['final_weight']
pd.DataFrame(cols_to_drop, columns=['columns']).to_csv('cols_to_drop.csv',index=False)

In [139]:
def drop_columns(df, columns=None):
    """
    Drop the given columns from `df`, ignoring names that are not present.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is never modified.
    columns : iterable of column labels, optional
        Columns to remove. `None` (the default) or an empty iterable removes nothing.

    Returns
    -------
    pandas.DataFrame
        `df` itself when there is nothing to drop, otherwise a new frame
        without the requested columns.
    """
    if columns is None:
        return df

    # Keep only the names that exist, de-duplicated in their original order,
    # so everything is dropped in one call instead of one copy per column.
    to_drop = list(dict.fromkeys(col for col in columns if col in df.columns))
    if not to_drop:
        return df
    return df.drop(columns=to_drop)

# Drop exactly the columns persisted in 'cols_to_drop.csv', so the list that is
# saved and the list that is applied here cannot diverge.
adult = drop_columns(df=adult, columns=cols_to_drop)

## MinMaxScaler for numerical variables

In [140]:
from sklearn.preprocessing import MinMaxScaler

def generate_minmaxscaler(df):
    """
    Fit a `MinMaxScaler` on the numeric columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'number' dtype columns are used for fitting.

    Returns
    -------
    MinMaxScaler
        The fitted scaler.
    """
    numeric_part = df.select_dtypes('number')
    # `fit` returns the estimator itself, so it can be returned directly.
    return MinMaxScaler().fit(numeric_part)
    
scaler = generate_minmaxscaler(adult)
# Persist the fitted scaler; the `with` block closes the file even if pickling
# fails, so the handle no longer leaks.
with open('scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

## Create dummies values for categorical variables

In [141]:
def create_dummies_values(df, max_categories=100):
    """
    Replace the categorical columns of `df` by one indicator column per value.

    Every 'object' or 'category' column is removed from the result. Those with
    at most `max_categories` distinct values are replaced by dummy columns
    named '<column>=<value>'. Those with more distinct values are reported on
    stdout and left out of the result entirely.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    max_categories : int, default 100
        Largest number of distinct values a categorical column may have and
        still be encoded.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns of `df` followed by the dummy columns,
        grouped by source column in the order those columns appear in `df`.
    """
    cat_vars = list(df.select_dtypes(['object','category']).columns)

    ignore_cat_vars = [var for var in cat_vars if df[var].nunique() > max_categories]
    if len(ignore_cat_vars) > 0:
        print(f'Ignored categorical variables because there are more than {max_categories} values :', ', '.join(ignore_cat_vars))
        print('If there are really usefull, create a specific function for them.')

    encoded_vars = [var for var in cat_vars if var not in ignore_cat_vars]

    # Build every piece first and concatenate once, instead of growing the
    # frame with one `concat` (and one full copy) per categorical column.
    pieces = [df.drop(columns=cat_vars)]
    for var in encoded_vars:
        pieces.append(pd.get_dummies(df[var], prefix=var, prefix_sep='='))

    return pd.concat(pieces, axis=1)
        
# Encode the categorical columns of the feature frame as dummy columns.
adult = create_dummies_values(df=adult)

In [142]:
# Persist the final column list; the pipeline section reads it back from the column labelled '0'.
pd.Series(adult.columns).to_frame().to_csv('train_columns.csv', index=False)

## Pipeline & save meta data

In [149]:
data = adult_orig.copy()

# Reload the artefacts saved by the previous sections, so the pipeline below
# depends only on what was written to disk.
missing_values = pd.read_json('missing_values.json', orient='index', typ='series')
# NOTE: unpickling executes arbitrary code; only load a 'scaler.pkl' produced by this notebook.
# The `with` block closes the file handle, which the bare `open(...)` call leaked.
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = load(scaler_file)
cols_to_drop = pd.read_csv('cols_to_drop.csv')['columns'].values
train_cols = pd.read_csv('train_columns.csv')['0'].values

def feature_engineering_pipeline(df):
    """
    Apply every feature-engineering step to a raw feature frame.

    Steps, in order: fill missing values, drop the unwanted columns, scale the
    numeric columns with the fitted scaler, encode the categorical columns as
    dummies, then align the columns on the training layout.

    Relies on the module-level `missing_values`, `cols_to_drop`, `scaler` and
    `train_cols` loaded from disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features (without the target column). It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame whose columns are exactly `train_cols`, in that order. Training
        columns absent from `df` after encoding are filled with 0, and columns
        unknown at training time are discarded.
    """
    # Work on a copy so the scaling assignment below never writes into the caller's frame.
    df = df.copy()

    df = handle_missing_values(df=df, missing_values=missing_values)
    df = drop_columns(df=df, columns=cols_to_drop)

    num_df = df.select_dtypes('number')
    df[num_df.columns] = scaler.transform(num_df)

    df = create_dummies_values(df=df)
    # `reindex` returns a new frame, so its result must be kept. It both adds the
    # missing training columns (filled with 0) and selects/orders the columns;
    # a plain `df[train_cols]` would raise KeyError on any missing dummy column.
    df = df.reindex(columns=train_cols, fill_value=0)

    return df

In [160]:
# Start again from the untouched copy of the loaded data and split off the label.
target = 'income'
data = adult_orig.copy()
y = data[target]
data = data.drop(columns=target)

# Run the whole pipeline on the features, then re-attach the label as the last column.
format_data = feature_engineering_pipeline(df=data)
format_data[target] = y

In [161]:
# Preview the prepared frame: scaled numeric columns, dummy columns, then the target.
format_data.head()

Unnamed: 0,age,educational-num,capital-gain,capital-loss,hours-per-week,workclass=?,workclass=Federal-gov,workclass=Local-gov,workclass=Never-worked,workclass=Private,...,native-country=Puerto-Rico,native-country=Scotland,native-country=South,native-country=Taiwan,native-country=Thailand,native-country=Trinadad&Tobago,native-country=United-States,native-country=Vietnam,native-country=Yugoslavia,income
0,0.109589,0.4,0.0,0.0,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,<=50K
1,0.287671,0.533333,0.0,0.0,0.5,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,<=50K
2,0.150685,0.733333,0.0,0.0,0.397959,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,>50K
3,0.369863,0.6,0.076881,0.0,0.397959,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,>50K
4,0.013699,0.6,0.0,0.0,0.295918,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,<=50K


In [169]:
# (rows, columns) of the prepared frame, target included.
format_data.shape

(48842, 108)

In [168]:
# Write the prepared dataset next to the raw one, without the index column.
format_data.to_csv(f'{PROJECT_PATH}/_data/adult_prep.csv', index=False)

In [167]:
# Print one "'<column>':'category'," line per column of the prepared frame.
# NOTE(review): presumably meant to be pasted into a dtype mapping - confirm;
# every column is labelled 'category', including the scaled numeric ones and the target.
for col in format_data.columns.values:
    print(f"'{col}':'category',")

'age':'category',
'educational-num':'category',
'capital-gain':'category',
'capital-loss':'category',
'hours-per-week':'category',
'workclass=?':'category',
'workclass=Federal-gov':'category',
'workclass=Local-gov':'category',
'workclass=Never-worked':'category',
'workclass=Private':'category',
'workclass=Self-emp-inc':'category',
'workclass=Self-emp-not-inc':'category',
'workclass=State-gov':'category',
'workclass=Without-pay':'category',
'education=10th':'category',
'education=11th':'category',
'education=12th':'category',
'education=1st-4th':'category',
'education=5th-6th':'category',
'education=7th-8th':'category',
'education=9th':'category',
'education=Assoc-acdm':'category',
'education=Assoc-voc':'category',
'education=Bachelors':'category',
'education=Doctorate':'category',
'education=HS-grad':'category',
'education=Masters':'category',
'education=Preschool':'category',
'education=Prof-school':'category',
'education=Some-college':'category',
'marital-status=Divorced':'category',

In [154]:
# num_vars = ['age','educational-num','capital-gain','capital-loss','hours-per-week']
# pd.DataFrame(scaler.inverse_transform(format_data[num_vars]))

## End