# 03 - Processing

## Setup

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml
import warnings

# Show every column and use a wide console width so wide frames are not
# elided when displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
# Left disabled on purpose: uncomment to also lift the row display limit.
# pd.set_option('display.max_rows', None)

# Global seaborn theme for any plot drawn later in the notebook.
sns.set_style("darkgrid")

from IPython.display import display, Markdown
from pprint import pprint

# NOTE(review): DEBUG and SEED are not referenced by any cell visible here;
# presumably kept for parity with sibling notebooks - confirm before removing.
DEBUG = True
SEED = 666

In [10]:
# File names of the training and scoring frames, stored under ROOT + "data/".
DATASET = "df_orig.pkl"
SCORE_DATASET = "df_score.pkl"

import os, sys
# Colab is detected by `google.colab` already being present in sys.modules.
COLAB = 'google.colab' in sys.modules
# Base directory for every path in this notebook. It always ends with "/"
# because the other cells build paths by plain string concatenation.
ROOT = "./"

if COLAB:
  from google.colab import drive
  # Mount Google Drive only if it is not mounted yet.
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  # os.makedirs creates the intermediate "datasets" directory as well, so no
  # separate step is needed for it. (The previous code built that path as
  # "./" + "/content/gdrive/...", which created a stray directory tree under
  # the current working directory instead of on the mounted drive.)
  os.makedirs(ROOT, exist_ok=True)


def makedirs(d):
  """Create the directory ``ROOT + d`` (and any missing parents).

  Args:
    d: Directory path relative to ROOT, e.g. ``"data"``.

  The call is a no-op when the directory already exists. The Colab and
  local cases need no distinction: ``mode=0o777`` is already the default of
  ``os.makedirs``, and ``exist_ok=True`` replaces the separate
  ``os.path.isdir`` pre-check, which left a window between the check and
  the creation.
  """
  os.makedirs(ROOT+d, exist_ok=True)

# Make sure the working sub-directories exist under ROOT before any I/O.
for d in ('orig', 'data', 'output'):
  makedirs(d)

## Import Data & Features

In [11]:
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files that were produced by a trusted earlier step.
df_train, df_score = (
    pd.read_pickle(ROOT+"data/"+name) for name in (DATASET, SCORE_DATASET)
)

# Both frames are kept in one list so later cells can transform them together.
dfs = [df_train, df_score]

print(df_train.shape)
df_train.head()

(209672, 165)


Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,x011,x012,x013,x014,x015,x016,x017,x018,x019,x020,x021,x022,x023,x024,x025,x026,x027,x028,x029,x030,x031,x032,x033,x034,x035,x036,x037,x038,x039,x040,x041,x042,x043,x044,x045,x046,x047,x048,x049,x050,x051,x052,x053,x054,x055,x056,x057,x058,x059,x060,x061,x062,x063,x064,x065,x066,x067,x068,x069,x070,x071,x072,x073,x074,x075,x076,x077,x078,x079,x080,x081,x082,x083,x084,x085,x086,x087,x088,x089,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099,x100,x101,x102,x103,x104,x105,x106,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120,x121,x122,x123,x124,x125,x126,x127,x128,x129,x130,x131,x132,x133,x134,x135,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145,x146,x147,x148,x149,x150,x151,x152,x153,x154,x155,x156,x157,x158,x159,x160,x161,x162,x163,Target
0,0.577321,0,-1.528407,0.574591,-0.371609,0,0.769142,1,0,-0.564334,-1.670166,1,0.30643,0.430979,0.331989,0.560323,48,1.064115,0,2.123352,0.331989,3,0.822184,-1.670166,-1.636594,0.342279,-1.105716,0.925453,0.790756,-3.02696,33,-1.315302,0.696806,-0.371609,9,14,-1.105716,-0.986619,0.94646,0.925453,4,0.296706,0,0.553993,3,2.884178,5,0,2.123352,1.721216,-1.509301,-0.050747,-2.072471,1,0.812313,4.148009,0.948791,1.0,0.37167,1.064115,0.646152,0.907293,4,0,-1.038661,0.368816,-2.072471,2.422425,1,0.37167,0,234,0.0,19,0,5,-0.036157,0.815055,0.184478,3,0.548773,0.674836,0,0.315166,-0.842786,-2.302796,-0.099039,0.68244,3,1.32449,2.017993,1.32449,1,0.553993,-0.254312,-4.850562,0.07747,4,-0.242058,0.616705,0.577321,1,-3.975136,0.187446,0.368816,0.961387,9,6,2.024803,1,62,0,0.790756,0.436108,-2.839741,1.752665,0.8825,0.749183,-0.289017,0.187446,2.024803,0.0,0.738336,0,42,2,1,0.696806,1.721216,23,0.57797,-2.237272,39,1,0.0,2.64866,-1.322781,1,1,0.856375,0.674836,0.646152,-3.975136,-1.685895,0.856375,2.884178,-0.842786,-0.050747,-0.099039,-4.850562,1,-1.038661,69,0.749183,0.342279,-1.509301,2.422425,3,33,0.702792,0.548773,0.701321,-0.444765,10,Class_2
1,0.413242,0,-0.072692,0.567419,0.206837,0,-0.164559,0,0,-0.956555,4.030491,1,0.279233,0.512899,0.771122,0.183065,48,1.282373,0,-2.110657,0.771122,3,0.281779,4.030491,4.146307,0.39322,4.386999,0.528352,0.255444,2.83781,33,-7.434869,0.703253,0.206837,12,10,4.386999,-4.222262,0.347965,0.528352,5,-0.98557,0,0.354723,3,2.755122,27,8,-2.110657,4.238453,-0.097172,7.098692,0.803323,1,0.47527,-5.056917,0.328198,1.0,0.423728,1.282373,0.255066,0.493516,9,0,-5.380026,0.636534,0.803323,2.666922,1,0.423728,0,14,0.0,2,0,5,3.308104,0.359323,0.555923,2,1.914999,0.275613,0,0.639203,-0.820514,-3.699545,1.796063,0.795767,3,0.325121,-1.564933,0.325121,1,0.354723,-0.752444,-3.079449,1.073057,2,-1.637676,0.547283,0.413242,0,0.04194,0.495975,0.636534,0.283092,10,12,-1.353885,0,57,0,0.255444,0.367446,-0.256603,-0.603172,0.317201,0.908436,0.347056,0.495975,-1.353885,0.0,0.250319,10,141,2,0,0.703253,4.238453,45,0.365215,2.986686,25,1,0.0,-3.908599,-0.592003,1,1,-0.575467,0.275613,0.255066,0.04194,-3.751033,-0.575467,2.755122,-0.820514,7.098692,1.796063,-3.079449,1,-5.380026,54,0.908436,0.39322,-0.097172,2.666922,42,33,0.661232,1.914999,0.934367,0.524791,5,Class_2
2,-0.992723,0,5.089105,0.344148,1.052436,0,-1.066716,0,0,-0.852081,-5.396598,0,0.349402,0.541646,1.252255,1.696027,48,2.988087,3,-3.721994,1.252255,1,0.298688,-5.396598,0.240343,0.701801,-3.333315,0.331816,0.372463,1.00268,33,-7.104154,0.607222,1.052436,10,14,-3.333315,-4.765734,0.369349,0.331816,7,-2.53848,0,0.680543,3,-0.756906,25,0,-3.721994,1.923408,-1.279555,-1.710899,0.534373,0,0.455945,-2.741101,0.462819,0.0,0.419382,2.988087,0.618782,0.471865,9,5,5.444011,0.304979,0.534373,-2.901062,1,0.419382,0,150,0.0,2,0,5,0.002195,0.736656,0.572382,1,0.277789,0.379108,0,0.468445,-0.690706,-0.935914,-0.295098,0.683867,3,2.455901,1.120227,2.455901,1,0.680543,1.870748,0.884827,0.514067,9,-1.663241,0.478182,-0.992723,0,1.42342,0.821683,0.304979,0.76215,2,6,-0.507181,1,57,0,0.372463,0.332684,-2.889505,3.301401,0.630255,0.573692,-0.919568,0.821683,-0.507181,0.0,0.466936,11,18,1,1,0.607222,1.923408,12,0.320719,0.106202,14,1,1.0,-3.103558,0.369791,1,0,-1.320967,0.379108,0.618782,1.42342,-2.608896,-1.320967,-0.756906,-0.690706,-1.710899,-0.295098,0.884827,1,5.444011,147,0.573692,0.701801,-1.279555,-2.901062,47,33,0.795673,0.277789,0.479354,0.829004,10,Class_1
3,0.37342,0,-4.808461,0.25438,-0.435051,0,1.314414,0,0,-0.450406,-6.630857,0,0.437356,0.803078,0.189292,-0.136253,48,0.887608,0,-0.374439,0.189292,1,0.51168,-6.630857,0.129764,0.946983,0.564304,0.329405,0.383862,-0.636269,33,-3.25519,0.596949,-0.435051,11,14,0.564304,-2.792848,0.221831,0.329405,3,-6.003766,0,0.122344,3,3.161273,5,9,-0.374439,4.803832,0.577315,-5.38227,-0.996981,1,0.202926,-2.469229,0.259328,0.0,0.355687,0.887608,0.634592,0.66319,9,2,1.69319,0.484986,-0.996981,-0.787452,1,0.355687,0,194,0.0,14,0,4,-3.770526,0.149882,0.936328,1,0.146524,0.435222,0,0.366162,-0.383025,-2.509186,-0.480305,0.202493,3,-0.89824,0.267738,-0.89824,1,0.122344,-1.532212,-5.342407,-0.637505,4,2.004994,0.486493,0.37342,0,-0.944536,0.261193,0.484986,0.285201,10,12,0.611588,1,62,0,0.383862,0.401932,-0.083454,2.130718,0.231694,0.299336,1.314217,0.261193,0.611588,,0.301979,8,213,2,0,0.596949,4.803832,49,0.307328,-0.497761,44,2,0.0,0.305652,0.995479,1,0,-1.362534,0.435222,0.634592,-0.944536,0.004034,-1.362534,3.161273,-0.383025,-5.38227,-0.480305,-5.342407,1,1.69319,147,0.299336,0.946983,0.577315,-0.787452,33,33,0.712934,0.146524,0.473989,0.96382,10,Class_2
4,-0.69428,0,-3.944867,0.774119,-1.376659,0,-1.013648,1,0,0.970933,-5.991751,1,0.68871,0.516681,-0.378682,-0.144624,6,1.146451,0,-0.330101,-0.378682,1,0.591218,-5.991751,-1.675724,0.509486,3.870937,0.54053,0.430815,-2.246914,33,-8.806784,0.380222,-1.376659,14,14,3.870937,-2.12218,0.604214,0.54053,4,-0.261959,0,0.736003,3,5.771128,7,0,-0.330101,-2.526599,-3.136469,-3.095688,-0.892518,0,0.522699,-0.713184,0.446525,0.0,0.31392,1.146451,0.71696,0.480881,9,0,-0.740941,0.352894,-0.892518,-0.933163,1,0.31392,8,89,1.0,14,0,4,4.079695,0.359671,0.778319,0,0.595355,0.803919,0,0.891468,2.152445,-9.905907,2.497357,0.619482,1,-1.857226,1.317192,-1.857226,1,0.736003,-1.056155,-0.98809,0.662834,2,2.974653,0.493714,-0.69428,0,-1.135468,0.951759,0.352894,0.891473,9,6,0.028371,0,57,0,0.430815,0.68606,-0.369468,-2.051631,0.678317,0.60645,2.960185,0.951759,0.028371,0.0,0.830762,0,22,2,1,0.380222,-2.526599,12,0.731859,1.19854,11,1,0.0,-3.121829,-1.034903,1,1,-0.346098,0.803919,0.71696,-1.135468,-1.176002,-0.346098,5.771128,2.152445,-3.095688,2.497357,-0.98809,1,-0.740941,163,0.60645,0.509486,-3.136469,-0.933163,57,2,0.722723,0.595355,0.468482,0.998291,5,Class_2


In [12]:
# Read the feature manifest: the target column name and the feature list.
# NOTE(review): yaml.FullLoader can construct more than plain data types; if
# features.yaml only holds strings and lists, yaml.safe_load would be a
# stricter choice - confirm the file's contents before switching.
features_path = ROOT+"data/features.yaml"
with open(features_path) as fh:
    yml_obj = yaml.load(fh, Loader=yaml.FullLoader)

target, features = yml_obj["target"], yml_obj["features"]

## Processing

### Null Values

In [13]:
# Null count of every feature in each frame: {feature: (train, score)}.
null_counts = {
    name: (df_train[name].isnull().sum(), df_score[name].isnull().sum())
    for name in features
}

# Alphabetically sorted features that have at least one null in either frame.
features_with_null_vals = sorted(
    name
    for name, (n_train, n_score) in null_counts.items()
    if n_train > 0 or n_score > 0
)

if features_with_null_vals:
    # One report line per affected feature: dtype plus per-frame and total counts.
    for f in features_with_null_vals:
        print(f"{f} - ({df_train[f].dtype}) \t null values: (df: {df_train[f].isnull().sum()}\t df_score: {df_score[f].isnull().sum()}   \t total: {df_train[f].isnull().sum() + df_score[f].isnull().sum()})")
else:
    print("No features with null values")

x000 - (float64) 	 null values: (df: 640	 df_score: 294   	 total: 934)
x004 - (float64) 	 null values: (df: 970	 df_score: 384   	 total: 1354)
x010 - (float64) 	 null values: (df: 835	 df_score: 370   	 total: 1205)
x013 - (float64) 	 null values: (df: 376	 df_score: 155   	 total: 531)
x014 - (float64) 	 null values: (df: 439	 df_score: 204   	 total: 643)
x032 - (float64) 	 null values: (df: 320	 df_score: 146   	 total: 466)
x039 - (float64) 	 null values: (df: 676	 df_score: 267   	 total: 943)
x048 - (float64) 	 null values: (df: 858	 df_score: 408   	 total: 1266)
x050 - (float64) 	 null values: (df: 963	 df_score: 416   	 total: 1379)
x057 - (category) 	 null values: (df: 1217	 df_score: 523   	 total: 1740)
x060 - (float64) 	 null values: (df: 1005	 df_score: 457   	 total: 1462)
x072 - (category) 	 null values: (df: 251	 df_score: 113   	 total: 364)
x080 - (float64) 	 null values: (df: 79	 df_score: 31   	 total: 110)
x081 - (float64) 	 null values: (df: 1068	 df_score: 464

In [14]:
# Preview the first rows of only those columns that contain null values.
df_train.loc[:, features_with_null_vals].head()

Unnamed: 0,x000,x004,x010,x013,x014,x032,x039,x048,x050,x057,x060,x072,x080,x081,x087,x105,x121,x122,x134,x135,x136
0,0.577321,-0.371609,-1.670166,0.430979,0.331989,0.696806,0.925453,2.123352,-1.509301,1.0,0.646152,0.0,0.548773,0.674836,0.68244,0.961387,0.0,0.738336,0.0,2.64866,-1.322781
1,0.413242,0.206837,4.030491,0.512899,0.771122,0.703253,0.528352,-2.110657,-0.097172,1.0,0.255066,0.0,1.914999,0.275613,0.795767,0.283092,0.0,0.250319,0.0,-3.908599,-0.592003
2,-0.992723,1.052436,-5.396598,0.541646,1.252255,0.607222,0.331816,-3.721994,-1.279555,0.0,0.618782,0.0,0.277789,0.379108,0.683867,0.76215,0.0,0.466936,1.0,-3.103558,0.369791
3,0.37342,-0.435051,-6.630857,0.803078,0.189292,0.596949,0.329405,-0.374439,0.577315,0.0,0.634592,0.0,0.146524,0.435222,0.202493,0.285201,,0.301979,0.0,0.305652,0.995479
4,-0.69428,-1.376659,-5.991751,0.516681,-0.378682,0.380222,0.54053,-0.330101,-3.136469,0.0,0.71696,1.0,0.595355,0.803919,0.619482,0.891473,0.0,0.830762,0.0,-3.121829,-1.034903


#### Replacing Null Values

* Null values in `category` columns are replaced with the most frequent value (the mode) of the column.
* Null values in the remaining numerical columns are replaced with the mean of the column (all of these columns are of type `float64`).

In [15]:
# Impute missing values with statistics taken from the training frame only and
# apply the same fill value to every frame in `dfs`. Filling each frame with
# its own mean/mode would give train and score rows different replacement
# values for the same feature.
for f in features_with_null_vals:
    train_col = df_train[f]
    is_categorical = train_col.dtype == "category"

    # Compute the fill value before any frame is modified: mode for category
    # columns, mean for the numerical ones.
    fill_value = train_col.mode()[0] if is_categorical else train_col.mean()

    for df_ in dfs:
        col = df_[f]
        # A categorical column rejects values outside its categories, so
        # register the training mode first if this frame has never seen it.
        if (
            is_categorical
            and col.dtype == "category"
            and fill_value not in col.cat.categories
        ):
            col = col.cat.add_categories([fill_value])
        df_[f] = col.fillna(fill_value)

## Save Data

In [16]:
# Persist both processed frames next to the input data, keyed by the output
# path relative to ROOT.
processed_outputs = {
    "data/df_processed.pkl": df_train,
    "data/df_score_processed.pkl": df_score,
}
for rel_path, frame in processed_outputs.items():
    frame.to_pickle(ROOT+rel_path)

# Notes