# Missing Values

Handling missing value is in a right war crucial while building a model. We'll use some strategies, how it's done.


In [2]:
# Import Libraries

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

In [3]:
# loda data
#
import random

df = pd.read_csv("../data/clv_data.csv")

df['lifetime_value'] = df['purchases'] * 20

df.head()

Unnamed: 0.1,Unnamed: 0,id,age,gender,income,days_on_platform,city,purchases,lifetime_value
0,0,0,,Male,126895,14.0,San Francisco,0,0
1,1,1,,Male,161474,14.0,Tokyo,0,0
2,2,2,24.0,Male,104723,34.0,London,1,20
3,3,3,29.0,Male,43791,28.0,London,2,40
4,4,4,18.0,Female,132181,26.0,London,2,40


In [4]:
# Check null values
df.isnull().sum()

Unnamed: 0             0
id                     0
age                 2446
gender                 0
income                 0
days_on_platform     141
city                   0
purchases              0
lifetime_value         0
dtype: int64

In [5]:
# See null values in percentage
def null_values_pergentage(df):

    null_values = pd.DataFrame(df.isnull().sum())
    null_values[1] = null_values[0]/len(df)
    null_values.columns = ['null_count','null_pct']
    return null_values

null_values_pergentage(df)

Unnamed: 0,null_count,null_pct
Unnamed: 0,0,0.0
id,0,0.0
age,2446,0.4892
gender,0,0.0
income,0,0.0
days_on_platform,141,0.0282
city,0,0.0
purchases,0,0.0
lifetime_value,0,0.0


In [6]:
# Drop dull values

drop_df = df.copy()

drop_df = drop_df.dropna()

In [7]:
X_drop = drop_df[['age','days_on_platform','income']]
y_drop = drop_df['lifetime_value']


X_train_drop = X_drop[:4000]
y_train_drop = y_drop[:4000]

X_test_drop = X_drop[1000:]
y_test_drop = y_drop[1000:]

### Mean/Median/Mode Imputation


In [8]:
m_df = df.copy()

X_m = m_df[['age','days_on_platform','income']]
y_m = m_df['lifetime_value']


X_train_m = X_m[:4000]
y_train_m = y_m[:4000]

X_test_m = X_m[1000:]
y_test_m = y_m[1000:]

In [9]:
## Mean
X_train_m.loc[:,'age'] = X_train_m['age'].fillna(np.mean(X_train_m['age']))
X_test_m.loc[:,'age'] = X_test_m['age'].fillna(np.mean(X_train_m['age'])) ## Cannot use training dataset to impute


X_train_m.loc[:,'days_on_platform'] = X_train_m['days_on_platform'].fillna(np.mean(X_train_m['days_on_platform']))
X_test_m.loc[:,'days_on_platform'] = X_test_m['days_on_platform'].fillna(np.mean(X_train_m['days_on_platform'])) ## Cannot use training dataset to impute

In [10]:
## Median
m_df.loc[:,'age'] = df['age'].fillna(np.median(m_df['age']))

In [12]:
## Mode
m_df.loc[:,'age'] = m_df['age'].fillna(stats.mode(m_df['age'])[0])

# Imputation Using Regression


In [13]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer


## Target - Purchases in the first six months

r_df = df.copy()

X_r = r_df[['age','days_on_platform','income']]
y_r = r_df['lifetime_value']


X_train_r = X_r[:4000]
y_train_r = y_r[:4000]

X_test_r = X_r[1000:]
y_test_r = y_r[1000:]


Imp = IterativeImputer(max_iter=10, random_state = 0)
Imp.fit(X_train_r)

X_train_r = Imp.transform(X_train_r)
X_test_r = Imp.transform(X_test_r)

X_train_r = pd.DataFrame(X_train_r)
X_train_r.columns = X_train_r.columns

X_test_r = pd.DataFrame(X_test_r)
X_test_r.columns = X_test_r.columns

r_df = pd.concat([X_train_r,X_test_r],axis = 0)

# Comparison

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Drop Null Model
clf_n = RandomForestRegressor(random_state=0)
clf_n.fit(X_train_drop, y_train_drop)
pred_dropna = clf_n.predict(X_test_drop)

# Mean Imputation Model
clf_m = RandomForestRegressor(random_state=0)
clf_m.fit(X_train_m, y_train_m)
pred_m = clf_m.predict(X_test_m)

# Regression Imputation
clf_r = RandomForestRegressor(random_state=0)
clf_r.fit(X_train_r, y_train_r)
pred_r = clf_r.predict(X_test_r)

In [15]:
#  Let's print and see which way is more effective.

print('Drop Null MAE Score: %.3f' % mean_absolute_error(y_test_drop,pred_dropna))
print('Mean Impute MAE Score: %.3f' % mean_absolute_error(y_test_m,pred_m))
print('Regression MAE Score: %.3f '% mean_absolute_error(y_test_r,pred_r))

Drop Null MAE Score: 7.636
Mean Impute MAE Score: 10.828
Regression MAE Score: 10.795 
