## import data

In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer,KNNImputer 
import warnings
warnings.filterwarnings('ignore')

## load data

In [2]:
df=pd.read_csv('clv_data.csv')
df['life_time_value']=df['purchases']*20
df.head()

Unnamed: 0.1,Unnamed: 0,id,age,gender,income,days_on_platform,city,purchases,life_time_value
0,0,0,,Male,126895,14.0,San Francisco,0,0
1,1,1,,Male,161474,14.0,Tokyo,0,0
2,2,2,24.0,Male,104723,34.0,London,1,20
3,3,3,29.0,Male,43791,28.0,London,2,40
4,4,4,18.0,Female,132181,26.0,London,2,40


## Checking Null Values

In [3]:
df.duplicated().sum()
df.drop_duplicates(inplace=True)

In [4]:
df.drop(['gender'],axis=1,inplace=True)
df.isnull().sum()

Unnamed: 0             0
id                     0
age                 2446
income                 0
days_on_platform     141
city                   0
purchases              0
life_time_value        0
dtype: int64

In [5]:
def null_summary_table(df1:pd.DataFrame):
    null_values=pd.DataFrame(df1.isnull().sum())
    null_values[1]=null_values[0]/len(df1)
    null_values.columns=['null_v','null_pct']
    return null_values

In [6]:
null_summary_table(df)

Unnamed: 0,null_v,null_pct
Unnamed: 0,0,0.0
id,0,0.0
age,2446,0.4892
income,0,0.0
days_on_platform,141,0.0282
city,0,0.0
purchases,0,0.0
life_time_value,0,0.0


## droped null values

In [7]:
drop_df=df.copy()
drop_df.dropna(inplace=True)
drop_df.head()

Unnamed: 0.1,Unnamed: 0,id,age,income,days_on_platform,city,purchases,life_time_value
2,2,2,24.0,104723,34.0,London,1,20
3,3,3,29.0,43791,28.0,London,2,40
4,4,4,18.0,132181,26.0,London,2,40
5,5,5,23.0,12315,14.0,New York City,0,0
8,8,8,46.0,129157,23.0,New York City,0,0


In [8]:
x_d= drop_df.drop('life_time_value',axis=1)
y_d=drop_df['life_time_value']

x_train_d=x_d[:4000]
y_train_d=y_d[:4000]

x_test_d=x_d[1000:]
y_test_d=y_d[1000:]

## Mean/Median/Mode imputation

In [9]:
m_df=df.copy()
x_m=m_df.drop('life_time_value',axis=1)
y_m=m_df['life_time_value']

x_train_m=x_m[:4000]
y_train_m=y_m[:4000]

x_test_m=x_m[1000:]
y_test_m=y_m[1000:]

In [10]:
x_train_m['age']=x_train_m['age'].fillna(np.mean(x_train_m['age']))
x_test_m['age']=x_test_m['age'].fillna(np.mean(x_train_m['age']))

x_train_m['days_on_platform']=x_train_m['days_on_platform'].fillna(np.mean(x_train_m['days_on_platform']))
x_test_m['days_on_platform']=x_test_m['days_on_platform'].fillna(np.mean(x_train_m['days_on_platform']))

## Multiple Imputation Using Regression

In [11]:
r_df=df.copy()
x_r=r_df.drop('life_time_value',axis=1)
y_r=r_df['life_time_value']

x_train_r=x_r[:4000]
y_train_r=y_r[:4000]

x_test_r=x_r[1000:]
y_test_r=y_r[1000:]

imp=IterativeImputer(max_iter=10,random_state=0).fit(x_train_r)
x_train_r=imp.transform(x_train_r)
x_test_r=imp.transform(x_test_r)

ValueError: could not convert string to float: 'San Francisco'

## Nearest Neighbor Imputation

In [None]:
imputer=KNNImputer(n_neighbors=5)
imputer.fit(x_train_r)
x_train_k=imp.transform(x_train_r)
x_test_k=imp.transform(x_test_r)

y_train_k=y_train_r.copy()
y_test_k=y_test_r.copy()

## Comparison

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error ,mean_absolute_error

In [None]:
clf_n=RandomForestRegressor(random_state=0)
clf_n.fit(x_train_d,y_train_d)
drop_pre=clf_n.predict(x_test_d)

clf_n=RandomForestRegressor(random_state=0)
clf_n.fit(x_train_m,y_train_m)
mean_pre=clf_n.predict(x_test_m)

clf_n=RandomForestRegressor(random_state=0)
clf_n.fit(x_train_r,y_train_r)
reg_pre=clf_n.predict(x_test_r)

clf_n=RandomForestRegressor(random_state=0)
clf_n.fit(x_train_k,y_train_k)
kN_pre=clf_n.predict(x_test_k)



print('Drop Null MAE Score: %.3f' % mean_absolute_error(y_test_d,drop_pre))
print('Mean Impute MAE Score: %.3f' % mean_absolute_error(y_test_m,mean_pre))
print('Regression MAE Score: %.3f '% mean_absolute_error(y_test_r,reg_pre))
print('Nearest Neighbor MAE Score: %.3f'% mean_absolute_error(y_test_k,kN_pre))