# Nearest Neighbour Imputation
#### Using nearest neighbour imputation (NNI) to fill in the NaN values in the dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer
from sklearn.impute import IterativeImputer

### 1. Import the Dataset

In [None]:
# Load the synthetic data file into a pandas DataFrame.
df = pd.read_csv("maps-synthetic-data-v1.1.csv")
# Lift the column display limit so wide frames are rendered in full.
pd.set_option("display.max_columns", None)
df.head()

### 2. Exploring the Missing Data in the Dataset

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

In [79]:
# Percentage of missing values per column, rounded to two decimal places.
na_percentage = df.isnull().sum().div(len(df)).mul(100).round(2)
print(na_percentage)

# Parallel lists: column names and their missing-value percentages.
na_col = df.columns.tolist()
na_pctlist = na_percentage.tolist()

# The Series printed above is elided in the middle, so list every column explicitly.
for col_name, pct_missing in zip(na_col, na_pctlist):
    print(f"{col_name}: {pct_missing} %")

Unnamed: 0      0.00
X               0.00
flag            0.00
comp_bed_9     40.48
mat_dep        18.85
               ...  
fam_tv_eve     18.64
fam_tv_aft     19.63
fam_tv_mor     19.21
sex             0.00
birth_order     0.00
Length: 85, dtype: float64
Unnamed: 0: 0.0 %
X: 0.0 %
flag: 0.0 %
comp_bed_9: 40.48 %
mat_dep: 18.85 %
mat_age: 2.97 %
weight_16: 61.04 %
height_16: 60.94 %
iq: 46.93 %
comp_noint_bed_16: 96.57 %
comp_int_bed_16: 81.29 %
talk_phon_wend: 64.69 %
text_wend: 64.75 %
talk_mob_wend: 64.8 %
comp_wend: 64.77 %
musi_wend: 64.82 %
read_wend: 64.8 %
work_wend: 64.81 %
alon_wend: 64.69 %
draw_wend: 64.74 %
play_wend: 64.69 %
tv_wend: 64.66 %
out_win_wend: 64.69 %
out_sum_wend: 64.89 %
tran_wend: 64.61 %
talk_phon_week: 63.6 %
text_week: 63.56 %
talk_mob_week: 63.48 %
comp_week: 63.51 %
musi_week: 63.57 %
read_week: 63.64 %
work_week: 63.51 %
alon_week: 63.54 %
draw_week: 63.51 %
play_week: 63.48 %
tv_week: 63.45 %
out_win_week: 63.54 %
out_sum_week: 63.49 %
tran_week: 6

#### K-Nearest Neighbour Imputation

In [111]:
# Keep only the numeric columns for the imputation experiments.
# The selection is made by dtype family instead of testing `dtype != 'O'`,
# because that test also lets through non-numeric, non-object dtypes
# (string, category, datetime) which the imputers cannot process.
# Boolean columns are kept so the selection matches the previous behaviour
# for dtypes inferred from a CSV.
# NOTE(review): 'Unnamed: 0' and 'X' end up in this selection and look like
# row identifiers; if so they take part in the neighbour distance
# computation. Confirm whether they should be excluded before imputing.
numr_data = df.select_dtypes(include=["number", "bool"]).columns.tolist()
df[numr_data].columns

Index(['Unnamed: 0', 'X', 'mat_dep', 'weight_16', 'height_16', 'iq',
       'agg_score', 'parity', 'secd_diag', 'prim_diag', 'panic_score',
       'dep_thoughts', 'dep_score'],
      dtype='object')

In [115]:
# KNN imputation over the numeric columns using 5 neighbours.
knn = KNNImputer(n_neighbors=5, add_indicator=False)

# Fit and impute in a single pass. The neighbour search is expensive, so the
# transformed array is computed once and wrapped straight into a DataFrame
# that carries over the source column labels and row index.
knnpd1 = pd.DataFrame(
    knn.fit_transform(df[numr_data]),
    columns=numr_data,
    index=df.index,
)
knnpd1.head(20)

Unnamed: 0.1,Unnamed: 0,X,mat_dep,weight_16,height_16,iq,agg_score,parity,secd_diag,prim_diag,panic_score,dep_thoughts,dep_score
0,1.0,1.0,3.0,59.294132,181.602831,107.0,11.0,0.0,0.0,0.0,0.0,2.8,0.0
1,2.0,2.0,9.0,68.600018,174.780866,85.0,9.0,1.0,0.4,3.0,0.0,2.8,0.4
2,3.0,3.0,3.0,64.194092,175.856054,107.6,8.0,1.0,0.0,1.0,0.0,2.8,0.0
3,4.0,4.0,5.8,49.812426,160.224186,102.6,9.6,0.0,0.4,3.0,0.0,2.8,0.4
4,5.0,5.0,8.0,62.27003,191.703227,132.0,9.0,1.0,0.0,1.0,0.0,2.8,0.0
5,6.0,6.0,10.0,78.936613,169.722373,106.0,11.0,3.0,0.0,0.0,0.0,2.8,0.0
6,7.0,7.0,0.0,66.918576,180.412111,108.4,13.0,2.0,0.0,1.0,0.0,2.8,0.0
7,8.0,8.0,6.0,84.299663,186.327284,98.0,9.6,0.0,0.0,0.0,0.0,2.8,0.0
8,9.0,9.0,4.0,70.657258,176.027653,108.0,10.0,0.0,0.0,5.0,0.0,2.8,0.0
9,10.0,10.0,0.0,70.529403,187.778207,113.0,13.0,0.0,0.0,1.0,0.0,2.8,0.0


In [120]:
# KNN imputation over the numeric columns using 3 neighbours, for comparison
# against other neighbour counts.
knn2 = KNNImputer(n_neighbors=3, add_indicator=False)

# Fit and impute in a single pass. The neighbour search is expensive, so the
# transformed array is computed once and wrapped straight into a DataFrame
# that carries over the source column labels and row index.
knnpd2 = pd.DataFrame(
    knn2.fit_transform(df[numr_data]),
    columns=numr_data,
    index=df.index,
)
knnpd2.head(20)

Unnamed: 0.1,Unnamed: 0,X,mat_dep,weight_16,height_16,iq,agg_score,parity,secd_diag,prim_diag,panic_score,dep_thoughts,dep_score
0,1.0,1.0,3.0,59.294132,181.602831,107.0,11.0,0.0,0.0,0.0,0.0,2.666667,0.0
1,2.0,2.0,9.0,71.016234,172.091281,85.0,9.0,1.0,0.0,0.0,0.0,2.666667,0.0
2,3.0,3.0,3.0,57.125529,177.843415,108.0,8.0,1.0,0.0,1.666667,0.0,2.666667,0.0
3,4.0,4.0,4.0,49.812426,160.224186,102.666667,10.0,0.0,0.0,1.666667,0.0,2.0,0.0
4,5.0,5.0,8.0,62.27003,191.703227,132.0,9.0,1.0,0.0,1.666667,0.0,2.333333,0.0
5,6.0,6.0,10.0,78.936613,169.722373,106.0,11.0,3.0,0.0,0.0,0.0,2.666667,0.0
6,7.0,7.0,0.0,68.213831,178.109893,106.333333,13.0,2.0,0.0,1.666667,0.0,2.666667,0.0
7,8.0,8.0,3.0,84.299663,186.327284,98.0,10.333333,0.0,0.0,0.0,0.0,2.666667,0.0
8,9.0,9.0,4.0,70.657258,176.027653,108.0,10.0,0.0,0.0,5.0,0.0,2.666667,0.0
9,10.0,10.0,0.0,70.529403,187.778207,113.0,13.0,0.0,0.0,1.666667,0.0,2.666667,0.0


#### Iterative Imputation