In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import wiseimputer
from wiseimputer.utils import Imputer

In [6]:
# Read the raw training table, then build a reproducible 500-row working
# sample (fixed random_state) with a fresh 0..n-1 index and without the
# identifier-like columns.
# NOTE(review): the CSV location is an absolute, machine-specific path;
# consider moving it to a configurable data directory.
train = pd.read_csv(r"C:\Users\ZimaIT\Downloads\trainspaceship.csv")
Sample = (
    train
    .sample(500, random_state=42)
    .reset_index(drop=True)
    .drop(columns=['PassengerId', 'Name', 'Cabin'])
)
Sample

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Mars,False,TRAPPIST-1e,19.0,False,417.0,349.0,634.0,3.0,1057.0,True
1,Earth,False,TRAPPIST-1e,18.0,False,4.0,904.0,0.0,0.0,1.0,False
2,Earth,True,TRAPPIST-1e,41.0,False,0.0,0.0,0.0,0.0,0.0,False
3,Earth,False,TRAPPIST-1e,35.0,False,0.0,338.0,436.0,,0.0,True
4,Europa,True,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...
495,Earth,True,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,True
496,Earth,False,TRAPPIST-1e,46.0,False,0.0,4.0,834.0,0.0,32.0,True
497,Europa,False,TRAPPIST-1e,60.0,False,0.0,190.0,3.0,1964.0,30.0,False
498,Earth,True,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,,False


## Correlation Coefficient
When the third argument is less than 1, it is treated as a correlation-coefficient threshold. In this case, to fill the missing values of each column, the imputer uses only those features whose correlation coefficient with that column is greater than this number (the similarity).

$\text{similarity (correlation coefficient)} = 0.1$

In [4]:
# Impute the missing values in Sample. Per this notebook's description, a
# third argument below 1 is treated as a correlation-coefficient threshold
# (0.1 here).
# NOTE(review): 'Transported' is presumably the target column and the two
# estimator classes presumably handle numeric / categorical columns
# respectively -- confirm against the wiseimputer documentation.
final = Imputer(
    Sample,
    'Transported',
    0.1,
    GradientBoostingRegressor,
    ExtraTreesClassifier,
)
final

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,CryoSleep,Destination,VIP,Transported
0,19.0,417.0,349.0,634.0,3.000000,1057.000000,0.0,0.0,0.0,0.0,True
1,18.0,4.0,904.0,0.0,0.000000,1.000000,1.0,0.0,0.0,0.0,False
2,41.0,0.0,0.0,0.0,0.000000,0.000000,1.0,1.0,0.0,0.0,False
3,35.0,0.0,338.0,436.0,0.061923,0.000000,1.0,0.0,0.0,0.0,True
4,43.0,0.0,0.0,0.0,0.000000,0.000000,2.0,1.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...
495,15.0,0.0,0.0,0.0,0.000000,0.000000,1.0,1.0,0.0,0.0,True
496,46.0,0.0,4.0,834.0,0.000000,32.000000,1.0,0.0,0.0,0.0,True
497,60.0,0.0,190.0,3.0,1964.000000,30.000000,2.0,0.0,0.0,0.0,False
498,0.0,0.0,0.0,0.0,0.000000,0.047693,1.0,1.0,0.0,0.0,False


## N Nearest Features by Mutual Information
When the third argument is an integer greater than or equal to 1, it is treated as n, the number of nearest features: for each feature that has missing values, the imputer uses the n features closest to it, as ranked by their mutual information score with that feature.

In [7]:
# Impute the missing values in Sample. Per this notebook's description, an
# integer third argument >= 1 is treated as the number of nearest features,
# chosen by mutual information score (3 here).
# NOTE(review): 'Transported' is presumably the target column and the two
# estimator classes presumably handle numeric / categorical columns
# respectively -- confirm against the wiseimputer documentation.
final = Imputer(
    Sample,
    'Transported',
    3,
    GradientBoostingRegressor,
    ExtraTreesClassifier,
)
final

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,CryoSleep,Destination,VIP,Transported
0,19.0,417.0,349.0,634.0,3.000000,1057.000000,0.0,0.0,0.0,0.0,True
1,18.0,4.0,904.0,0.0,0.000000,1.000000,1.0,0.0,0.0,0.0,False
2,41.0,0.0,0.0,0.0,0.000000,0.000000,1.0,1.0,0.0,0.0,False
3,35.0,0.0,338.0,436.0,78.102883,0.000000,1.0,0.0,0.0,0.0,True
4,43.0,0.0,0.0,0.0,0.000000,0.000000,2.0,1.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...
495,15.0,0.0,0.0,0.0,0.000000,0.000000,1.0,1.0,0.0,0.0,True
496,46.0,0.0,4.0,834.0,0.000000,32.000000,1.0,0.0,0.0,0.0,True
497,60.0,0.0,190.0,3.0,1964.000000,30.000000,2.0,0.0,0.0,0.0,False
498,0.0,0.0,0.0,0.0,0.000000,5.845768,1.0,1.0,0.0,0.0,False
