In [1]:
import pandas as pd
import numpy as np

### Multivariate feature imputation
#### IterativeImputer
https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html

In [2]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [3]:
imp = IterativeImputer(max_iter=100, random_state=0)

In [4]:
features = [[4, 2, 1], 
            [24, 12, 6], 
            [8, np.nan, 2], 
            [28, 14, 7], 
            [32, 16, np.nan], 
            [600, 300, 150], 
            [np.nan, 60, 30], 
            [np.nan, np.nan, 1]]

In [5]:
imp.fit(features)

In [6]:
imp.transform(features)

array([[  4.        ,   2.        ,   1.        ],
       [ 24.        ,  12.        ,   6.        ],
       [  8.        ,   3.99966002,   2.        ],
       [ 28.        ,  14.        ,   7.        ],
       [ 32.        ,  16.        ,   7.92735309],
       [600.        , 300.        , 150.        ],
       [120.00314828,  60.        ,  30.        ],
       [  5.58961604,   2.79614869,   1.        ]])

#### By fitting the model it understand that each value in a row is half of the previous value so now if we will give some 2D array which contains NaN values, it is fill the values according to the pattern it learned
We can see that for two NaN values it is not as accurate as for one NaN value

In [7]:
X_test = [[np.nan, 24, 12], 
          [36, np.nan, np.nan], 
          [100, np.nan, 25], 
          [np.nan, 6, 3],
          [np.nan, 8, np.nan]]

In [8]:
imp.transform(X_test)

array([[ 48.00364638,  24.        ,  12.        ],
       [ 36.        ,  17.99997418,   8.92708811],
       [100.        ,  49.9996788 ,  25.        ],
       [ 12.00389542,   6.        ,   3.        ],
       [ 16.12053702,   8.        ,   5.86176342]])

In [10]:
diabetes = pd.read_csv('../datasets/diabetes_processed_incomplete.csv')

diabetes.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,32.0,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,32.0,,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.405184,32.0,,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,32.0,,32.0,0.232,54,1


In [11]:
diabetes.shape

(768, 9)

In [12]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   394 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(6), int64(3)
memory usage: 54.1 KB


In [13]:
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                     374
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [14]:
diabetes_features = diabetes.drop('Outcome', axis=1)
diabetes_label = diabetes[['Outcome']]

diabetes_features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,,33.6,0.627,50
1,1,85.0,66.0,29.0,,26.6,0.351,31
2,8,183.0,64.0,32.0,,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33


In [15]:
imp = IterativeImputer(max_iter=10000, random_state=0)
imp.fit(diabetes_features)

In [16]:
diabetes_features_arr = imp.transform(diabetes_features)
diabetes_features_arr.shape

(768, 8)

In [17]:
diabetes_features = pd.DataFrame(diabetes_features_arr, columns=diabetes_features.columns)

diabetes_features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,219.028414,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,70.34155,26.6,0.351,31.0
2,8.0,183.0,64.0,32.0,270.573172,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0


In [18]:
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                     374
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [None]:
#diabetes.to_csv('datasets/diabetes_processed.csv', index=False)

!ls datasets