# Mice 

### MICE stands for Multivariate Imputation By Chained Equations algorithm, a technique by which we can effortlessly impute missing values in a dataset by looking at data from other columns and trying to estimate the best prediction for each missing value. 

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression


In [2]:
data = {
    
    "age":[25,27,29,31,33,35],
    "experience":[1,3,5,7,9,11],
    "salary(K)":[50,70,80,90,100,130],
    "loan":[1,1,0,0,1,0]
    
}

In [3]:
df = pd.DataFrame(data)
df

Unnamed: 0,age,experience,salary(K),loan
0,25,1,50,1
1,27,3,70,1
2,29,5,80,0
3,31,7,90,0
4,33,9,100,1
5,35,11,130,0


In [4]:
df = df.drop(columns=["loan"])
df

Unnamed: 0,age,experience,salary(K)
0,25,1,50
1,27,3,70
2,29,5,80
3,31,7,90
4,33,9,100
5,35,11,130


## Remove values

In [5]:
df.age.loc[5] = np.nan
df.experience.loc[0] = np.nan
df["salary(K)"].loc[1] = np.nan

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,age,experience,salary(K)
0,25.0,,50.0
1,27.0,3.0,
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,,11.0,130.0


## Step 1
### Impute all missing values using mean imputation with the mean of their respective columns.

In [6]:
df_mean = df.copy() 
df_mean.fillna((df_mean.mean()),inplace=True)
df_mean = df_mean.astype(int)
df_mean

Unnamed: 0,age,experience,salary(K)
0,25,7,50
1,27,3,90
2,29,5,80
3,31,7,90
4,33,9,100
5,29,11,130


# Step 2
## Replace again nan value 

## Column 1 (Age)

In [7]:
c1 = df_mean.copy()
c1.age.loc[5] = np.nan

In [8]:
c1

Unnamed: 0,age,experience,salary(K)
0,25.0,7,50
1,27.0,3,90
2,29.0,5,80
3,31.0,7,90
4,33.0,9,100
5,,11,130


## Seperate null row

In [9]:
test = c1.dropna().copy()
test

Unnamed: 0,age,experience,salary(K)
0,25.0,7,50
1,27.0,3,90
2,29.0,5,80
3,31.0,7,90
4,33.0,9,100


# Seprate data between x and y 

In [10]:
y = test.age
x = test.drop(columns=["age"]).values

In [11]:
x

array([[  7,  50],
       [  3,  90],
       [  5,  80],
       [  7,  90],
       [  9, 100]])

In [12]:
y

0    25.0
1    27.0
2    29.0
3    31.0
4    33.0
Name: age, dtype: float64

# Now Train Data 

In [13]:
clf = LinearRegression()
clf.fit(x,y)

# predict the value of age

In [14]:
pred_value = c1.loc[5].dropna()
pred_value = [pred_value]
pred_value

[experience     11.0
 salary(K)     130.0
 Name: 5, dtype: float64]

In [15]:
pred_age = clf.predict(pred_value)
pred_age

array([38.75])

# Update age value

In [16]:
df.age.loc[5] = pred_age[0]
df

Unnamed: 0,age,experience,salary(K)
0,25.0,,50.0
1,27.0,3.0,
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,38.75,11.0,130.0


# We will be imputing the columns from left to right.

# Now same thing others column 1 by 1

# Column 2 (experience)

In [17]:
c2 = df_mean.copy()
c2.experience.loc[0] = np.nan
c2

Unnamed: 0,age,experience,salary(K)
0,25,,50
1,27,3.0,90
2,29,5.0,80
3,31,7.0,90
4,33,9.0,100
5,29,11.0,130


In [18]:
test = c2.dropna().copy()
test

Unnamed: 0,age,experience,salary(K)
1,27,3.0,90
2,29,5.0,80
3,31,7.0,90
4,33,9.0,100
5,29,11.0,130


In [19]:
y = test.experience
x = test.drop(columns=["experience"]).values

In [20]:
y

1     3.0
2     5.0
3     7.0
4     9.0
5    11.0
Name: experience, dtype: float64

In [21]:
x

array([[ 27,  90],
       [ 29,  80],
       [ 31,  90],
       [ 33, 100],
       [ 29, 130]])

In [22]:
clf = LinearRegression()
clf.fit(x,y)

In [23]:
pred_value = c2.loc[0].dropna()
pred_value = [pred_value]
pred_value

[age          25.0
 salary(K)    50.0
 Name: 0, dtype: float64]

In [24]:
pred_experence = clf.predict(pred_value)
pred_experence

array([-2.75])

In [25]:
df.experience.loc[0] = pred_experence[0]
df

Unnamed: 0,age,experience,salary(K)
0,25.0,-2.75,50.0
1,27.0,3.0,
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,38.75,11.0,130.0


# Column 3

In [26]:
c3 = df_mean.copy()
c3["salary(K)"].loc[1] = np.nan
c3

Unnamed: 0,age,experience,salary(K)
0,25,7,50.0
1,27,3,
2,29,5,80.0
3,31,7,90.0
4,33,9,100.0
5,29,11,130.0


In [27]:
test = c3.dropna().copy()
test

Unnamed: 0,age,experience,salary(K)
0,25,7,50.0
2,29,5,80.0
3,31,7,90.0
4,33,9,100.0
5,29,11,130.0


In [28]:
y = test["salary(K)"]
x = test.drop(columns=["salary(K)"]).values

In [29]:
y

0     50.0
2     80.0
3     90.0
4    100.0
5    130.0
Name: salary(K), dtype: float64

In [30]:
x

array([[25,  7],
       [29,  5],
       [31,  7],
       [33,  9],
       [29, 11]])

In [31]:
clf = LinearRegression()
clf.fit(x,y)

In [32]:
pred_value = c3.loc[1].dropna()
pred_value = [pred_value]
pred_value

[age           27.0
 experience     3.0
 Name: 1, dtype: float64]

In [33]:
pred_salary = clf.predict(pred_value)
pred_salary

array([40.])

In [34]:
df["salary(K)"].loc[1] = pred_salary[0]
new_data = df.copy()

# new data

In [35]:
new_data

Unnamed: 0,age,experience,salary(K)
0,25.0,-2.75,50.0
1,27.0,3.0,40.0
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,38.75,11.0,130.0


# mean data

In [36]:
df_mean

Unnamed: 0,age,experience,salary(K)
0,25,7,50
1,27,3,90
2,29,5,80
3,31,7,90
4,33,9,100
5,29,11,130


# Now Subtract data

In [37]:
df_mean - new_data

Unnamed: 0,age,experience,salary(K)
0,0.0,9.75,0.0
1,0.0,0.0,50.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,-9.75,0.0,0.0


#  Now Repeat This Technique until Differenece are not close to zero

## Note 
## 0 iter = mean
## 1 iter = new_data
## 2 iter = 2nd_new_data and so on

* ## 0 iter - 1 iter
* ## 1 iter - 2 iter



# Create Own Mice Algorithm

In [38]:
class Mice:
    
    missed = None
    mean_data = None
    new_data = None
    apply_Alg = None
    
    # check arg
    def __init__(self,data,values=None,Alg=None):
        self.data = data
        self.values = values
        self.Alg = Alg
        Mice.mean_data = self.get_mean()
        
        
        # check value argumrnt
        
        if values == None:
            
            # default original data nan values
            Mice.missed = self.get_missed_value()
        
        else:
            Mice.missed = self.values
            
        # check Algorithm argument
        
        if Alg == None:
            # default Algorithm is LinearRegression
            Mice.apply_Alg = LinearRegression()
        
        else:
            Mice.apply_Alg = self.Alg
            
        
        Mice.new_data = self.Algo()
            
    def get_missed_value(self):
        ind = {}
        for i in self.data.columns:
            r = self.data.loc[self.data[i].isnull()].index
            ind.update({i:list(r)})
        return ind
    
    def get_mean(self):
        tmp = self.data.copy()
        for i in tmp.columns:
        
            # add index
            r = self.data.loc[self.data[i].isnull()].index
            tmp[i].fillna(tmp[i].mean(),inplace=True)
        return tmp
        
    
    def Algo(self):
        tmp1 = Mice.mean_data.copy()
        
        for k,v in Mice.missed.items():
            
            for j in v:
        
            # replace again nan with column wise
                tmp1[k].loc[j] = np.nan
            
            # test data 
                test = tmp1.loc[j].dropna()
    
            # split data input and outputs for training 
                y = tmp1.dropna()[k]
                x = tmp1.dropna().drop(columns=k).values
        
            
            # apply algorithm
                lr = self.apply_Alg
            
            # training
                lr.fit(x,y)
            
            # pred
                c = lr.predict(test.values.reshape(1,-1))
                c = np.round(c,1)

        
            # fill pred (c) value with nan
                tmp1.loc[j].fillna(c[0],inplace=True)
                
                
        return tmp1

# Data

In [39]:
df.age.loc[5] = np.nan
df.experience.loc[0] = np.nan
df["salary(K)"].loc[1] = np.nan

df

Unnamed: 0,age,experience,salary(K)
0,25.0,,50.0
1,27.0,3.0,
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,,11.0,130.0


# Extract Colmn

In [40]:
nan = Mice(df).missed
nan

{'age': [5], 'experience': [0], 'salary(K)': [1]}

# 1 Iteration 

In [41]:
_1_iter = Mice(df,nan)
_1_iter.new_data

Unnamed: 0,age,experience,salary(K)
0,25.0,4.1,50.0
1,27.0,3.0,68.6
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,38.8,11.0,130.0


In [42]:
_1_iter.mean_data - _1_iter.new_data

Unnamed: 0,age,experience,salary(K)
0,0.0,2.9,0.0
1,0.0,0.0,21.4
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,-9.8,0.0,0.0


# 2 Iteration 

In [43]:
_2_iter = Mice(_1_iter.new_data,nan)
_2_iter.new_data

Unnamed: 0,age,experience,salary(K)
0,25.0,3.9,50.0
1,27.0,3.0,70.0
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,37.4,11.0,130.0


In [44]:
_1_iter.new_data - _2_iter.new_data

Unnamed: 0,age,experience,salary(K)
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,0.0,0.0,0.0


# 3 Iteration

In [45]:
_3_iter = Mice(_2_iter.new_data,nan)
_3_iter.new_data

Unnamed: 0,age,experience,salary(K)
0,25.0,4.0,50.0
1,27.0,3.0,70.0
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,37.4,11.0,130.0


In [46]:
_2_iter.new_data - _3_iter.new_data

Unnamed: 0,age,experience,salary(K)
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,0.0,0.0,0.0


# 4 Iteration

In [47]:
_4_iter = Mice(_3_iter.new_data,nan)
_4_iter.new_data

Unnamed: 0,age,experience,salary(K)
0,25.0,4.0,50.0
1,27.0,3.0,70.0
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,37.4,11.0,130.0


# Apply Different Algorithm

# Original Data

In [48]:
data = {
    
    "age":[25,27,29,31,33,35],
    "experience":[1,3,5,7,9,11],
    "salary(K)":[50,70,80,90,100,130],
    
}

org_data = pd.DataFrame(data)
org_data

Unnamed: 0,age,experience,salary(K)
0,25,1,50
1,27,3,70
2,29,5,80
3,31,7,90
4,33,9,100
5,35,11,130


In [49]:
df = org_data.copy()

In [50]:
df.age.iloc[5] = np.nan
df.experience.iloc[0] = np.nan
df["salary(K)"].iloc[1] = np.nan

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,age,experience,salary(K)
0,25.0,,50.0
1,27.0,3.0,
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,,11.0,130.0


# LassoCV

In [51]:
org_data

Unnamed: 0,age,experience,salary(K)
0,25,1,50
1,27,3,70
2,29,5,80
3,31,7,90
4,33,9,100
5,35,11,130


In [52]:
from sklearn.linear_model import LassoCV

clf_1 = Mice(df,nan,LassoCV())
clf_1.new_data

Unnamed: 0,age,experience,salary(K)
0,25.0,0.9,50.0
1,27.0,3.0,64.0
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,34.9,11.0,130.0


In [53]:
clf_1.mean_data - clf_1.new_data 

Unnamed: 0,age,experience,salary(K)
0,0.0,6.1,0.0
1,0.0,0.0,26.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,-5.9,0.0,0.0


In [54]:
org_data

Unnamed: 0,age,experience,salary(K)
0,25,1,50
1,27,3,70
2,29,5,80
3,31,7,90
4,33,9,100
5,35,11,130


# Using Mice Algorithm with Sklearn

In [55]:
from sklearn.impute import IterativeImputer

ImportError: cannot import name 'IterativeImputer' from 'sklearn.impute' (/home/sweeterror404/.local/lib/python3.9/site-packages/sklearn/impute/__init__.py)

# Note the is beta version So 
## First, we need to import enable_iterative_imputer which is like a switch so that scikit-learn knows that we want to use the experimental version of Iterative Imputer.

In [56]:
from sklearn.experimental import enable_iterative_imputer

In [57]:
from sklearn.impute import IterativeImputer

# Common Parameters

* ### estimator (Algorithm), default=BayesianRidge()

* ### missing_valuesint or np.nan, default=np.nan

* ### max_iterint, default=10

* ### tolfloat, default=1e-3  

* ### initial_strategy{‘mean’, ‘median’, ‘most_frequent’, ‘constant’}, default=’mean’ (for data type)

* ### imputation_order{‘ascending’, ‘descending’, ‘roman’, ‘arabic’, ‘random’}, default=’ascending’
     <br>   
    
    * ### 'ascending': From features with fewest missing values to most.
    * ### 'descending': From features with most missing values to fewest.
    * ### 'roman': Left to right.
    * ### 'arabic': Right to left.
    * ### 'random': A random order for each round.
<br>
* ### add_indicatorbool, default=False

In [58]:
df

Unnamed: 0,age,experience,salary(K)
0,25.0,,50.0
1,27.0,3.0,
2,29.0,5.0,80.0
3,31.0,7.0,90.0
4,33.0,9.0,100.0
5,,11.0,130.0


In [59]:
algo = LinearRegression()

In [60]:
im = IterativeImputer(verbose=2,max_iter=30,tol=1e-10,imputation_order="roman",initial_strategy="mean")

In [61]:
im.fit(df)

[IterativeImputer] Completing matrix with shape (6, 3)
[IterativeImputer] Ending imputation round 1/30, elapsed time 0.00
[IterativeImputer] Change: 23.5384159257531, scaled tolerance: 1.3e-08 
[IterativeImputer] Ending imputation round 2/30, elapsed time 0.01
[IterativeImputer] Change: 1.943777989240715, scaled tolerance: 1.3e-08 
[IterativeImputer] Ending imputation round 3/30, elapsed time 0.01
[IterativeImputer] Change: 4.869505660753958, scaled tolerance: 1.3e-08 
[IterativeImputer] Ending imputation round 4/30, elapsed time 0.02
[IterativeImputer] Change: 2.9108292318771873, scaled tolerance: 1.3e-08 
[IterativeImputer] Ending imputation round 5/30, elapsed time 0.02
[IterativeImputer] Change: 4.696239084677245, scaled tolerance: 1.3e-08 
[IterativeImputer] Ending imputation round 6/30, elapsed time 0.03
[IterativeImputer] Change: 0.7842013385703694, scaled tolerance: 1.3e-08 
[IterativeImputer] Ending imputation round 7/30, elapsed time 0.03
[IterativeImputer] Change: 0.25079466



In [62]:
new = im.transform(df)

[IterativeImputer] Completing matrix with shape (6, 3)
[IterativeImputer] Ending imputation round 1/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 2/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 3/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 4/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 5/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 6/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 7/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 8/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 9/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 10/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 11/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 12/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 13/30, elapsed time 0.00
[IterativeImputer] Ending imputation round 14/30, elapsed time 0.01
[I

In [63]:
org_data

Unnamed: 0,age,experience,salary(K)
0,25,1,50
1,27,3,70
2,29,5,80
3,31,7,90
4,33,9,100
5,35,11,130


In [64]:
new = pd.DataFrame(new,columns=df.columns,dtype=int)
new

Unnamed: 0,age,experience,salary(K)
0,25,1,50
1,27,3,63
2,29,5,80
3,31,7,90
4,33,9,100
5,35,11,130
