In [1]:
# EndTailImputer / End of Distribution Imputation:

# When data are missing not at random (MNAR), the missingness itself is informative, so we replace missing values
# with values that lie at the tail of the variable's distribution.

# Note:
# The imputed values are placed where outliers would normally sit, so this imputation also covers the outlier region.
# "End of the distribution" here means mean + 3 standard deviations (the right tail).

#### Import Required Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation notices that may matter for later cells.
warnings.filterwarnings("ignore")

#### Collecting Data

In [3]:
# Load the wine-quality CSV from its location on the author's machine.
wine_csv_path = r"C:\Users\Mr.MACHINE\Videos\Captures\DATA SCIENCE\PRACTICE\Data PreProcessing\3.Handling Missing Value\WineQuality.csv"
df = pd.read_csv(wine_csv_path)

#### Display First 5 records of data

In [4]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


#### Checking the shape of the dataset, i.e. how many rows and columns it has.

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(6497, 13)

#### Let's check the column-wise non-null counts and the datatype of each feature

In [6]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  6497 non-null   object 
 1   fixed acidity         6487 non-null   float64
 2   volatile acidity      6489 non-null   float64
 3   citric acid           6494 non-null   float64
 4   residual sugar        6495 non-null   float64
 5   chlorides             6495 non-null   float64
 6   free sulfur dioxide   6497 non-null   float64
 7   total sulfur dioxide  6497 non-null   float64
 8   density               6497 non-null   float64
 9   pH                    6488 non-null   float64
 10  sulphates             6493 non-null   float64
 11  alcohol               6497 non-null   float64
 12  quality               6497 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB


#### Counting the null values in each feature

In [7]:
# Number of missing entries per column before imputation.
df.isna().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

#### Calculating the end-of-tail value (mean + 3 standard deviations)

In [8]:
def end_tail_value(series, fold=3):
    """Return the right-tail imputation value for a numeric column.

    Computes ``series.mean() + fold * series.std()``. pandas skips NaN in
    both statistics by default, so missing entries do not affect the result.

    Parameters
    ----------
    series : pandas.Series
        Numeric column to summarise.
    fold : int or float, default 3
        Number of standard deviations to move to the right of the mean.

    Returns
    -------
    float
        The value used to fill missing entries of ``series``.
    """
    return series.mean() + fold * series.std()


# One end-of-tail value per column that contains missing data.
dev_fixed_acidity = end_tail_value(df["fixed acidity"])
dev_volatile_acidity = end_tail_value(df["volatile acidity"])
dev_citric_acid = end_tail_value(df["citric acid"])
dev_residual_sugar = end_tail_value(df["residual sugar"])
dev_chlorides = end_tail_value(df["chlorides"])
dev_pH = end_tail_value(df["pH"])
dev_sulphates = end_tail_value(df["sulphates"])

In [9]:
# Report each computed end-of-tail value; labels are kept exactly as originally printed.
tail_report = (
    ("fixed acidity", dev_fixed_acidity),
    ("volatile_acidity", dev_volatile_acidity),
    ("citric_acid", dev_citric_acid),
    ("residual_sugar", dev_residual_sugar),
    ("chlorides", dev_chlorides),
    ("pH", dev_pH),
    ("sulphates", dev_sulphates),
)
for label, tail_value in tail_report:
    print(f"3rdDeviation of {label} : {tail_value}")

3rdDeviation of fixed acidity : 11.106828882050522
3rdDeviation of volatile_acidity : 0.8336381014976812
3rdDeviation of citric_acid : 0.7545162987377465
3rdDeviation of residual_sugar : 19.71870063294501
3rdDeviation of chlorides : 0.16114964612156837
3rdDeviation of pH : 3.700640419036124
3rdDeviation of sulphates : 0.9776575187309238


#### Replacing the missing values using the fillna() method

In [10]:
# Map each column with missing data to the end-of-tail value computed for it.
end_tail_fill_values = {
    "fixed acidity": dev_fixed_acidity,
    "volatile acidity": dev_volatile_acidity,
    "citric acid": dev_citric_acid,
    "residual sugar": dev_residual_sugar,
    "chlorides": dev_chlorides,
    "pH": dev_pH,
    "sulphates": dev_sulphates,
}

# Assign the filled column back rather than calling fillna(..., inplace=True)
# on df[column]: the in-place form acts on an intermediate object and, under
# pandas Copy-on-Write, leaves df unchanged.
for column, fill_value in end_tail_fill_values.items():
    df[column] = df[column].fillna(fill_value)

In [11]:
# Confirm that no missing entries remain after the fill.
df.isna().sum()

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

### Using Feature-Engine Library

In [12]:
# Read the raw CSV again so this section starts from un-imputed data.
raw_csv_path = r"C:\Users\Mr.MACHINE\Videos\Captures\DATA SCIENCE\PRACTICE\Data PreProcessing\3.Handling Missing Value\WineQuality.csv"
df1 = pd.read_csv(raw_csv_path)

In [13]:
# Missing entries per column in the freshly loaded frame.
df1.isna().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

In [14]:
# Feature matrix: every column except the last one.
X = df1.iloc[:,:-1]

In [15]:
# Preview the feature columns.
X.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [16]:
# Target vector: the last column of the frame.
Y = df1.iloc[:,-1]

In [17]:
# Preview the target values.
Y.head()

0    6
1    6
2    6
3    6
4    6
Name: quality, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 25% of the rows for testing. The fixed random_state makes the split,
# and every missing-value count derived from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

In [20]:
# Show the shape of each split in the order: X_train, X_test, y_train, y_test.
split_parts = (X_train, X_test, y_train, y_test)
print(*(part.shape for part in split_parts))

(4872, 12) (1625, 12) (4872,) (1625,)


In [21]:
# Missing entries per column in the training split before imputation.
X_train.isna().sum()

type                    0
fixed acidity           4
volatile acidity        5
citric acid             2
residual sugar          2
chlorides               2
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      5
sulphates               4
alcohol                 0
dtype: int64

In [22]:
from feature_engine.imputation import EndTailImputer

In [23]:
# Configure a right-tail, Gaussian end-of-distribution imputer with fold 3.
endtail_imputer = EndTailImputer(
    imputation_method="gaussian",
    tail="right",
    fold=3,
)

In [24]:
# Fit the imputer on the training split only.
endtail_imputer.fit(X_train)

In [25]:
# Rebinds X_train to the imputed frame; the un-imputed split is no longer reachable under this name.
X_train = endtail_imputer.transform(X_train)

In [26]:
# Confirm the training split has no missing entries after imputation.
X_train.isna().sum()

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64

In [27]:
# Apply the imputer that was fitted on X_train to the test split (no refit here).
X_test = endtail_imputer.transform(X_test)

In [28]:
# Confirm the test split has no missing entries after imputation.
X_test.isna().sum()

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64