In [1]:
# Arbitrary Value Imputation

# Arbitrary value imputation can handle both numerical and categorical variables.
# With this technique, we group the missing values in a column and assign them a new value that is
# far outside the range of that column.
# Typical choices are values like 99999999 or -9999999 for numerical variables, and “Missing” or “Not defined” for categorical variables.
# Assumptions:-
#      ◦ Data is not Missing At Random.
#      ◦ The missing data is imputed with an arbitrary value that is not part of the dataset or Mean/Median/Mode of data.
# Advantages:-
#      ◦ Easy to implement.
#      ◦ We can use it in production.
#      ◦ It retains the importance of “missing values” if it exists.
# Disadvantages:-
#      ◦ Can distort original variable distribution.
#      ◦ Arbitrary values can create outliers.
#      ◦ Extra caution required in selecting the Arbitrary value.
# When to Use:-
#      ◦ When data is not MAR (Missing At Random).
#      ◦ Suitable for all variable types (numerical and categorical).

#### Import Required Libraries

In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Loading Data Set

In [3]:
# Location of the Wine Quality CSV. Set the WINE_QUALITY_CSV environment
# variable to point at your own copy of the file; when it is not set, the
# original author's local path is used as the default.
DATA_PATH = os.environ.get(
    "WINE_QUALITY_CSV",
    r"C:\Users\Mr.MACHINE\Videos\Captures\DATA SCIENCE\PRACTICE\Data PreProcessing\3.Handling Missing Value\WineQuality.csv",
)
df = pd.read_csv(DATA_PATH)

#### Display First 5 records of data

In [4]:
# Preview the first rows of the loaded DataFrame (head() defaults to 5 rows).
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


#### Let's check Data set info

In [5]:
# Summarise column dtypes and non-null counts; any column whose non-null count
# is below the total number of entries contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  6497 non-null   object 
 1   fixed acidity         6487 non-null   float64
 2   volatile acidity      6489 non-null   float64
 3   citric acid           6494 non-null   float64
 4   residual sugar        6495 non-null   float64
 5   chlorides             6495 non-null   float64
 6   free sulfur dioxide   6497 non-null   float64
 7   total sulfur dioxide  6497 non-null   float64
 8   density               6497 non-null   float64
 9   pH                    6488 non-null   float64
 10  sulphates             6493 non-null   float64
 11  alcohol               6497 non-null   float64
 12  quality               6497 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB


#### Checking the Null Values and Calculating the Total Null Values per Column of the Dataset

In [6]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

#### Checking the shape of the dataset, i.e. how many rows and columns it has.

In [7]:
# (number of rows, number of columns) of the full dataset.
df.shape

(6497, 13)

### Let's Divide the dataset into Dependent and Independent variables

In [8]:
# Independent variables: every column except the "quality" target.
x = df.drop(columns="quality")

In [9]:
# Preview the feature frame: the columns of df without "quality".
x.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [10]:
# Dependent variable: the "quality" column on its own.
y = df.loc[:, "quality"]

In [11]:
# Preview the first target values.
y.head()

0    6
1    6
2    6
3    6
4    6
Name: quality, dtype: int64

#### Let's split the data for train and test

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so re-running the notebook yields the same train/test rows
# (and therefore the same downstream outputs).
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

In [13]:
# Report the shape of each split in order:
# train features, test features, train target, test target.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(5197, 12) (1300, 12) (5197,) (1300,)


### Imputation by using the library feature-engine

In [14]:
from feature_engine.imputation import ArbitraryNumberImputer

In [15]:
# Imputer that fills missing values with the arbitrary number 999.
# variables=None (the default) leaves column selection to the imputer.
arb_imputer = ArbitraryNumberImputer(
    arbitrary_number=999,
    variables=None,
)

In [16]:
# Fit on the training split only; the same fitted imputer is then used to
# transform both the training and the test features.
arb_imputer.fit(X_train)

In [17]:
# Fill the missing values in the training features with the fitted imputer.
# Note: X_train is rebound, so the un-imputed split is no longer available.
X_train = arb_imputer.transform(X_train)

In [18]:
# Apply the imputer fitted on the training split to the test features
# (the imputer is not re-fitted on the test data).
X_test = arb_imputer.transform(X_test)

#### Here we can see the variables that are imputed

In [19]:
# Columns selected by the fitted imputer. No `variables` argument was passed,
# and the list covers every numeric feature; the object-typed "type" column
# is not included.
arb_imputer.variables_

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

#### Here we can see the Arbitrary Number assigned to each variable

In [20]:
# Mapping of each selected column to the value used to fill its missing
# entries (999 for every column here).
arb_imputer.imputer_dict_

{'fixed acidity': 999,
 'volatile acidity': 999,
 'citric acid': 999,
 'residual sugar': 999,
 'chlorides': 999,
 'free sulfur dioxide': 999,
 'total sulfur dioxide': 999,
 'density': 999,
 'pH': 999,
 'sulphates': 999,
 'alcohol': 999}

#### For Selected Features only

In [21]:
# Feature frame for the selected-features example: df without the target.
X1 = df.drop(columns='quality')

In [22]:
# Target for the selected-features example.
y1 = df.loc[:, 'quality']

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so the previews and null counts shown below stay the same
# between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X1, y1, test_size=0.20, random_state=42
)

In [24]:
# Imputer restricted to an explicit list of columns: the seven features that
# reported missing values in the null-count check on the full dataset.
arb_imputer_1 = ArbitraryNumberImputer(
    arbitrary_number=999,
    variables=[
        'fixed acidity',
        'volatile acidity',
        'citric acid',
        'residual sugar',
        'chlorides',
        'pH',
        'sulphates',
    ],
)

In [25]:
# Fit the column-restricted imputer on the training split only.
arb_imputer_1.fit(Xtrain)

In [26]:
# Fill missing values in the training features; Xtrain is rebound to the
# transformed frame.
Xtrain = arb_imputer_1.transform(Xtrain)

In [27]:
# Preview the training features after imputation.
Xtrain.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
3732,white,7.5,0.28,0.41,1.3,0.044,11.0,126.0,0.99293,3.28,0.45,10.3
4459,white,5.9,0.22,0.18,6.4,0.041,28.0,120.0,0.99403,3.27,0.5,9.9
766,white,6.7,0.5,0.63,13.4,0.078,81.0,238.0,0.9988,3.08,0.44,9.2
2356,white,7.1,0.34,0.32,2.0,0.051,29.0,130.0,0.99354,3.3,0.5,10.4
1076,white,7.5,0.28,0.33,7.7,0.048,42.0,180.0,0.9974,3.37,0.59,10.1


In [28]:
# Apply the imputer fitted on the training split to the test features;
# Xtest is rebound to the transformed frame.
Xtest = arb_imputer_1.transform(Xtest)

In [29]:
# Preview the test features after imputation.
Xtest.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
6029,red,5.9,0.19,0.21,1.7,0.045,57.0,135.0,0.99341,3.32,0.44,9.5
1940,white,8.3,0.36,0.57,15.0,0.052,35.0,256.0,1.0001,2.93,0.64,8.6
4269,white,7.1,0.46,0.23,13.7,0.045,44.0,192.0,0.9981,3.11,0.53,9.4
2350,white,7.9,0.31,0.22,13.3,0.048,46.0,212.0,0.99942,3.47,0.59,10.0
1269,white,8.4,0.35,0.56,13.8,0.048,55.0,190.0,0.9993,3.07,0.58,9.4


In [30]:
# Remaining null counts per column for the train and test splits.
# `sep` supplies the blank line between the two reports. Passing "\n\n" as a
# positional argument (with print's default " " separator) left a stray space
# after the first report and before the second one in the printed output.
print(Xtrain.isnull().sum(), Xtest.isnull().sum(), sep="\n\n")

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64 

 type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64


#### Note:
#### If the variable is categorical, we can replace its missing values with "missing" or "not_defined".