In [1]:
# Frequent Category Imputation
# This technique replaces each missing value with the most frequent value of its column, 
# i.e. the mode of that column. It is also referred to as Mode Imputation.
#    • Assumptions:-
#        ◦ Data is missing at random.
#        ◦ There is a high probability that the missing data looks like the majority of the data.
#    • Advantages:-
#        ◦ Implementation is easy.
#        ◦ We can obtain a complete dataset in very little time.
#        ◦ We can use this technique in the production model.
#    • Disadvantages:-
#        ◦ The higher the percentage of missing values, the higher will be the distortion.
#        ◦ May lead to over-representation of a particular category.
#        ◦ Can distort original variable distribution.
#    • When to Use:-
#        ◦ Data is Missing at Random(MAR)
#        ◦ Missing data is not more than 5% – 6% of the dataset.

#### Import Required Libraries

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

#### Loading Data Set

In [3]:
# Location of the input CSV. The path is relative to the notebook's working
# directory so the notebook runs on any machine instead of depending on one
# user's absolute Windows path; change DATA_DIR if the file lives elsewhere.
DATA_DIR = Path(".")
DATA_FILE = DATA_DIR / "StudentsPerformance_mv.csv"

# Read the students-performance data set (with missing values) into a DataFrame.
df = pd.read_csv(DATA_FILE)

#### Display First 5 records of data

In [4]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


#### Checking the shape of the dataset, i.e. how many rows and columns it has.

In [5]:
# Tuple of (number of rows, number of columns) in the DataFrame.
df.shape

(1000, 8)

#### Let's check Data set info

In [6]:
# Prints each column's non-null count and dtype; a non-null count below the
# number of rows means that column contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               989 non-null    object
 2   parental level of education  979 non-null    object
 3   lunch                        988 non-null    object
 4   test preparation course      996 non-null    object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


#### Checking for null values and counting the total number of nulls in each column

In [7]:
# Count the missing entries in every column (isna is the same check as isnull).
df.isna().sum()

gender                          0
race/ethnicity                 11
parental level of education    21
lunch                          12
test preparation course         4
math score                      0
reading score                   0
writing score                   0
dtype: int64

### Let's find the unique values of each categorical column, how many times each one occurs, and which one is the most frequent.

In [8]:
# Distinct values observed in 'race/ethnicity' (NaN is listed as its own entry).

pd.unique(df['race/ethnicity'])

array(['group B', 'group C', 'group A', 'group D', nan, 'group E'],
      dtype=object)

In [9]:
# Number of rows per category, most common first; missing values are not counted.

df['race/ethnicity'].value_counts(dropna=True)

group C    315
group D    261
group B    187
group E    138
group A     88
Name: race/ethnicity, dtype: int64

In [10]:
# Most frequent category (the mode) of 'race/ethnicity', ignoring missing values.

df['race/ethnicity'].mode(dropna=True)

0    group C
dtype: object

In [11]:
# Distinct values observed in 'parental level of education' (NaN is listed as its own entry).

pd.unique(df['parental level of education'])

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school', nan],
      dtype=object)

In [12]:
# Number of rows per category, most common first; missing values are not counted.

df['parental level of education'].value_counts(dropna=True)

associate's degree    219
some college          218
high school           193
some high school      174
bachelor's degree     116
master's degree        59
Name: parental level of education, dtype: int64

In [13]:
# Most frequent category (the mode) of 'parental level of education', ignoring missing values.

df['parental level of education'].mode(dropna=True)

0    associate's degree
dtype: object

In [14]:
# Distinct values observed in 'lunch' (NaN is listed as its own entry).

pd.unique(df['lunch'])

array(['standard', 'free/reduced', nan], dtype=object)

In [15]:
# Number of rows per category, most common first; missing values are not counted.

df['lunch'].value_counts(dropna=True)

standard        637
free/reduced    351
Name: lunch, dtype: int64

In [16]:
# Most frequent category (the mode) of 'lunch', ignoring missing values.

df['lunch'].mode(dropna=True)

0    standard
dtype: object

In [17]:
# Distinct values observed in 'test preparation course' (NaN is listed as its own entry).

pd.unique(df['test preparation course'])

array(['none', 'completed', nan], dtype=object)

In [18]:
# Number of rows per category, most common first; missing values are not counted.

df['test preparation course'].value_counts(dropna=True)

none         639
completed    357
Name: test preparation course, dtype: int64

In [19]:
# Most frequent category (the mode) of 'test preparation course', ignoring missing values.

df['test preparation course'].mode(dropna=True)

0    none
dtype: object

### Replace the Missing Values by Most Frequent category using fillna() Function.

In [20]:
# Mode (most-frequent-category) imputation: every missing entry in each listed
# categorical column is replaced by that column's most frequent value.
# The columns are handled in one loop instead of one copy-pasted statement per
# column, so adding or removing a column is a one-line change.
columns_to_impute = [
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

for column in columns_to_impute:
    # mode() ignores NaN and returns the most frequent value(s) in sorted order;
    # on a tie the first entry is used, exactly as mode()[0] did.
    most_frequent = df[column].mode()

    # A column with no observed values has an empty mode; there is nothing to
    # impute from, so leave it untouched rather than fail on an empty lookup.
    if most_frequent.empty:
        continue

    df[column] = df[column].fillna(value=most_frequent.iloc[0])

In [21]:
# Re-count missing entries per column to confirm the imputation left none behind.
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64