# Dealing with missing values

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Data/movie_scores.csv')
df.head()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [3]:
# Total number of missing values in each column
df.isnull().sum()

first_name          1
last_name           1
age                 1
sex                 1
pre_movie_score     2
post_movie_score    2
dtype: int64

We can fill the NAN values with zero, but unfortunately, that is not a good approach as zero won't be logical in many cases, like a person's age.

In [4]:
df_zero = df.copy()
df_zero = df_zero.fillna(0)
df_zero.isnull().sum()

first_name          0
last_name           0
age                 0
sex                 0
pre_movie_score     0
post_movie_score    0
dtype: int64

### We can fill the NAN values with the mean, median and mode of the column.

In [5]:
df_mean = df.copy()

df_mean['pre_movie_score'].fillna(df_mean['pre_movie_score'].mean(), inplace=True)
df_mean['post_movie_score'].fillna(df_mean['post_movie_score'].mean(), inplace=True)

df_mean

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,7.0,9.0
2,Hugh,Jackman,51.0,m,7.0,9.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [6]:
df_mode = df.copy()

df_mode['pre_movie_score'].fillna(df_mode['pre_movie_score'].mode(), inplace=True)
df_mode['post_movie_score'].fillna(df_mode['post_movie_score'].mode(), inplace=True)

df_mode

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,7.0,9.0
2,Hugh,Jackman,51.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [7]:
df_median = df.copy()

df_median['pre_movie_score'].fillna(df_median['pre_movie_score'].median(), inplace=True)
df_median['post_movie_score'].fillna(df_median['post_movie_score'].median(), inplace=True)

df_median

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,7.0,9.0
2,Hugh,Jackman,51.0,m,7.0,9.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


### We can also use Interpolation to fill missing values; we usually use those techniques when the previous ways do not fit our needs.

The first one we will try is the Linear Inerpolation, as the name suggests it replace the missing value by checking the other values in a single line increasingly.

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html

In [8]:
df_linear = df.copy()

df_linear = df_linear.interpolate(method="linear")

df_linear.isnull().sum()

first_name          1
last_name           1
age                 0
sex                 1
pre_movie_score     0
post_movie_score    0
dtype: int64

In [9]:
df_linear

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,57.0,,7.333333,9.333333
2,Hugh,Jackman,51.0,m,6.666667,8.666667
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


You can also decide the direction, Forward and Backward

In [10]:
forward = df.copy()
backward = df.copy()
 
backward = backward.interpolate(method ='linear', limit_direction ='backward')
#  or
forward = forward.interpolate(method ='linear', limit_direction ='forward')

In [11]:
forward

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,57.0,,7.333333,9.333333
2,Hugh,Jackman,51.0,m,6.666667,8.666667
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [12]:
backward

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,57.0,,7.333333,9.333333
2,Hugh,Jackman,51.0,m,6.666667,8.666667
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


### The next Interpolation is the Polynomial Interpolation, which fills the NAN with the lowest curve.

you need to specify that order=2 otherwise it will just be linear.

In [13]:
poly = df.copy()
poly = poly.interpolate(method="polynomial", order=2)
poly

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,46.631579,,6.5,8.5
2,Hugh,Jackman,51.0,m,5.833333,7.833333
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


### Fill nan values with rolling mean

In [14]:
df_rm = df.copy()
df_rm = df_rm.fillna(df_rm.rolling(6, min_periods=1).mean())
df_rm

  df_rm = df_rm.fillna(df_rm.rolling(6, min_periods=1).mean())


Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,63.0,,8.0,10.0
2,Hugh,Jackman,51.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


### Using KNN to fill missing values

Nearest neighbors imputation fills the nan values based on the values from the nearest neighbors by comparing them to the none missing values of their neighbors and filling it with their average, you can choose the number of neigbors.


https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html

https://scikit-learn.org/stable/modules/impute.html

https://towardsdatascience.com/the-use-of-knn-for-missing-values-cf33d935c637

In [16]:
from sklearn.impute import KNNImputer
df_knn = df.copy(deep=True)

knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")
df_knn['pre_movie_score'] = knn_imputer.fit_transform(df_knn[['pre_movie_score']])
df_knn['post_movie_score'] = knn_imputer.fit_transform(df_knn[['post_movie_score']])

In [17]:
df_knn

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,7.0,9.0
2,Hugh,Jackman,51.0,m,7.0,9.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0
