## <b>Data Cleaning</b>

In [1]:
#importing library
import pandas as pd

In [2]:
# Read the movie CSV (path is relative to the notebook's working directory)
# into a DataFrame, then preview the first rows to check that it parsed.
df_imdb = pd.read_csv('data/imdb_movie_cleanup.csv')
df_imdb.head()

Unnamed: 0,Movie title,director name,Duration,title Year
0,Avatar,James Cameron,178.0,2009.0
1,Pirates of the Caribbean: At World's End,Gore Verbinski,169.0,2007.0
2,Spectre,Sam Mendes,148.0,2015.0
3,The Dark Knight Rises,Christopher Nolan,164.0,2012.0
4,Star Wars: Episode VII - The Force Awakens ...,Doug Walker,,


In [3]:
# Normalise the column names: lower-case each one and swap spaces for underscores
df_imdb.columns = [col.lower().replace(' ', '_') for col in df_imdb.columns]
df_imdb.head()

Unnamed: 0,movie_title,director_name,duration,title_year
0,Avatar,James Cameron,178.0,2009.0
1,Pirates of the Caribbean: At World's End,Gore Verbinski,169.0,2007.0
2,Spectre,Sam Mendes,148.0,2015.0
3,The Dark Knight Rises,Christopher Nolan,164.0,2012.0
4,Star Wars: Episode VII - The Force Awakens ...,Doug Walker,,


In [4]:
# Columns to remove from the frame
drop_columns = ['director_name']
df_imdb.drop(columns=drop_columns, inplace=True)
df_imdb.head()

Unnamed: 0,movie_title,duration,title_year
0,Avatar,178.0,2009.0
1,Pirates of the Caribbean: At World's End,169.0,2007.0
2,Spectre,148.0,2015.0
3,The Dark Knight Rises,164.0,2012.0
4,Star Wars: Episode VII - The Force Awakens ...,,


In [5]:
# Boolean mask of the whole frame: True wherever a value is missing
df_imdb.isna()

Unnamed: 0,movie_title,duration,title_year
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,True,True
...,...,...,...
5038,False,False,False
5039,False,False,True
5040,False,False,False
5041,False,False,False


In [6]:
# Number of missing values per column
df_imdb.isna().sum()

movie_title      0
duration        15
title_year     108
dtype: int64

In [7]:
# Keep only the rows that have a duration, then re-count the remaining nulls
df_imdb = df_imdb.dropna(subset=['duration'])
df_imdb.isnull().sum()

movie_title      0
duration         0
title_year     105
dtype: int64

In [8]:
# Fill the remaining nulls with the mean of their own column.
# numeric_only=True restricts the mean to numeric columns; without it the text
# column movie_title takes part in the reduction, which raises TypeError on
# pandas >= 2.0.
# The result is reassigned rather than filled in place, because df_imdb is the
# product of an earlier row filter and in-place edits on such frames are fragile.
# NOTE(review): the mean of title_year is generally not a whole year; a later
# cell truncates it with int(). Confirm that mean imputation is what is wanted.
df_imdb = df_imdb.fillna(df_imdb.mean(numeric_only=True))
df_imdb.isnull().sum()

movie_title    0
duration       0
title_year     0
dtype: int64

In [9]:
# Flag every row that repeats an earlier row (the first occurrence is not flagged)
df_imdb.duplicated(keep='first')

0       False
1       False
2       False
3       False
5       False
        ...  
5038    False
5039    False
5040    False
5041    False
5042    False
Length: 5028, dtype: bool

In [10]:
# Count of rows flagged as duplicates
df_imdb.duplicated(keep='first').sum()

124

In [11]:
# Remove duplicate rows with drop_duplicates(), which keeps the first occurrence
# by default, then confirm that none remain
df_imdb.drop_duplicates(inplace=True)
df_imdb.duplicated().sum()

0

In [12]:
# Preview the year column (head() shows 5 rows by default)
df_imdb['title_year'].head()

0    2009.0
1    2007.0
2    2015.0
3    2012.0
5    2012.0
Name: title_year, dtype: float64

In [13]:
# Modifying column data

import math
def cleanup_title_year(ty):
    """Convert a year value to ``int``, passing NaN through as NaN.

    Parameters
    ----------
    ty : float
        Year value; may be NaN.

    Returns
    -------
    int or float
        ``int(ty)`` (truncated toward zero) for a real number, ``math.nan``
        when ``ty`` is NaN.
    """
    # int() cannot convert NaN, so NaN is returned as-is instead.
    return math.nan if math.isnan(ty) else int(ty)

In [14]:
# Run the year clean-up over every value in the column and preview the result
df_imdb['title_year'] = df_imdb['title_year'].apply(cleanup_title_year)
df_imdb['title_year'].head()

0    2009
1    2007
2    2015
3    2012
5    2012
Name: title_year, dtype: int64