In [1]:
## Objective
# Prepare the movie dataset for analysis by:
# - Handling missing values
# - Fixing data types
# - Cleaning the rating, votes, and timing columns
# - Saving the clean dataset


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the raw movie listing into a DataFrame and preview the first rows.
raw_csv_path = "../data/indian movies.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,ID,Movie Name,Year,Timing(min),Rating(10),Votes,Genre,Language
0,tt0398974,Dr. Shaitan,1960,-,-,-,-,hindi
1,tt1702558,Nadir Khan,1968,-,-,-,-,urdu
2,tt0493437,Apna Sapna Money Money,2006,134 min,5.3,1892,"Comedy, Musical, Romance",hindi
3,tt0273405,Aag Aur Sholay,1987,-,2.2,20,-,urdu
4,tt0049595,Parivar,1956,-,7.4,21,"Comedy, Drama, Family",hindi


In [4]:
# Print the per-column dtype / non-null summary, then show the frame's
# (rows, columns) shape as the cell output.
df.info()
row_count, column_count = df.shape
(row_count, column_count)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50602 entries, 0 to 50601
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           50602 non-null  object
 1   Movie Name   50602 non-null  object
 2   Year         49041 non-null  object
 3   Timing(min)  50602 non-null  object
 4   Rating(10)   50602 non-null  object
 5   Votes        50602 non-null  object
 6   Genre        50602 non-null  object
 7   Language     50602 non-null  object
dtypes: object(8)
memory usage: 3.1+ MB


(50602, 8)

In [5]:
# The raw file marks absent values with a lone "-"; turn every such cell
# into NaN so pandas treats it as missing.
df = df.replace(to_replace="-", value=np.nan)

In [6]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

ID              2508
Movie Name         0
Year            1561
Timing(min)    25892
Rating(10)     26926
Votes          26925
Genre           8581
Language           0
dtype: int64

In [7]:
# Parse the rating column as numbers; anything that cannot be parsed
# becomes NaN (errors='coerce').
rating_column = 'Rating(10)'
df[rating_column] = pd.to_numeric(df[rating_column], errors='coerce')

In [8]:
# Cleaning Votes Column
# Vote counts arrive as text with thousands separators (e.g. "1,892").
# - astype(str) keeps this cell safe to re-run: once the column is numeric
#   the .str accessor would otherwise raise AttributeError.
# - regex=False pins the comma to a literal match regardless of the pandas
#   version's default for `regex`.
# - errors='coerce' turns anything unparseable (including the "nan" text
#   produced from missing values) into NaN.
votes_text = df['Votes'].astype(str).str.replace(',', '', regex=False)
df['Votes'] = pd.to_numeric(votes_text, errors='coerce')

In [9]:
# Cleaning Timing
# Runtimes arrive as text such as "134 min"; strip the unit and parse the rest.
# - astype(str) keeps this cell safe to re-run: once the column is numeric
#   the .str accessor would otherwise raise AttributeError.
# - regex=False makes the " min" suffix an explicit literal match.
# - errors='coerce' turns anything unparseable (including the "nan" text
#   produced from missing values) into NaN.
timing_text = df['Timing(min)'].astype(str).str.replace(' min', '', regex=False)
df['Timing(min)'] = pd.to_numeric(timing_text, errors='coerce')

In [10]:
# Parse the release year as a number; values that cannot be parsed
# become NaN (errors='coerce').
year_as_number = pd.to_numeric(df['Year'], errors='coerce')
df['Year'] = year_as_number

In [11]:
# Keep only rows that have both a rating and a vote count; a row is
# discarded if either of the two is missing.
required_columns = ['Rating(10)', 'Votes']
df = df.dropna(axis='index', how='any', subset=required_columns)

In [12]:
# Write the cleaned frame to CSV (relative to the working directory),
# leaving out the row index.
output_csv_path = "clean_movies.csv"
df.to_csv(output_csv_path, index=False)