# Movie Rating Prediction with Python

# Data Collection

In [49]:
import pandas as pd

# Load the dataset
file_path = 'moviesdata.csv'
movie_data = pd.read_csv(file_path, encoding="latin1")

# Display the first few rows of the dataset
movie_data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,-2021.0,90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,-2010.0,105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


# Data Preprocessing

In [50]:
movie_data.dropna(subset=["Rating"], inplace = True)
movie_data.isnull().sum()

movie_data.dropna(subset=['Actor 1','Actor 2','Actor 3','Director','Genre'],inplace=True)
movie_data.isnull().sum()

movie_data.head()

# convert votes columns



Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,-1997.0,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,-2005.0,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,-2012.0,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [51]:

from sklearn.preprocessing import StandardScaler


# Clean 'Votes' column
movie_data['Votes'] = movie_data['Votes'].astype(str).str.replace(',', '', regex=False)
movie_data['Votes'] = pd.to_numeric(movie_data['Votes'], errors='coerce')

# Clean 'Duration' column
movie_data['Duration'] = movie_data['Duration'].astype(str).str.replace('min', '', regex=False)
movie_data['Duration'] = pd.to_numeric(movie_data['Duration'], errors='coerce')
movie_data['Duration'].fillna(movie_data['Duration'].median(), inplace=True)

# One-Hot Encoding for categorical variables
movie_data = pd.get_dummies(movie_data, columns=['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], drop_first=True)

# Normalize numerical features
scaler = StandardScaler()
movie_data[['Duration', 'Votes']] = scaler.fit_transform(movie_data[['Duration', 'Votes']])

# Check for missing values again
print("\nMissing values after preprocessing:")
print(movie_data.isnull().sum())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movie_data['Duration'].fillna(movie_data['Duration'].median(), inplace=True)



Missing values after preprocessing:
Name                      0
Year                      0
Duration                  0
Rating                    0
Votes                     0
                         ..
Actor 3_Zeishan Quadri    0
Actor 3_Zenobia Shroff    0
Actor 3_Zohra             0
Actor 3_Zoya Hussain      0
Actor 3_Zulfi Sayed       0
Length: 11601, dtype: int64


Name                      0
Year                      0
Duration                  0
Rating                    0
Votes                     0
                         ..
Actor 3_Zeishan Quadri    0
Actor 3_Zenobia Shroff    0
Actor 3_Zohra             0
Actor 3_Zoya Hussain      0
Actor 3_Zulfi Sayed       0
Length: 11601, dtype: int64