# MOVIE RATING PREDICTION WITH PYTHON

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

### Loading the dataset

In [2]:
# Read the movies CSV (decoded as latin1) into a DataFrame and preview
# the first rows as a quick sanity check.
df = pd.read_csv(
    filepath_or_buffer="IMDb-Movies-India.csv",
    encoding="latin1",
)
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


### Data Preprocessing

In [3]:
# Count the missing entries in each column of the freshly loaded frame.
df.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [4]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(15509, 10)

In [5]:
# Summary statistics for the 'Year' column (count, distinct values,
# most frequent value and its frequency).
df.loc[:, 'Year'].describe()

count      14981
unique       102
top       (2019)
freq         410
Name: Year, dtype: object

In [6]:
# Drop every row that has a NaN in any column.
# The result is rebound rather than produced with `inplace=True`, so the
# step reads as an explicit transformation; `.copy()` gives an independent
# frame for the column assignments that follow.
df = df.dropna().copy()
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,(2005),142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,(2012),82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


Now confirm that the cleaned dataset no longer contains any rows with NaN values.

In [7]:
# Re-count missing entries per column after the cleaning step.
df.isna().sum()

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

### Label encoding

In [8]:
# Replace each categorical text column with integer codes.
# A single LabelEncoder is refitted for every column in turn, so after the
# loop `lb` holds the mapping of the last column only ('Actor 3').
lb = LabelEncoder()
for col in ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    df[col] = lb.fit_transform(df[col])

In [9]:
# Target: the movie rating. Features: the remaining columns once the
# title, the target itself, and Year/Duration/Votes have been removed.
y = df['Rating']
X = df.drop(columns=['Name', 'Rating', 'Year', 'Duration', 'Votes'])

### Train Test Split

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### hyperparameter tuning and best model selection

In [11]:
# Candidate regressors and the hyperparameter grid to search for each.
# Every entry maps a display name to an unfitted estimator ('model') and
# the parameter grid handed to GridSearchCV ('params').
model_params = {
    'svm': {
        'model': SVR(),
        'params': {'C': [1, 10, 20], 'kernel': ['linear', 'rbf']}
    },
    'Random_forest': {
        # Seeded so repeated runs build the same forests.
        'model': RandomForestRegressor(random_state=42),
        'params': {'n_estimators': [1, 5, 10, 20, 40]}
    },
    'K_neighbors': {
        'model': KNeighborsRegressor(),
        'params': {'n_neighbors': [5, 10, 15]}
    },
    'Decision_tree': {
        # Seeded so repeated runs build the same tree.
        'model': DecisionTreeRegressor(random_state=42),
        # The legacy names 'mse'/'mae' were deprecated in scikit-learn 1.0 and
        # removed in 1.2; these are their replacements (require sklearn >= 1.0).
        'params': {'criterion': ['squared_error', 'absolute_error']}
    }
}

### Model training and evaluation

In [None]:
# Tune each candidate with a 3-fold grid search on the training split, then
# score the refitted best estimator once on the held-out test split.
# Both the cross-validated MSE and the test MSE are recorded: compare models
# on 'cv_mse' and treat 'mse' as the final, untouched estimate.
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(
        mp['model'],
        mp['params'],
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,  # run the grid's fits in parallel on all available cores
    )
    clf.fit(X_train, y_train)

    # predict() uses the best estimator refitted on the full training split.
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    scores.append({
        'model': model_name,
        'mse': mse,
        # The scorer is the negated MSE, so flip the sign back.
        'cv_mse': -clf.best_score_,
        'best_params': clf.best_params_
    })

df_score = pd.DataFrame(scores, columns=['model', 'mse', 'cv_mse', 'best_params'])
df_score