In [22]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score,mean_absolute_error,r2_score,mean_squared_error
from sklearn.preprocessing import StandardScaler



In [23]:
df = pd.read_csv(r"K:\\Coding\\movie_rating\\IMDb Movies India.csv",encoding='latin')


In [24]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [25]:
df.shape

(15509, 10)

In [26]:
df.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [27]:
df.isnull().sum().sum()

33523

In [28]:
df.dropna(inplace=True)


In [29]:
df.isnull().sum().sum()

0

In [30]:
df.shape

(5659, 10)

In [31]:
df['Year']=df['Year'].str.extract(r'([0-9].{0,3})',expand=False)

##[0-9]: Matches any single digit from 0 to 9.
##.{0,3}: Matches any character (except newline) between 0 and 3 times. 
## Combined with [0-9], this pattern matches a digit followed by up to three additional characters.

In [32]:
'''
[0-9]: Matches any single digit from 0 to 9.
+: Matches one or more of the preceding element (i.e., [0-9]), meaning it will match sequences of one or more digits.
'''

df['Duration'] = df['Duration'].str.extract(r'([0-9]+)', expand=False)

In [33]:
df['Duration']=pd.to_numeric(df['Duration'])
df['Year']=pd.to_numeric(df['Year'])

In [34]:
df['Votes']=df['Votes'].str.extract(r'([0-9]+)',expand=False)
df['Votes']=pd.to_numeric(df['Votes'])


In [36]:
# Define feature and target variables
X = df[['Year', 'Actor 1', 'Actor 2', 'Duration', 'Genre', 'Votes', 'Director']]
y = df['Rating']
print(X)

# Preprocessing for categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Actor 1', 'Actor 2', 'Genre', 'Director']),
        ('num', Pipeline(steps=[
            ('scaler', StandardScaler())
        ]), ['Year', 'Duration', 'Votes'])
    ])

# Define the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])



       Year          Actor 1                 Actor 2  Duration  \
1      2019     Rasika Dugal          Vivek Ghamande       109   
3      2019          Prateik              Ishita Raj       110   
5      1997       Bobby Deol  Aishwarya Rai Bachchan       147   
6      2005  Jimmy Sheirgill          Minissha Lamba       142   
8      2012        Yash Dave          Muntazir Ahmad        82   
...     ...              ...                     ...       ...   
15493  2015    Vicky Kaushal         Sarah Jane Dias       115   
15494  2001   Karisma Kapoor                   Rekha       153   
15503  1989      Chiranjeevi              Jayamalini       125   
15505  1999     Akshay Kumar          Twinkle Khanna       129   
15508  1998       Dharmendra              Jaya Prada       130   

                           Genre  Votes         Director  
1                          Drama      8    Gaurav Bakshi  
3                Comedy, Romance     35       Ovais Khan  
5         Comedy, Drama, Music

In [37]:
# Split the data
df.head()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

#Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 4.693573540345248


In [40]:

new_movie = pd.DataFrame({
    'Year': [2019],
    'Actor 1': ['Prateik'],
    'Actor 2': ['Ishita Raj'],
    'Duration': [110],
    'Genre': ['Comedy, Romance'],
    'Votes': [35],
    'Director': ['Ovais Khan']
})
		
# Predict the rating
predicted_rating = model.predict(new_movie)
print(f'Predicted Rating: {predicted_rating[0]}')

Predicted Rating: 4.400114108100651


In [41]:
import joblib

# Save the trained model to a file
joblib_file = "movie_rating_prediction.joblib"
joblib.dump(model, joblib_file)


['movie_rating_prediction.joblib']