<a href="https://colab.research.google.com/github/Jahnavi-2005-hub/CODSOFT/blob/main/Movie_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor



In [None]:
# Load the movies CSV via a relative path (the file must sit in the current
# working directory). The encoding is given explicitly as ISO-8859-1 (Latin-1).
df = pd.read_csv('IMDb Movies India.csv', encoding='ISO-8859-1')
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,-2021.0,90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,-2010.0,105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Year,Rating
count,14981.0,7919.0
mean,-1987.012215,5.841621
std,25.416689,1.381777
min,-2022.0,1.1
25%,-2009.0,4.9
50%,-1991.0,6.0
75%,-1968.0,6.8
max,-1913.0,10.0


In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Unnamed: 0,0
Name,object
Year,float64
Duration,object
Genre,object
Rating,float64
Votes,object
Director,object
Actor 1,object
Actor 2,object
Actor 3,object


In [None]:
# Count missing values per column in the raw frame.
df.isna().sum()

Unnamed: 0,0
Name,0
Year,528
Duration,8269
Genre,1877
Rating,7590
Votes,7589
Director,525
Actor 1,1617
Actor 2,2384
Actor 3,3144


In [None]:
# Discard every row that has no Rating; the result is rebound to `df`
# instead of mutating the frame in place.
df = df.dropna(subset=['Rating'])

In [None]:
# Re-count missing values per column after dropping rows without a Rating.
df.isna().sum()

Unnamed: 0,0
Name,0
Year,0
Duration,2068
Genre,102
Rating,0
Votes,0
Director,5
Actor 1,125
Actor 2,200
Actor 3,292


In [None]:
# Replace missing Genre entries with the placeholder label 'Unknown'.
df = df.assign(Genre=df['Genre'].fillna('Unknown'))

In [None]:
# Re-count missing values per column after filling Genre.
df.isna().sum()

Unnamed: 0,0
Name,0
Year,0
Duration,2068
Genre,0
Rating,0
Votes,0
Director,5
Actor 1,125
Actor 2,200
Actor 3,292


# Keep only rows in which all three actor columns are present
# (a row is removed if any one of them is missing).
df = df.dropna(subset=['Actor 1', 'Actor 2', 'Actor 3'], how='any')

In [None]:
# Re-count missing values per column after dropping rows with a missing actor.
# NOTE(review): the saved output below still lists missing values in the
# actor columns, so it appears to predate the actor dropna — re-run to refresh.
df.isna().sum()

Unnamed: 0,0
Name,0
Year,0
Duration,2068
Genre,0
Rating,0
Votes,0
Director,5
Actor 1,125
Actor 2,200
Actor 3,292


In [None]:
# Select the rows whose Year is stored as a negative number, report how many
# there are, and show a sample of them.
negative_years = df.loc[df['Year'].lt(0)]
print(f"Number of rows with negative years: {len(negative_years)}")
display(negative_years.head())

Number of rows with negative years: 7919


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,-1997.0,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,-2005.0,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,-2012.0,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [None]:
# Take the absolute value of Year and cast it to integer in one chained step.
df['Year'] = df['Year'].abs().astype(int)
print("Converted negative years to positive and cast to integer.")
display(df.head())

Converted negative years to positive and cast to integer.


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),2019,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,2019,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,1997,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,2005,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,2012,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [None]:
# Confirm the dtype of the Year column after the cast.
print(df['Year'].dtype)

int64


In [None]:
# Strip thousands separators from 'Votes' and convert the column to numeric.
# Casting to str first keeps this cell re-runnable: once the column is already
# numeric, calling the `.str` accessor on it directly raises AttributeError.
df['Votes'] = pd.to_numeric(
    df['Votes'].astype(str).str.replace(',', '', regex=False)
)
print("Converted 'Votes' column to numeric.")
display(df.head())

Converted 'Votes' column to numeric.


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),2019,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,2019,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,1997,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,2005,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,2012,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


In [None]:
# Re-inspect the column dtypes after the Year and Votes conversions.
df.dtypes

Unnamed: 0,0
Name,object
Year,int64
Duration,object
Genre,object
Rating,float64
Votes,int64
Director,object
Actor 1,object
Actor 2,object
Actor 3,object


In [None]:
# Remove every column whose name carries the 'Actor 3_' one-hot prefix.
# NOTE(review): no get_dummies cell precedes this one in notebook order, so on
# a fresh top-to-bottom run the list is empty and the frame is left unchanged.
actor_3_columns = list(filter(lambda name: name.startswith('Actor 3_'), df.columns))
df = df.drop(columns=actor_3_columns)
print("Dropped one-hot encoded 'Actor 3' columns.")
display(df.head())

Dropped one-hot encoded 'Actor 3' columns.


Unnamed: 0,Year,Votes,Name_#Gadhvi (He thought he was Gandhi),Name_#Yaaram,Name_'Ferry',Name_'Sambandh',Name_...Aur Pyaar Ho Gaya,Name_...Yahaan,Name_02-Oct,Name_100 Days,...,Rating_8.8,Rating_8.9,Rating_9.0,Rating_9.1,Rating_9.2,Rating_9.3,Rating_9.4,Rating_9.6,Rating_9.7,Rating_10.0
1,2019,8,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2019,35,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,1997,827,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,2005,1086,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
8,2012,326,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# One-hot encode Director: the column is replaced by one 'Director_<name>'
# indicator column per distinct value.
df = pd.get_dummies(df, columns=['Director'])

In [None]:
# One-hot encode Genre. Each distinct Genre string (including multi-genre
# strings such as "Comedy, Romance") becomes its own 'Genre_<value>' column.
df = pd.get_dummies(df, columns=['Genre'])

In [None]:
# One-hot encode the first-billed actor into 'Actor 1_<name>' indicator columns.
df = pd.get_dummies(df, columns=['Actor 1'])

In [None]:
# One-hot encode the second-billed actor into 'Actor 2_<name>' indicator columns.
df = pd.get_dummies(df, columns=['Actor 2'])

In [None]:
# One-hot encode Rating into 'Rating_<value>' indicator columns. The later
# X/Y cell separates these by their 'Rating_' prefix and uses 'Rating_7.0'
# as the target.
df = pd.get_dummies(df, columns=['Rating'])

In [None]:
# Preview the frame after the one-hot encoding cells above.
df.head()

Unnamed: 0,Year,Votes,Actor 3,Name_#Gadhvi (He thought he was Gandhi),Name_#Yaaram,Name_'Ferry',Name_'Sambandh',Name_...Aur Pyaar Ho Gaya,Name_...Yahaan,Name_02-Oct,...,Rating_8.8,Rating_8.9,Rating_9.0,Rating_9.1,Rating_9.2,Rating_9.3,Rating_9.4,Rating_9.6,Rating_9.7,Rating_10.0
1,2019,8,Arvind Jangid,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2019,35,Siddhant Kapoor,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,1997,827,Shammi Kapoor,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
6,2005,1086,Yashpal Sharma,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
8,2012,326,Kiran Bhatia,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier for the binary target defined further down.
# max_iter is raised above scikit-learn's default of 100 because the saved
# fit output reports that the solver stopped at the iteration limit.
# NOTE(review): the features are unscaled, so the solver may still converge
# slowly; scaling the inputs is the other remedy that message suggests.
model = LogisticRegression(max_iter=1000)

In [None]:

# Build the feature matrix X and the target Y from the encoded frame.

# errors='ignore' keeps the cell re-runnable: on a second run 'Actor 3' is
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Actor 3'], errors='ignore')

# Every 'Rating_<value>' indicator is derived from the rating itself, so all
# of them are removed from the features.
rating_columns = [col for col in df.columns if col.startswith('Rating_')]
X = df.drop(columns=rating_columns)

# Keep only numeric/boolean feature columns. No visible cell encodes 'Name' or
# drops 'Duration', so on a fresh top-to-bottom run they would still be raw
# strings here and the classifier could not be fitted. When every column is
# already numeric or boolean this selection changes nothing.
X = X.select_dtypes(include=['number', 'bool'])

# Binary target: indicator of the rating being exactly 7.0.
Y = df['Rating_7.0']

print("X and Y defined successfully.")

X and Y defined successfully.


In [None]:
# Preview the feature matrix.
X.head()

Unnamed: 0,Year,Votes,Name_#Gadhvi (He thought he was Gandhi),Name_#Yaaram,Name_'Ferry',Name_'Sambandh',Name_...Aur Pyaar Ho Gaya,Name_...Yahaan,Name_02-Oct,Name_100 Days,...,Actor 2_Zeenat Aman,Actor 2_Zeenat Ibrahim,Actor 2_Zeeshan Khan,Actor 2_Zhangke Jia,Actor 2_Zia Mohiyuddin Dagar,Actor 2_Zoa Morani,Actor 2_Zoya Afroz,Actor 2_Zoya Hussain,Actor 2_Zubeida,Actor 2_Zuber K. Khan
1,2019,8,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2019,35,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,1997,827,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,2005,1086,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
8,2012,326,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Display the training feature matrix (pandas truncates the rendering of a
# frame this large, as the saved output shows).
X_train

Unnamed: 0,Year,Votes,Name_#Gadhvi (He thought he was Gandhi),Name_#Yaaram,Name_'Ferry',Name_'Sambandh',Name_...Aur Pyaar Ho Gaya,Name_...Yahaan,Name_02-Oct,Name_100 Days,...,Actor 2_Zeenat Aman,Actor 2_Zeenat Ibrahim,Actor 2_Zeeshan Khan,Actor 2_Zhangke Jia,Actor 2_Zia Mohiyuddin Dagar,Actor 2_Zoa Morani,Actor 2_Zoya Afroz,Actor 2_Zoya Hussain,Actor 2_Zubeida,Actor 2_Zuber K. Khan
7697,2014,68,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13587,2003,136,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7235,2009,6741,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3089,1993,53,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8045,2013,55,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10137,2016,21470,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10449,1970,79,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1527,1984,65,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14938,1998,144,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Train the classifier on the training split.
# NOTE(review): the saved output of this cell contains a solver message that
# the iteration limit was reached, so the printed success line does not by
# itself mean the optimisation converged.
model.fit(X_train, Y_train)
print("Model fitted successfully.")

Model fitted successfully.


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.metrics import classification_report

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Compute the test-set predictions here. `predictions` was otherwise only
# assigned in a later cell, so this cell raised NameError on a fresh
# Restart & Run All.
predictions = model.predict(X_test)

# Y_test is a True/False indicator, so the MSE equals the fraction of
# misclassified test rows.
# NOTE(review): in the saved output R2 equals -p / (1 - p) with p = MSE, which
# is what a model predicting False for every row produces — check the class
# balance before trusting these numbers.
mse = mean_squared_error(Y_test, predictions)
r2 = r2_score(Y_test, predictions)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 0.023148148148148147
R-squared (R2): -0.023696682464454888


In [None]:
import pandas as pd

# Hand-built feature values for a single movie, keyed by training column name.
movie = dict([
    ("Year", 2015),
    ("Votes", 130),
    ("Name_#Gadhvi (He thought he was Gandhi)", 1),
    ("Name_#Yaaram", 0),
    ("Name_'Ferry'", 0),
    ("Name_'Sambandh'", 0),
    ("Name_...Aur Pyaar Ho Gaya", 0),
    ("Actor 1_Rasika Dugal", 1),
    ("Actor 1_Prateik", 0),
    ("Actor 1_Bobby Deol", 0),
    ("Actor 1_Jimmy Sheirgill", 0),
    ("Actor 1_Yash Dave", 0),
])

# One-row frame aligned to the training columns; any training column that is
# not listed above is filled with 0.
movie_df = pd.DataFrame([movie]).reindex(columns=X_train.columns, fill_value=0)

display(movie_df)

Unnamed: 0,Year,Votes,Name_#Gadhvi (He thought he was Gandhi),Name_#Yaaram,Name_'Ferry',Name_'Sambandh',Name_...Aur Pyaar Ho Gaya,Name_...Yahaan,Name_02-Oct,Name_100 Days,...,Actor 2_Zeenat Aman,Actor 2_Zeenat Ibrahim,Actor 2_Zeeshan Khan,Actor 2_Zhangke Jia,Actor 2_Zia Mohiyuddin Dagar,Actor 2_Zoa Morani,Actor 2_Zoya Afroz,Actor 2_Zoya Hussain,Actor 2_Zubeida,Actor 2_Zuber K. Khan
0,2015,130,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Predict the target for every row of the held-out test split.
# NOTE(review): the MSE/R2 cell that reads a variable named `predictions`
# sits earlier in the notebook than this assignment.
predictions = model.predict(X_test)

In [None]:
# Predict the target for the single hand-built movie row; the result is a
# one-element array.
model.predict(movie_df)

array([False])