**IMPORTING THE LIBRARIES**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import scipy.stats as stats
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
import warnings

warnings.filterwarnings('ignore')
plt.rcParams["figure.figsize"] = [10,5]

import warnings
# To Ignore Warning
warnings.simplefilter(action = "ignore", category = FutureWarning)

**LOADING THE DATA**

In [None]:
df=pd.read_csv("/content/IMDb Movies India.csv",encoding="latin1")
df.head(5)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


**UNDERSTANDING THE DATA**

In [None]:
df.shape

(15509, 10)

In [None]:
df.isnull().sum()

Unnamed: 0,0
Name,0
Year,528
Duration,8269
Genre,1877
Rating,7590
Votes,7589
Director,525
Actor 1,1617
Actor 2,2384
Actor 3,3144


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [None]:
# Finding all the rows with missing values

nulls = df[df.iloc[:, 1:9].isna().all(axis=1)]
nulls.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1836,Bang Bang Reloaded,,,,,,,,,
1920,Battle of bittora,,,,,,,,,
2653,Campus,,,,,,,,,
3403,Dancing Dad,,,,,,,,,
3807,Dial 100,,,,,,,,,


In [None]:
#Checking if there are any typos errrs

for col in df.select_dtypes(include = "object"):
    print(f"Name of the column that are available: {col}")
    print(df[col].unique())
    print('\n', '-'*60, '\n')

Name of the column that are available: Name
[' ' '#Gadhvi (He thought he was Gandhi)' '#Homecoming' ... 'Zulmi Raj'
 'Zulmi Shikari' 'Zulm-O-Sitam']

 ------------------------------------------------------------ 

Name of the column that are available: Year
[nan '(2019)' '(2021)' '(2010)' '(1997)' '(2005)' '(2008)' '(2012)'
 '(2014)' '(2004)' '(2016)' '(1991)' '(1990)' '(2018)' '(1987)' '(1948)'
 '(1958)' '(2017)' '(2020)' '(2009)' '(2002)' '(1993)' '(1946)' '(1994)'
 '(2007)' '(2013)' '(2003)' '(1998)' '(1979)' '(1951)' '(1956)' '(1974)'
 '(2015)' '(2006)' '(1981)' '(1985)' '(2011)' '(2001)' '(1967)' '(1988)'
 '(1995)' '(1959)' '(1996)' '(1970)' '(1976)' '(2000)' '(1999)' '(1973)'
 '(1968)' '(1943)' '(1953)' '(1986)' '(1983)' '(1989)' '(1982)' '(1977)'
 '(1957)' '(1950)' '(1992)' '(1969)' '(1975)' '(1947)' '(1972)' '(1971)'
 '(1935)' '(1978)' '(1960)' '(1944)' '(1963)' '(1940)' '(1984)' '(1934)'
 '(1955)' '(1936)' '(1980)' '(1966)' '(1949)' '(1962)' '(1964)' '(1952)'
 '(1933)' '(1942)

**HANDLING THE NULL VALUES**

In [None]:
# Drop null values
df.dropna(subset=['Name', 'Year', 'Duration', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], inplace=True)

In [None]:
# Replacing the brackets
df['Name'] = df['Name'].str.extract('([A-Za-z\s\'\-]+)')

In [None]:
df['Duration'] = pd.to_numeric(df['Duration'].str.replace(r' min', '', regex=True), errors='coerce')

In [None]:
# Splitting the genre by , to keep only unique genres and replacing the null values with mode

df['Genre'] = df['Genre'].str.split(', ')
df = df.explode('Genre')
df['Genre'].fillna(df['Genre'].mode()[0], inplace=True)

In [None]:
# Convert Votes column to numeric

df['Votes'] = pd.to_numeric(df['Votes'].str.replace(',', ''), errors='coerce')

In [None]:
df.head(5)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,Gadhvi,(2019),109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,Yaaram,(2019),110,Comedy,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
3,Yaaram,(2019),110,Romance,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,Aur Pyaar Ho Gaya,(1997),147,Comedy,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
5,Aur Pyaar Ho Gaya,(1997),147,Drama,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor


In [None]:
# Checking duplicate values

duplicate = df.groupby(['Name', 'Year']).filter(lambda x: len(x) > 1)
duplicate.head(5)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
3,Yaaram,(2019),110,Comedy,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
3,Yaaram,(2019),110,Romance,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,Aur Pyaar Ho Gaya,(1997),147,Comedy,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
5,Aur Pyaar Ho Gaya,(1997),147,Drama,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
5,Aur Pyaar Ho Gaya,(1997),147,Musical,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor


In [None]:
# Dropping the duplicated values by Name

df=df.drop_duplicates(subset=["Name"], keep=False)

**EXPLORATORY DATA ANALYSIS**

In [None]:
df.describe()

Unnamed: 0,Duration,Rating,Votes
count,1528.0,1528.0,1528.0
mean,123.823953,5.976243,552.479712
std,25.108144,1.412547,4311.631841
min,45.0,1.6,5.0
25%,107.0,5.1,14.0
50%,126.0,6.1,34.0
75%,140.0,7.0,127.25
max,300.0,9.4,101014.0


In [None]:
df.describe(include = 'O')

Unnamed: 0,Name,Year,Genre,Director,Actor 1,Actor 2,Actor 3
count,1528,1528,1528,1528,1528,1528,1528
unique,1528,90,20,1114,1010,1131,1154
top,Gadhvi,(2017),Drama,Kanti Shah,Mithun Chakraborty,Mithun Chakraborty,Pran
freq,1,102,789,13,22,12,16


In [None]:
# INFORMATION ABOUT THE VOTES:

# Find the row with the highest number of votes
max_votes_row = df[df['Votes'] == df['Votes'].max()]

# Get the name of the movie with the highest votes
movie_highest_votes = max_votes_row['Name'].values[0]

# Find the number of votes for the movie with the highest votes
votes_highest_votes = max_votes_row['Votes'].values[0]

print("Movie with the highest votes:", movie_highest_votes)
print("Number of votes for the movie with the highest votes:", votes_highest_votes)
print('\n', '='*100, '\n')


# Find the row with the lowest number of votes
min_votes_row = df[df['Votes'] == df['Votes'].min()]

# Get the name of the movie with the lowest votes
movie_lowest_votes = min_votes_row['Name'].values[0]

# Find the number of votes for the movie with the lowest votes
votes_lowest_votes = min_votes_row['Votes'].values[0]

print("Movie with the highest votes:", movie_lowest_votes)
print("Number of votes for the movie with the highest votes:", votes_lowest_votes)


Movie with the highest votes: My Name Is Khan
Number of votes for the movie with the highest votes: 101014


Movie with the highest votes: Anmol Sitaare
Number of votes for the movie with the highest votes: 5


In [None]:
# INFORMATION ABOUT THE RATING:

# Find the row with the highest rating
max_rating_row = df[df['Rating'] == df['Rating'].max()]
movie_highest_rating = max_rating_row['Name'].values[0]
votes_highest_rating = max_rating_row['Votes'].values[0]

print("Movie with the highest rating:", movie_highest_rating)
print("Number of votes for the movie with the highest rating:", votes_highest_rating)
print('\n', '='*100, '\n')


# Find the row with the lowest rating
min_rating_row = df[df['Rating'] == df['Rating'].min()]
movie_lowest_rating = min_rating_row['Name'].values[0]
votes_lowest_rating = min_rating_row['Votes'].values[0]

print("Movie with the highest rating:", movie_lowest_rating)
print("Number of votes for the movie with the highest rating:", votes_lowest_rating)

Movie with the highest rating: June
Number of votes for the movie with the highest rating: 18


Movie with the highest rating: Mumbai Can Dance Saalaa
Number of votes for the movie with the highest rating: 43


In [None]:
# INFORMATION ABOUT THE DIRECTOR:

# Group the dataset by the 'Director' column and count the number of movies each director has directed
director_counts = df['Director'].value_counts()

# Find the director with the highest number of movies directed
most_prolific_director = director_counts.idxmax()
num_movies_directed = director_counts.max()

print("Director with the most movies directed:", most_prolific_director)
print("Number of movies directed by", most_prolific_director, ":", num_movies_directed)
print('\n', '='*100, '\n')


# Group the dataset by the 'Director' column and count the number of movies each director has directed
director_counts = df['Director'].value_counts()

# Find the director with the lowest number of movies directed
least_prolific_director = director_counts.idxmin()
num_movies_directed = director_counts.min()

print("Director with the most movies directed:", least_prolific_director)
print("Number of movies directed by", most_prolific_director, ":", num_movies_directed)

Director with the most movies directed: Kanti Shah
Number of movies directed by Kanti Shah : 13


Director with the most movies directed: Sikandar Khanna
Number of movies directed by Kanti Shah : 1


**DATA VISUALIZATION**

In [None]:
# DISTRIBUTION OF YEAR:

colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#4a95c9', '#74b3de', '#9ed2f3', '#bcbd22', '#17becf']

# Create histogram plot using Plotly Express
fig_year = px.histogram(df, x='Year', histnorm='probability density', nbins=30, color_discrete_sequence=colors)
fig_year.update_traces(selector=dict(type='histogram'))
fig_year.update_layout(
    title='Distribution of Year',
    title_x=0.5,
    title_pad=dict(t=20),
    title_font=dict(size=20),
    xaxis_title='Year',
    yaxis_title='Probability Density',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    bargap=0.02,
    plot_bgcolor='dark blue'
)

In [None]:
# DISTRIBUTION OF TIME DURATION:

fig_duration = px.histogram(df, x = 'Duration', histnorm='probability density', nbins = 40, color_discrete_sequence = colors)
fig_duration.update_traces(selector=dict(type='histogram'))
fig_duration.update_layout(title='Distribution of Duration', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Duration', yaxis_title='Probability Density', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), bargap=0.02, plot_bgcolor = 'dark blue')
fig_duration.show()

In [None]:
# DISTRIBUTION OF RATING:

fig_rating = px.histogram(df, x = 'Rating', histnorm='probability density', nbins = 40, color_discrete_sequence = colors)
fig_rating.update_traces(selector=dict(type='histogram'))
fig_rating.update_layout(title='Distribution of Rating', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Rating', yaxis_title='Probability Density', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), bargap=0.02, plot_bgcolor = 'dark blue')
fig_rating.show()

In [None]:
# DISTRIBUTION OF VOTES:

fig_votes = px.box(df, x = 'Votes', color_discrete_sequence = colors)
fig_votes.update_layout(title='Distribution of Votes', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Votes', yaxis_title='Probability Density', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor = 'dark blue')
fig_votes.show()

In [None]:
# Top 20 Years Average Rating:

year_avg_rating = df.groupby('Year')['Rating'].mean().reset_index()

top_5_years = year_avg_rating.nlargest(20, 'Rating')
fig = px.bar(top_5_years, x='Year', y='Rating', title='Top 20 Years by Average Rating', color = "Rating", color_continuous_scale = "darkmint")
fig.update_xaxes(type='category')
fig.update_layout(xaxis_title='Year', yaxis_title='Average Rating', plot_bgcolor = 'white')
fig.show()

In [None]:
# TREDNS IN RATING ACROSS  YEAR:

# Group data by Year and calculate the average rating
average_rating_by_year = df.groupby('Year')['Rating'].mean().reset_index()

# Create the line plot with Plotly Express
fig = px.line(average_rating_by_year, x='Year', y='Rating', color_discrete_sequence=['#559C9E'])
fig.update_layout(title='Are there any trends in ratings across year?', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Year', yaxis_title='Rating', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor = 'white')
fig.show()

In [None]:
# Group data by Year and calculate the average rating
average_rating_by_year = df.groupby('Year')['Votes'].mean().reset_index()

# Create the line plot with Plotly Express
fig = px.line(average_rating_by_year, x='Year', y='Votes', color_discrete_sequence=['#559C9E'])
fig.update_layout(title='Are there any trends in votes across year?', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Year', yaxis_title='Votes', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor = 'white')
fig.show()

In [None]:
# Group data by Year and calculate the average rating
average_rating_by_year = df.groupby(['Year', 'Genre'])['Rating'].mean().reset_index()

top_5_genres = df['Genre'].value_counts().head(5).index

# Filter the data to include only the top 5 genres
average_rating_by_year = average_rating_by_year[average_rating_by_year['Genre'].isin(top_5_genres)]
fig = px.bar(average_rating_by_year, x='Year', y='Rating', color='Genre', barmode='group', color_discrete_sequence=['#559C9E', '#0B1F26', '#00CC96', '#2ca02c', '#9467bd'])

fig.update_layout(
    title='Average Rating by Year for Top 5 Genres',
    xaxis_title='Year',
    yaxis_title='Average Rating',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    plot_bgcolor='white'
)

fig.show()

In [None]:
# Impact of Movie length on Rating:

fig_dur_rat = px.scatter(df, x = 'Duration', y = 'Rating', trendline='ols', color = "Rating", color_continuous_scale = "darkmint")
fig_dur_rat.update_layout(title='Does length of movie have any impact on rating?', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Duration of Movie in Minutes', yaxis_title='Rating of a movie', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor = 'white')
fig_dur_rat.show()

In [None]:
# Impact of Movie length on Votes:

fig_dur_votes = px.scatter(df, x = 'Duration', y = 'Votes', trendline='ols', color = "Votes", color_continuous_scale = "darkmint")
fig_dur_votes.update_layout(title='Does length of movie have any impact on Votes?', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Duration of Movie in Minutes', yaxis_title='Votes of a movie', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor = 'white')
fig_dur_votes.show()

In [None]:
# Impact of Movie Rating on Votes:

fig_rat_votes = px.scatter(df, x = 'Rating', y = 'Votes', trendline='ols', color = "Votes", color_continuous_scale = "darkmint")
fig_rat_votes.update_layout(title='Does Ratings of movie have any impact on Votes?', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Ratings of Movies', yaxis_title='Votes of movies', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor = 'white')
fig_rat_votes.show()

**DATA PROCESSING FOR MODEL BUILDING**

In [None]:
# DROPPING UNWANTED COLUMNS:

df.drop('Name', axis = 1, inplace = True)

FEATURE EXTRACTION

In [None]:
# Feature extraction
df['Genre_mean_rating'] = df.groupby('Genre')['Rating'].transform('mean')
df['Director_encoded'] = df.groupby('Director')['Rating'].transform('mean')
df['Actor1_encoded'] = df.groupby('Actor 1')['Rating'].transform('mean')
df['Actor2_encoded'] = df.groupby('Actor 2')['Rating'].transform('mean')
df['Actor3_encoded'] = df.groupby('Actor 3')['Rating'].transform('mean')

# Ensure all columns used for training are numeric

df['Year'] = df['Year'].str.replace('[()]', '', regex=True)
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
df['Genre_mean_rating'] = pd.to_numeric(df['Genre_mean_rating'], errors='coerce')
df['Director_encoded'] = pd.to_numeric(df['Director_encoded'], errors='coerce')
df['Actor1_encoded'] = pd.to_numeric(df['Actor1_encoded'], errors='coerce')
df['Actor2_encoded'] = pd.to_numeric(df['Actor2_encoded'], errors='coerce')
df['Actor3_encoded'] = pd.to_numeric(df['Actor3_encoded'], errors='coerce')
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

In [None]:
# Drop rows with missing values
df.dropna(inplace=True)

In [None]:
# Splitting data for model training
X = df[['Year', 'Votes', 'Duration', 'Genre_mean_rating', 'Director_encoded', 'Actor1_encoded', 'Actor2_encoded', 'Actor3_encoded']]
y = df['Rating']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

**MACHINE LEARNING MODEL**

In [None]:
# LINEAR REGRESSION
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [None]:
# RANDOM FOREST
rf = RandomForestRegressor()
rf.fit(X_train,y_train)
rf_pred = rf.predict(X_test)

**MODEL EVALUATION**

In [None]:
# Evaluating the performance of trained algos

print('The performance evaluation of Logistic Regression is below: ', '\n')
print('Mean squared error: ',mean_squared_error(y_test, lr_pred))
print('Mean absolute error: ',mean_absolute_error(y_test, lr_pred))
print('R2 score: ',r2_score(y_test, lr_pred))
print('\n', '='*100, '\n')

print('The performance evaluation of Random Forest Regressor is below: ', '\n')
print('Mean squared error: ',mean_squared_error(y_test, rf_pred))
print('Mean absolute error: ',mean_absolute_error(y_test, rf_pred))
print('R2 score: ',r2_score(y_test, rf_pred))

The performance evaluation of Logistic Regression is below:  

Mean squared error:  0.1397803872882309
Mean absolute error:  0.271469508765844
R2 score:  0.926720988593957


The performance evaluation of Random Forest Regressor is below:  

Mean squared error:  0.11910519934640536
Mean absolute error:  0.19447385620915086
R2 score:  0.9375598291666836


**MODEL PREDICTION**

In [None]:
X.head()

Unnamed: 0,Year,Votes,Duration,Genre_mean_rating,Director_encoded,Actor1_encoded,Actor2_encoded,Actor3_encoded
1,2019,8,109,6.420152,7.0,6.85,7.0,7.0
10,2004,17,96,6.420152,6.2,5.766667,5.1,6.2
11,2016,59,120,4.698529,5.9,5.9,5.9,5.9
30,2005,1002,116,6.420152,6.525,6.9,6.866667,5.7
32,1993,15,168,6.420152,5.4,5.6,6.4,5.825


In [None]:
y.head()

Unnamed: 0,Rating
1,7.0
10,6.2
11,5.9
30,7.1
32,5.6


In [None]:
data = {'Year': [2005], 'Votes': [1002], 'Duration': [116], 'Genre_mean_rating': [6.4], 'Director_encoded': [6.5], 'Actor1_encoded': [6.9], 'Actor2_encoded': [6.8], 'Actor3_encoded': [5.700]}
df = pd.DataFrame(data)

In [None]:
predicted_rating_lr = rf.predict(df)

# Display the predicted rating
print("Predicted Rating:", predicted_rating_lr[0])

Predicted Rating: 6.850000000000006


In [None]:
predicted_rating_rf = rf.predict(df)

# Display the predicted rating
print("Predicted Rating:", predicted_rating_rf[0])

Predicted Rating: 6.850000000000006


**SECTION-2 OF MODEL BUILDING ANOTHER APROACH**

In [None]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,y,test_size=0.25,random_state=21)

In [None]:
xtrain.shape

(1146, 8)

In [None]:
xtest.shape

(382, 8)

In [None]:
ytrain.shape

(1146,)

In [None]:
ytest.shape

(382,)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV

In [None]:
dct = {
    'Linear':LinearRegression(),
    'DecisionTree':DecisionTreeRegressor(),
    'RandomForest':RandomForestRegressor(),
    'GradientBoosting':GradientBoostingRegressor(),
    'KNN':KNeighborsRegressor(),
    'SVR':SVR()
}

In [None]:
dct.items()

dict_items([('Linear', LinearRegression()), ('DecisionTree', DecisionTreeRegressor()), ('RandomForest', RandomForestRegressor()), ('GradientBoosting', GradientBoostingRegressor()), ('KNN', KNeighborsRegressor()), ('SVR', SVR())])

In [None]:
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import cross_val_score

In [None]:
train_mse = []
train_r2 = []
test_mse = []
test_r2 = []
train_cv = []

for name,model in dct.items():
    # fit the model
    m = model.fit(xtrain,ytrain)
    ypred_train = m.predict(xtrain)
    ypred_test = m.predict(xtest)
    # calculate MSE
    mse_train = mean_squared_error(ytrain,ypred_train)
    mse_test = mean_squared_error(ytest,ypred_test)
    # caluclate R2
    r2_train = (r2_score(ytrain,ypred_train))*100
    r2_test = (r2_score(ytest,ypred_test))*100
    # calculate cross validated scores
    cv = cross_val_score(m,xtrain,ytrain,cv=5,scoring='r2')
    scores = (cv.mean())*100

    # add these values to the respective list to compare the output
    train_mse.append(mse_train)
    train_r2.append(r2_train)
    test_mse.append(mse_test)
    test_r2.append(r2_test)
    train_cv.append(scores)

    # print the results
    print(f'Scores for {name}')
    print("Training Scores")
    print(f'MSE:{train_mse}')
    print(f'R2:{train_r2}')
    print("Testing Scores")
    print(f'MSE:{test_mse}')
    print(f'R2:{test_r2}')
    print(f'CV:{train_cv}')


Scores for Linear
Training Scores
MSE:[0.1473025948894715]
R2:[92.88104686773477]
Testing Scores
MSE:[0.12485564838312754]
R2:[92.93456125517588]
CV:[92.64126146562805]
Scores for DecisionTree
Training Scores
MSE:[0.1473025948894715, 2.7844174185157006e-31]
R2:[92.88104686773477, 100.0]
Testing Scores
MSE:[0.12485564838312754, 0.1910994764397906]
R2:[92.93456125517588, 89.18589857616917]
CV:[92.64126146562805, 87.46378141033414]
Scores for RandomForest
Training Scores
MSE:[0.1473025948894715, 2.7844174185157006e-31, 0.01729279930191962]
R2:[92.88104686773477, 100.0, 99.16426029121614]
Testing Scores
MSE:[0.12485564838312754, 0.1910994764397906, 0.09846025916230369]
R2:[92.93456125517588, 89.18589857616917, 94.42824622738675]
CV:[92.64126146562805, 87.46378141033414, 93.46568072795016]
Scores for GradientBoosting
Training Scores
MSE:[0.1473025948894715, 2.7844174185157006e-31, 0.01729279930191962, 0.06573826075366622]
R2:[92.88104686773477, 100.0, 99.16426029121614, 96.82295075892495]
T

In [None]:
res = {'Name':list(dct.keys()),
       'MSE Training Scores':train_mse,
       'MSE Testing Scores':test_mse,
       'R2 Training Scores':train_r2,
       'R2 Testing Scores':test_r2,
       'CV Training Scores':train_cv}

In [None]:
df_res = pd.DataFrame(res)
df_res.sort_values('CV Training Scores',ascending=False)

Unnamed: 0,Name,MSE Training Scores,MSE Testing Scores,R2 Training Scores,R2 Testing Scores,CV Training Scores
2,RandomForest,0.0172928,0.09846,99.16426,94.428246,93.465681
3,GradientBoosting,0.06573826,0.115274,96.822951,93.476754,92.665086
0,Linear,0.1473026,0.124856,92.881047,92.934561,92.641261
1,DecisionTree,2.784417e-31,0.191099,100.0,89.185899,87.463781
4,KNN,0.9050283,1.377036,56.261097,22.075126,27.379528
5,SVR,2.053679,1.809281,0.748222,-2.385134,-0.604964


In [None]:
# Lets consider Random Forest Regressor as its giving good results

params = {'n_estimators':[200,300],
          'max_depth':[5,6,7,8],
          'min_samples_split':[2,3,4,5,6],
          'criterion':['squared_error','absolute_error']}

In [None]:
rfr = RandomForestRegressor()
rscv = RandomizedSearchCV(rfr,params,cv=3,scoring='neg_mean_squared_error')
rscv.fit(xtrain,ytrain)

In [None]:
rscv.best_params_

{'n_estimators': 300,
 'min_samples_split': 2,
 'max_depth': 6,
 'criterion': 'squared_error'}

In [None]:
best_rfr = rscv.best_estimator_
best_rfr

Random Forest model gives score results around 77%.

I am using XG Boost to check the results and see if this model improves the prediction scores

In [None]:
from xgboost import XGBRegressor

In [None]:
model = XGBRegressor()
model.fit(xtrain,ytrain)

In [None]:
model.score(xtrain,ytrain)

0.9997770430373227

In [None]:
model.score(xtest,ytest)

0.9257553167205508

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
params = {'n_estimators':[200,300,500,600,800,1000],
          'learning_rate':[0.05,0.1,0.2,0.3],
          'max_depth':[5,6,7,8,9,10],
          'min_child_weight':[1,2,3],
          'objective':['reg:squarederror'],
          'gamma':[0.1,0.2,0.3,0.4]}

In [None]:
gscv = GridSearchCV(model,params,scoring='neg_mean_squared_error',cv=5)
gscv.fit(xtrain,ytrain)

NameError: name 'GridSearchCV' is not defined

In [None]:
gscv.best_params_

In [None]:
best_xgb = gscv.best_estimator_
best_xgb

In [None]:
best_xgb.score(xtrain,ytrain)

In [None]:
best_xgb.score(xtest,ytest)

Lets check by tuning other parameters to this model

In [None]:
params1 = {'subsample':[0.5,0.6,0.7,0.8,0.9,1],
           'colsample_bytree':[0.5,0.6,0.7,0.8,0.9,1]}

In [None]:
gscv1 = GridSearchCV(best_xgb,params1,cv=5,scoring='neg_mean_squared_error')
gscv1.fit(xtrain,ytrain)

In [None]:
gscv1.best_params_

In [None]:
best_xgb2 = gscv1.best_estimator_
best_xgb2

In [None]:
best_xgb2.score(xtrain,ytrain)

In [None]:
best_xgb2.score(xtest,ytest)

**Evaluate the models : Random Forest and XGBoost**

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [None]:
def eval_model(model,xtrain,ytrain):
    # Fit the model
    model.fit(xtrain,ytrain)
    # Predict the scores
    ypred_test = model.predict(xtest)
    # Calculate MSE,RMSE,MAE,R2 scores
    mse = mean_squared_error(ytest,ypred_test)
    rmse = mse**(1/2)
    mae = mean_absolute_error(ytest,ypred_test)
    r2 = r2_score(ytest,ypred_test)
    return mse,rmse,mae,r2

**Random Forest Evaluation Metrics**

In [None]:
(MSE,RMSE,MAE,r2) = eval_model(best_rfr,xtrain,ytrain)
print(f'Evaluation Metrics: \nMSE: {MSE}\nRMSE:{RMSE}\nMAE:{MAE}\nR2:{r2}')

**XGBoost Evaluation Metrics**

In [None]:
(MSE,RMSE,MAE,r2) = eval_model(best_xgb2,xtrain,ytrain)
print(f'Evaluation Metrics: \nMSE: {MSE}\nRMSE:{RMSE}\nMAE:{MAE}\nR2:{r2}')

Therefore, XGBoost is providing best test score as compared to other models. Considering XGBoost for final prediction

**Section-2 Model Prediction**

In [None]:
ypred_test = best_xgb2.predict(xtest)
ypred_test[:10]

In [None]:
ytest.head(10)

In [None]:
df_final = xtest
df_final['Predicted_Rating'] = ypred_test
df_final

In [None]:
df_final.to_csv('Predicted Ratings.csv',index=False)