In [1]:
# import all necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.linear_model import ridge_regression,Ridge,LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import xgboost as xg
import warnings
warnings.filterwarnings('ignore')
print('Setup complete')

Setup complete


In [2]:
# Load the CSV file into a dataframe.
# The Windows path is written as a raw string so the backslashes are taken
# literally instead of being parsed as (invalid) escape sequences such as "\M".
DATA_PATH = r'C:\MLCourse\movies.csv'
df = pd.read_csv(DATA_PATH)
df.head(2)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0


In [3]:
# Count the missing values in every column
df.isna().sum()

name           0
rating        77
genre          0
year           0
released       2
score          3
votes          3
director       0
writer         3
star           1
country        3
budget      2171
gross        189
company       17
runtime        4
dtype: int64

In [4]:
# Keep only the fully populated rows so that no value has to be imputed, then
# renumber them; reset_index() stores the previous row labels in a new 'index' column
df = df.dropna(axis='index').reset_index()

In [5]:
# Sanity check: number of columns that still contain at least one missing value
df.isna().any().sum()

0

In [6]:
# Check the shape (rows, columns) of the dataframe after dropping null values
df.shape

(5421, 16)

In [7]:
# Check for correlation among the numeric features.
# Only the numeric columns are handed to corr(): the frame still contains text
# columns (name, genre, director, ...), and recent pandas releases raise on
# non-numeric data instead of silently skipping it.
df.select_dtypes(include='number').corr()

Unnamed: 0,index,year,score,votes,budget,gross,runtime
index,1.0,0.999522,0.046493,0.192525,0.319194,0.264121,0.067925
year,0.999522,1.0,0.056386,0.206021,0.327722,0.274321,0.075077
score,0.046493,0.056386,1.0,0.474256,0.072001,0.222556,0.414068
votes,0.192525,0.206021,0.474256,1.0,0.439675,0.614751,0.352303
budget,0.319194,0.327722,0.072001,0.439675,1.0,0.740247,0.318695
gross,0.264121,0.274321,0.222556,0.614751,0.740247,1.0,0.275796
runtime,0.067925,0.075077,0.414068,0.352303,0.318695,0.275796,1.0


In [8]:
# Creating a new column 'id': a running row number 0 .. n-1.
# The length is taken from the dataframe itself rather than hard-coded, so the
# cell keeps working when the number of rows left after cleaning changes.
df['id'] = np.arange(len(df))

In [9]:
# Check the datatype of every column to see which ones are non-numeric
df.dtypes

index         int64
name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes       float64
director     object
writer       object
star         object
country      object
budget      float64
gross       float64
company      object
runtime     float64
id            int32
dtype: object

In [10]:
# One-hot encode the two categorical columns ('rating' and 'genre') and append
# the resulting indicator columns to the right of the original dataframe
oh = pd.get_dummies(df.loc[:, ['rating', 'genre']])
df_new = pd.concat([df, oh], axis='columns')

In [11]:
# Remove the columns that are not used for modelling: the leftover 'index'
# column, the text fields, and the two columns that were just one-hot encoded
df_new = df_new.drop(
    columns=[
        'index', 'name', 'rating', 'genre', 'released',
        'star', 'director', 'writer', 'country', 'company',
    ]
)

In [12]:
# Print the column names of the new dataframe to confirm what remains after the drop
df_new.columns

Index(['year', 'score', 'votes', 'budget', 'gross', 'runtime', 'id',
       'rating_Approved', 'rating_G', 'rating_NC-17', 'rating_Not Rated',
       'rating_PG', 'rating_PG-13', 'rating_R', 'rating_TV-MA',
       'rating_Unrated', 'rating_X', 'genre_Action', 'genre_Adventure',
       'genre_Animation', 'genre_Biography', 'genre_Comedy', 'genre_Crime',
       'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_Horror',
       'genre_Mystery', 'genre_Romance', 'genre_Sci-Fi', 'genre_Thriller',
       'genre_Western'],
      dtype='object')

In [13]:
# Reorder the columns: 'id' first, then the numeric fields, then the rating
# indicators, then the genre indicators
df_new = df_new.loc[:, [
    'id', 'year', 'score', 'votes', 'budget', 'gross', 'runtime',
    # rating indicators
    'rating_Approved', 'rating_G', 'rating_NC-17', 'rating_Not Rated',
    'rating_PG', 'rating_PG-13', 'rating_R', 'rating_TV-MA',
    'rating_Unrated', 'rating_X',
    # genre indicators
    'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Biography',
    'genre_Comedy', 'genre_Crime', 'genre_Drama', 'genre_Family',
    'genre_Fantasy', 'genre_Horror', 'genre_Mystery', 'genre_Romance',
    'genre_Sci-Fi', 'genre_Thriller', 'genre_Western',
]]

In [14]:
# Target: the gross revenue column
y = df_new['gross']
# Features: every remaining column except the target
# NOTE(review): 'id' is only a running row number added earlier in the notebook —
# confirm that it is really meant to be used as a predictor
X = df_new.loc[:, [
    'id', 'year', 'score', 'votes', 'runtime', 'budget',
    # rating indicators
    'rating_Approved', 'rating_G', 'rating_NC-17', 'rating_Not Rated',
    'rating_PG', 'rating_PG-13', 'rating_R', 'rating_TV-MA',
    'rating_Unrated', 'rating_X',
    # genre indicators
    'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Biography',
    'genre_Comedy', 'genre_Crime', 'genre_Drama', 'genre_Family',
    'genre_Fantasy', 'genre_Horror', 'genre_Mystery', 'genre_Romance',
    'genre_Sci-Fi', 'genre_Thriller', 'genre_Western',
]]

In [15]:
# Perform standard scaling (zero mean, unit variance).
# Features and target each get their own scaler: reusing a single StandardScaler
# would refit it on y and discard the statistics learned from X, so the feature
# scaling could no longer be applied to new data or inverted.
sc = StandardScaler()      # feature scaler
sc_y = StandardScaler()    # target scaler
X = sc.fit_transform(X)
# StandardScaler needs 2-D input; the result is flattened back to 1-D, the target
# shape scikit-learn estimators expect. np.asarray accepts both the pandas Series
# produced by the previous cell and an already-scaled array when the cell is re-run.
y = sc_y.fit_transform(np.asarray(y).reshape(-1, 1)).ravel()
# NOTE(review): both scalers are fitted on the full dataset before the train/test
# split, so test-set statistics leak into the scaling — consider fitting on the
# training portion only.

In [16]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [17]:
# Random Forest Regressor
# The split criterion is left at its default (squared error): the explicit "mse"
# spelling was deprecated and later removed from scikit-learn, whereas the
# default selects the same criterion on every release.
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
# ravel() hands the forest a 1-D target even when y_train arrives as a column
# vector of shape (n, 1), which would otherwise trigger a DataConversionWarning
model_rf = rf.fit(X_train, y_train.ravel())
y_pred_rf = model_rf.predict(X_test)
# r2_score is the coefficient of determination (R^2); it is reported as a percentage
acc_rf = r2_score(y_test, y_pred_rf)
acc_rf = acc_rf*100
print(r"Accuracy of the RandomForestRegressor model is {:.2f}%".format(acc_rf))

Accuracy of the RandomForestRegressor model is 78.12%


In [18]:
# Decision Tree Regressor
# criterion and max_features are left at their defaults: the explicit values
# 'mse' and "auto" were deprecated and later removed from scikit-learn, while
# the defaults (squared error, all features considered at each split) are
# equivalent for a regression tree on every release.
dt = DecisionTreeRegressor(max_depth=5, random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
# r2_score is the coefficient of determination (R^2); it is reported as a percentage
acc_dt = r2_score(y_test, y_pred_dt)
acc_dt = acc_dt*100
print(r"Accuracy of the DecisionTreeRegressor model is {:.2f}%".format(acc_dt))

Accuracy of the DecisionTreeRegressor model is 70.35%


In [19]:
# Ridge Regressor (linear model with regularisation strength alpha=2.0)
model_R = Ridge(random_state=42, alpha=2.0)
model_R.fit(X_train, y_train)
y_pred_R = model_R.predict(X_test)
# R^2 on the held-out set, expressed as a percentage
acc_R = 100 * r2_score(y_test, y_pred_R)
print(r"Accuracy of the RidgeRegressor model is {:.2f}%".format(acc_R))

Accuracy of the RidgeRegressor model is 67.08%


In [20]:
# XGBoost Regressor: 200 boosting rounds, learning rate 0.1, squared-error objective
xgb_r = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    learning_rate=0.1,
)
xgb_r.fit(X_train, y_train)
y_pred_xgbr = xgb_r.predict(X_test)
# R^2 on the held-out set, expressed as a percentage
acc_xgbr = 100 * r2_score(y_test, y_pred_xgbr)
print(r"Accuracy of the XGBRegressor model is {:.2f}%".format(acc_xgbr))

Accuracy of the XGBRegressor model is 79.28%


For the given dataset, the **XGBoost Regressor** achieves the highest R² score (79.28%) of the four models evaluated.