In [100]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [101]:
imdb = pd.read_csv("IMDB_Final.csv")

In [102]:
imdb.head()

Unnamed: 0.1,Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,...,History,Sci-Fi,Action,Fantasy,Adventure,Sport,positive_words,negative_words,verbs,stopwords
0,0,The Shawshank Redemption,1994,1,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,0,...,0,0,0,0,0,0,4,0,3,7
1,1,The Godfather,1972,1,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,0,...,0,0,0,0,0,0,0,2,1,6
2,2,The Dark Knight,2008,14,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,1,...,0,0,1,0,0,0,1,7,5,15
3,3,The Godfather: Part II,1974,1,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,0,...,0,0,0,0,0,0,0,1,5,13
4,4,12 Angry Men,1957,12,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,0,...,0,0,0,0,0,0,0,1,3,8


In [103]:
#removing non numerical columns since we already extracted NLP features
new_df = imdb.loc[:, ~imdb.columns.isin(['Series_Title', 'Genre', 'Overview'])]

In [104]:
new_df.head(10)

Unnamed: 0.1,Unnamed: 0,Released_Year,Certificate,Runtime,IMDB_Rating,Meta_score,Director,Musical,Comedy,Horror,...,History,Sci-Fi,Action,Fantasy,Adventure,Sport,positive_words,negative_words,verbs,stopwords
0,0,1994,1,142,9.3,80.0,0,0,0,0,...,0,0,0,0,0,0,4,0,3,7
1,1,1972,1,175,9.2,100.0,0,0,0,0,...,0,0,0,0,0,0,0,2,1,6
2,2,2008,14,152,9.0,84.0,1,0,0,0,...,0,0,1,0,0,0,1,7,5,15
3,3,1974,1,202,9.0,90.0,0,0,0,0,...,0,0,0,0,0,0,0,1,5,13
4,4,1957,12,96,9.0,96.0,0,0,0,0,...,0,0,0,0,0,0,0,1,3,8
5,5,2003,12,201,8.9,94.0,0,0,0,0,...,0,0,1,0,1,0,1,1,9,18
6,6,1994,1,154,8.9,94.0,0,0,0,0,...,0,0,0,0,0,0,1,2,4,12
7,7,1993,1,195,8.9,94.0,1,0,0,0,...,1,0,0,0,0,0,0,2,1,8
8,8,2010,14,148,8.8,74.0,1,0,0,0,...,0,1,1,0,1,0,0,1,3,13
9,9,1999,1,139,8.8,66.0,1,0,0,0,...,0,0,0,0,0,0,0,0,5,7


In [105]:
X = new_df.loc[:, new_df.columns!='IMDB_Rating']
Y = new_df['IMDB_Rating']

In [106]:
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)

# XGBoost

In [107]:
from xgboost import XGBRegressor

In [108]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, objective='reg:squarederror',
             predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

### Predicting on Test Set

In [109]:
pred_xgb = xgb.predict(X_test)

In [121]:
#calculating and printing mean square error
print("Mean Square error: " , mean_squared_error(y_test, pred_xgb, squared=False))

Mean Square error:  0.01100226761659747


In [123]:
#calculating and printing r-squared score
print("R_squared score :", r2_score(y_test, pred_xgb))

R_squared score : 0.9981810170267215


In [124]:
xgb.score(X_test,y_test)

0.9981810170267215

In [125]:
from sklearn.metrics import explained_variance_score
print(explained_variance_score(pred_xgb,y_test))

0.9981892865545787


### Prediciting on Training Set

In [118]:
pred_xgb1 = xgb.predict(X_train)

In [119]:
mean_squared_error(y_train, pred_xgb1, squared=False)

0.00017183813074128442

In [126]:
r2_score(y_train, pred_xgb1)

0.9999996304948437

### MLP - SKLearn implementation

In [115]:
from sklearn.neural_network import MLPRegressor

In [116]:
regr = MLPRegressor(random_state=101,hidden_layer_sizes = (100,10), max_iter=500, activation = "relu").fit(X_train, y_train)
pred_mlp = regr.predict(X_test)

0.45027736144076397

In [127]:
mean_squared_error(y_test, pred_mlp, squared=False)

0.45027736144076397