In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

## Data Preprocessing
* Creating Pandas Dataframes
* Checking the datatypes
* Finding Data Patterns through graphical representation 
* Finding correlations

In [None]:
# Read the training and test CSV files, then report each frame's (rows, columns).
train_csv = '/kaggle/input/imdb-prediction-by-123-of-ai-weekend-hackathon/train_data.csv'
test_csv = '/kaggle/input/imdb-prediction-by-123-of-ai-weekend-hackathon/test_data_with_inputs.csv'
mov_train, mov_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))
print(mov_train.shape, mov_test.shape)

In [None]:
# Column names, dtypes and non-null counts of the training frame.
mov_train.info()

In [None]:
# Inspect the raw `genres` column before it is split on '|' in the next cell.
mov_train['genres']

In [None]:
# Derive the first and second genre from the pipe-delimited `genres` string.
#
# `genres` itself is left untouched: replacing it with Python lists makes the
# column unhashable (which breaks hash-based summaries of object columns) and
# makes this cell fail when it is run a second time.
genre_parts = mov_train['genres'].str.split('|')
mov_train['genre_1'] = genre_parts.str[0]
# When a movie lists a single genre, genre_2 falls back to genre_1.
# The .str accessor propagates missing values instead of raising.
mov_train['genre_2'] = genre_parts.str[1].fillna(genre_parts.str[0])
mov_train[['genre_1', 'genre_2']].head()


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
mov_train.describe()

In [None]:
# Pairwise scatter plots / histograms for a hand-picked subset of columns.
pairplot_columns = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'director_facebook_likes',
    'gross',
    'Profit',
    'genre_1',
    'genre_2',
    'critic_review_ratio',
    'imdb_score',
]
pairplot_frame = mov_train[pairplot_columns]
sns.pairplot(pairplot_frame)
plt.show()

### From the bivariate plots of the numeric data above, it seems that a lot of the data has outliers, and most of the univariate graphs show a spread that is skewed rather than normal. Removing outliers may help reveal some patterns.

Also, from the `describe` output we can see that for some variables the gap between the 75th percentile and the maximum is much larger than the gap between the mean and the 75th percentile, which indicates possible outliers.

In [None]:
# Histogram (with KDE overlay) of the director's Facebook likes.
likes_fig = plt.figure(figsize=(10, 2))
sns.set_style('whitegrid')
sns.histplot(
    data=mov_train,
    x='director_facebook_likes',
    kde=True,
    color='red',
    ax=likes_fig.gca(),
)
plt.show()

In [None]:
# Histogram (with KDE overlay) of the lead actor's Facebook likes.
likes_fig = plt.figure(figsize=(10, 2))
sns.set_style('whitegrid')
sns.histplot(
    data=mov_train,
    x='actor_1_facebook_likes',
    kde=True,
    color='red',
    ax=likes_fig.gca(),
)
plt.show()

#### More than 50% of actors don't have Facebook likes (they probably don't have Facebook accounts), but looking at the bivariate plot against IMDB score, many actors and directors with zero likes still have very high IMDB scores.

## Understanding Object type variables

In [None]:
# Summarise the object (string) columns.
#
# `np.object` was only an alias of the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
#
# If `genres` has been turned into Python lists by the earlier split cell, it is
# joined back into its pipe-delimited text for this summary only: lists are
# unhashable, and the object summary needs to count unique values.
object_summary_input = mov_train.assign(
    genres=mov_train['genres'].map(
        lambda value: '|'.join(value) if isinstance(value, list) else value
    )
)
object_summary_input.describe(include=[object])

#### Language has most of its data concentrated in only one or two categories. The counts need to be looked at closely.

In [None]:
# Bar chart of movies per language, followed by the exact counts.
language_fig = plt.figure(figsize=(10, 2))
sns.countplot(data=mov_train, x='language', ax=language_fig.gca())
plt.show()
mov_train[["language"]].value_counts()

In [None]:
# Bar chart of movies per country, followed by the exact counts.
country_fig = plt.figure(figsize=(10, 2))
sns.countplot(data=mov_train, x='country', ax=country_fig.gca())
plt.show()
mov_train[["country"]].value_counts()

In [None]:
# Bar chart of movies per primary genre.
genre_fig = plt.figure(figsize=(20, 2))
sns.countplot(data=mov_train, x='genre_1', ax=genre_fig.gca())
plt.show()

In [None]:
# One box plot of imdb_score per categorical column, stacked vertically.
plt.figure(figsize=(20, 12))
boxplot_columns = ['language', 'country', 'genre_1', 'genre_2', 'content_rating']
for row_number, category_column in enumerate(boxplot_columns, start=1):
    plt.subplot(5, 1, row_number)
    sns.boxplot(x=category_column, y='imdb_score', data=mov_train)
plt.show()

### The bivariate analysis above, covering the object variables of probably higher significance, shows that genres and content_rating have fewer categories and that the data is well spread among the various values. I believe this makes these variables more valuable in describing the data.

In [None]:
# imdb_score by content rating, split by primary genre.
rating_fig = plt.figure(figsize=(20, 5))
sns.boxplot(
    data=mov_train,
    x='content_rating',
    y='imdb_score',
    hue='genre_1',
    ax=rating_fig.gca(),
)
plt.show()

In [None]:
# imdb_score by content rating, split by secondary genre.
rating_fig = plt.figure(figsize=(20, 5))
sns.boxplot(
    data=mov_train,
    x='content_rating',
    y='imdb_score',
    hue='genre_2',
    ax=rating_fig.gca(),
)
plt.show()

In [None]:
# Bar chart of movies per secondary genre.
genre_fig = plt.figure(figsize=(20, 2))
sns.countplot(data=mov_train, x='genre_2', ax=genre_fig.gca())
plt.show()

In [None]:
# Bar chart of movies per primary genre (same plot as shown earlier in the notebook).
genre_fig = plt.figure(figsize=(20, 2))
sns.countplot(data=mov_train, x='genre_1', ax=genre_fig.gca())
plt.show()

# ---------------------------------------------------------------------------------------------------------------

## Feature Engineering
* Convert Dummy Variables for Language, Country, Genre & Content Rating

In [None]:
# Encode language as a single 0/1 indicator: 1 for English, 0 for anything else
# (including missing values).
#
# This yields the same `lang_English` column as mapping every language to
# 'lang_English'/'lang_Other', one-hot encoding and dropping 'lang_Other', but it
# does not rewrite `mov_train['language']` (so the cell can be re-run) and it
# cannot raise a KeyError when one of the two buckets happens to be empty.
lang = (mov_train['language'] == 'English').astype('int64').to_frame('lang_English')
lang.head()

In [None]:
# Append the language indicator to the training frame and retire the raw column.
mov_train = pd.concat([mov_train, lang], axis=1).drop(columns=['language'])
mov_train.head()

In [None]:
# Encode country as three 0/1 indicators (France, UK, USA); every other country
# and missing values are the implicit baseline with all three indicators at 0.
#
# Building the indicators by direct comparison gives the same columns, in the
# same order, as mapping every country to a bucket, one-hot encoding and dropping
# 'country_Other' -- without the long lookup table and without depending on each
# bucket actually occurring in the data.
country_indicator_source = {
    'country_FR': 'France',
    'country_UK': 'UK',
    'country_USA': 'USA',
}
country = pd.DataFrame({
    indicator: (mov_train['country'] == country_name).astype('int64')
    for indicator, country_name in country_indicator_source.items()
})

mov_train = pd.concat([mov_train, country], axis=1).drop(columns=['country'])
mov_train.head()

In [None]:
# Encode the primary genre as seven 0/1 indicators; all remaining genres and
# missing values form the implicit baseline (every indicator 0).
#
# Direct comparison produces the same columns, in the same (alphabetical) order,
# as mapping to buckets, one-hot encoding and dropping 'genre_other', and it
# always creates every indicator column even if a genre is absent.
kept_genres = ['Action', 'Adventure', 'Comedy', 'Crime', 'Drama', 'Horror', 'Romance']
genre = pd.DataFrame({
    'genre_' + genre_name: (mov_train['genre_1'] == genre_name).astype('int64')
    for genre_name in kept_genres
})

# genre_2 is not encoded; both helper columns are dropped once the indicators exist.
mov_train = pd.concat([mov_train, genre], axis=1).drop(columns=['genre_1', 'genre_2'])
mov_train.head()

In [None]:
# Encode content rating as three 0/1 indicators (PG, PG-13, R); every other
# rating and missing values are the implicit baseline.
#
# Direct comparison produces the same columns, in the same order, as mapping to
# buckets, one-hot encoding and dropping 'contentRate_other', and it always
# creates all three indicator columns.
kept_ratings = ['PG', 'PG-13', 'R']
cnt_rt = pd.DataFrame({
    'contentRate_' + rating: (mov_train['content_rating'] == rating).astype('int64')
    for rating in kept_ratings
})

mov_train = pd.concat([mov_train, cnt_rt], axis=1).drop(columns=['content_rating'])
mov_train.head()

#### Let's drop some variables which we haven't converted from object type and which don't look to be of much significance

In [None]:
# Remove the free-text / name columns (and the poster face count) that are not
# used as model features.
unused_train_columns = [
    'director_name',
    'actor_3_name',
    'actor_2_name',
    'genres',
    'actor_1_name',
    'movie_title',
    'facenumber_in_poster',
    'plot_keywords',
]
mov_train = mov_train.drop(columns=unused_train_columns)

In [None]:
# Use the serial number as the row index so it is not treated as a feature.
mov_train = mov_train.set_index('s_no')

## Replicating changes in Test Matrix

In [None]:
# Bucket the test-set language into English vs. other, one-hot encode it and
# keep only the English indicator. Languages missing from the lookup become NaN
# and therefore get no indicator set.
non_english_languages = [
    'French', 'Spanish', 'Mandarin', 'Hindi', 'German', 'Japanese', 'Russian',
    'Cantonese', 'Italian', 'Korean', 'Portuguese', 'Danish', 'Persian',
    'Norwegian', 'Dutch', 'Hebrew', 'Arabic', 'Swedish', 'Thai', 'Aboriginal',
    'Dari', 'Zulu', 'Kazakh', 'Indonesian', 'Maya', 'Mongolian', 'Icelandic',
    'Hungarian', 'Polish', 'Greek', 'Romanian', 'Dzongkha', 'Czech', 'Telugu',
    'Bosnian', 'Vietnamese',
]
d = {'English': 'lang_English'}
d.update(dict.fromkeys(non_english_languages, 'lang_Other'))

mov_test['language'] = mov_test['language'].map(d)
lang = pd.get_dummies(mov_test['language']).drop(columns='lang_Other')
mov_test = pd.concat([mov_test, lang], axis=1).drop(columns=['language'])
mov_test.head()

In [None]:
# Bucket the test-set country into USA / UK / France / other, one-hot encode it
# and drop the 'other' bucket. An indicator column is only created for a bucket
# that actually occurs in the test data.
other_countries = [
    'Canada', 'Germany', 'Australia', 'Spain', 'India', 'China', 'Italy',
    'Japan', 'Hong Kong', 'New Zealand', 'South Korea', 'Mexico', 'Denmark',
    'Russia', 'Ireland', 'Brazil', 'South Africa', 'Norway', 'Netherlands',
    'Sweden', 'Switzerland', 'Thailand', 'West Germany', 'Iran',
    'Czech Republic', 'Israel', 'Iceland', 'Belgium', 'Romania', 'Argentina',
    'Finland', 'Bahamas', 'Turkey', 'Bulgaria', 'Taiwan', 'Cambodia', 'Chile',
    'Colombia', 'Soviet Union', 'Georgia', 'Slovakia', 'Poland', 'Philippines',
    'Peru', 'Official site', 'Greece', 'Nigeria', 'Hungary', 'New Line',
    'Libya', 'Afghanistan',
]
d = {'USA': 'country_USA', 'UK': 'country_UK', 'France': 'country_FR'}
d.update(dict.fromkeys(other_countries, 'country_Other'))

mov_test['country'] = mov_test['country'].map(d)
country = pd.get_dummies(mov_test['country']).drop(columns='country_Other')
mov_test = pd.concat([mov_test, country], axis=1).drop(columns=['country'])
mov_test.head()

In [None]:
# Derive the first and second genre from the pipe-delimited `genres` string of
# the test set, mirroring the training-set treatment.
#
# `genres` itself is left untouched: replacing it with Python lists makes the
# column unhashable and makes this cell fail when it is run a second time.
test_genre_parts = mov_test['genres'].str.split('|')
mov_test['genre_1'] = test_genre_parts.str[0]
# When a movie lists a single genre, genre_2 falls back to genre_1.
# The .str accessor propagates missing values instead of raising.
mov_test['genre_2'] = test_genre_parts.str[1].fillna(test_genre_parts.str[0])
mov_test[['genre_1', 'genre_2']].head()


In [None]:
# Bucket the test-set primary genre, one-hot encode it and drop the 'other'
# bucket. An indicator column is only created for a genre that actually occurs
# as a primary genre in the test data.
other_genres = [
    'Biography', 'Thriller', 'Mystery', 'Fantasy', 'Animation', 'Family',
    'Sci-Fi', 'Documentary', 'Music', 'History', 'Sport', 'War', 'Western',
    'Musical', 'Film-Noir', 'News',
]
d = {
    'Drama': 'genre_Drama',
    'Comedy': 'genre_Comedy',
    'Action': 'genre_Action',
    'Adventure': 'genre_Adventure',
    'Crime': 'genre_Crime',
    'Horror': 'genre_Horror',
    'Romance': 'genre_Romance',
}
d.update(dict.fromkeys(other_genres, 'genre_other'))

mov_test['genre_1'] = mov_test['genre_1'].map(d)
genre = pd.get_dummies(mov_test['genre_1']).drop(columns='genre_other')
mov_test = pd.concat([mov_test, genre], axis=1).drop(columns=['genre_1', 'genre_2'])
mov_test.head()

In [None]:
# Bucket the test-set content rating into R / PG-13 / PG / other, one-hot encode
# it and drop the 'other' bucket.
other_ratings = [
    'G', 'Not Rated', 'Unrated', 'Approved', 'X', 'NC-17', 'GP', 'Passed',
    'M', 'TV-14', 'TV-G', 'TV-PG',
]
d = {'R': 'contentRate_R', 'PG-13': 'contentRate_PG-13', 'PG': 'contentRate_PG'}
d.update(dict.fromkeys(other_ratings, 'contentRate_other'))

mov_test['content_rating'] = mov_test['content_rating'].map(d)
cnt_rt = pd.get_dummies(mov_test['content_rating']).drop(columns='contentRate_other')
mov_test = pd.concat([mov_test, cnt_rt], axis=1).drop(columns=['content_rating'])
mov_test.head()

In [None]:
# Remove the columns of the test frame that are not used for prediction
# (names, title, keywords, poster face count, and the vote count).
unused_test_columns = [
    'director_name',
    'actor_3_name',
    'actor_2_name',
    'actor_1_name',
    'movie_title',
    'num_voted_users',
    'genres',
    'facenumber_in_poster',
    'plot_keywords',
]
mov_test = mov_test.drop(columns=unused_test_columns)

In [None]:
# Columns and dtypes of the processed test frame, for comparison with the training frame.
mov_test.info()

In [None]:
# Columns and dtypes of the processed training frame.
mov_train.info()

# ---------------------------------------------------------------------------------------------------------------------

## Setting up Train Test Sets & Model Creation

In [None]:
# Hold out 30% of the labelled data for validation; the split itself is made
# reproducible through random_state.
np.random.seed(0)
df_train, df_test = train_test_split(
    mov_train,
    test_size=0.3,
    train_size=0.7,
    random_state=100,
)
df_train.head()

In [None]:

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Continuous columns to rescale to [0, 1]; the 0/1 dummy variables are left as they are.
num_vars = ['director_facebook_likes', 'actor_3_facebook_likes', 'actor_1_facebook_likes', 'gross', 'budget', 'title_year', 'actor_2_facebook_likes', 'aspect_ratio', 'movie_facebook_likes', 'Profit', 'Profit_Percentage', 'critic_review_ratio']

# train_test_split hands back a slice of mov_train; take an explicit copy so the
# column assignment below writes to an independent frame and does not trigger
# pandas' SettingWithCopyWarning.
df_train = df_train.copy()

# The scaler is FITTED on the training split only. The validation and test data
# are later transformed with these training-set minima/maxima, so no
# information from held-out data leaks into the model.
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_train.head()

In [None]:
# Dtypes and non-null counts of the training split after scaling.
df_train.info()

In [None]:
# Annotated heatmap of the pairwise correlations in the training split.
heatmap_fig = plt.figure(figsize=(25, 10))
train_correlations = df_train.corr()
sns.heatmap(train_correlations, annot=True, cmap="YlGnBu", ax=heatmap_fig.gca())
plt.show()

### Above Heatmap is depicting 
* High Correlation between 
    1. Budget & Profit
    2. cast_total_facebook_likes & actor_1_facebook_likes ---> this is quite possible as one data is derived from other
* Considerable reverse correlation 
    1. between gross & num_voted_users
    2. between movie_facebook_likes & num_voted_users
    3. cast_total_facebook_likes & actor_2_facebook_likes ---> this is quite possible as one data is derived from other
    
    
*  imdb_score also has clear correlation with  duration & num_Voted_users 

In [None]:
# Separate the target from the features without mutating df_train, so this cell
# can be re-run (a second `pop` would raise KeyError because the column is gone).
y_train = df_train['imdb_score']
x_train = df_train.drop(columns=['imdb_score'])


In [None]:
# Sanity check: the feature matrix and the target must have the same number of rows.
print(x_train.shape,y_train.shape)
# The validation shapes are not printed here: x_test / y_test are only created further down.

# Model Creation 

- Using RFE (Recursive Feature Elimination), an automated method, to create the initial model by 
    - automatically assessing and reducing the feature set to the most significant features and 
    - giving significance rankings for the rest of the features 
    
- Using a manual method to create subsequent models by fine-tuning based on 
     - assessing the change in R2 & adjusted R2 (should be a positive trend) with each individual feature addition
     - assessing the VIF value of all features, which indicates the correlation between independent features/variables (anything above 5 should be checked)

In [None]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
# Making RFE based auto model
# Recursive feature elimination with a linear-regression estimator.
# RFE fits its own clone of the estimator at every elimination step, so the
# estimator does not need to be fitted beforehand.
lm = LinearRegression()
# Keep the 9 highest-ranked features.
rfe = RFE(lm, n_features_to_select=9)
rfe = rfe.fit(x_train, y_train)

In [None]:
# (feature name, selected?, RFE rank) for every candidate feature.
list(zip(x_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Names of the features RFE kept (support_ is a boolean mask over x_train.columns).
col = x_train.columns[rfe.support_]
col

In [None]:
# Names of the features RFE eliminated (inverse of the support mask).
x_train.columns[~rfe.support_]

In [None]:
# Training features restricted to the RFE-selected columns.
x_train_rfe = x_train[col]

In [None]:
# Preview the reduced training feature matrix.
x_train_rfe.head()

In [None]:
import statsmodels.api as sm  
# Add a constant column to the design matrix; sm.OLS only estimates an
# intercept when the design matrix contains one.
x_train_rfe2 = sm.add_constant(x_train_rfe)

In [None]:
# Preview the design matrix including the added constant column.
x_train_rfe2.head()


In [None]:
# Fit ordinary least squares on the RFE-selected features.
# Note: this rebinds `lm`, which previously held the scikit-learn LinearRegression.
lm = sm.OLS(y_train,x_train_rfe2).fit()   # Running the linear model

In [None]:
# Print the regression summary of the first OLS model.
print(lm.summary())

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
# Variance inflation factor of every RFE-selected feature, highest first.
rfe_feature_values = x_train_rfe.values
vif = pd.DataFrame({
    'Features': x_train_rfe.columns,
    'VIF': [
        variance_inflation_factor(rfe_feature_values, column_position)
        for column_position in range(x_train_rfe.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
#Dropping feature 'lang_English' & Creating a new model
x_train_new = x_train_rfe.drop('lang_English',axis = 1, inplace = False) 
x_train_lm2 = sm.add_constant(x_train_new)
lm2 = sm.OLS(y_train,x_train_lm2).fit()   # Running the linear model
print(lm2.summary())

In [None]:
# Variance inflation factor of every feature in the second model, highest first.
new_feature_values = x_train_new.values
vif = pd.DataFrame({
    'Features': x_train_new.columns,
    'VIF': [
        variance_inflation_factor(new_feature_values, column_position)
        for column_position in range(x_train_new.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Rescale the validation split with the scaler that was fitted on the training
# split (transform only -- the scaler is not refitted here).
num_vars = ['director_facebook_likes', 'actor_3_facebook_likes', 'actor_1_facebook_likes', 'gross', 'budget', 'title_year', 'actor_2_facebook_likes', 'aspect_ratio', 'movie_facebook_likes', 'Profit', 'Profit_Percentage', 'critic_review_ratio']
# df_test is a slice produced by train_test_split; copy it first so the
# assignment writes to an independent frame and does not trigger pandas'
# SettingWithCopyWarning.
df_test = df_test.copy()
df_test[num_vars] = scaler.transform(df_test[num_vars])


In [None]:
# Separate the validation target from the features without mutating df_test, so
# this cell can be re-run (a second `pop` would raise KeyError).
y_test = df_test['imdb_score']
x_test = df_test.drop(columns=['imdb_score'])

In [None]:
# Add the constant column to the validation features, as was done for the training design matrix.
X_test_m2 = sm.add_constant(x_test)

In [None]:
# Keep exactly the columns the second OLS model was fitted on, in the same
# order as its design matrix (constant first). Selecting by the fitted design
# matrix replaces a hand-maintained list of columns to drop, stays correct if
# the feature selection changes, and makes the cell safe to re-run.
X_test_m2 = X_test_m2[x_train_lm2.columns]

In [None]:
# Predict imdb_score for the validation split with the second OLS model.
y_pred_m2 = lm2.predict(X_test_m2)

In [None]:
# R^2 of the second model on the held-out validation split.
r2_score(y_true = y_test, y_pred = y_pred_m2)

In [None]:
# Inspect the validation predictions.
y_pred_m2

## -------------- Now running prediction on the Input Test Data 

In [None]:
# Preview the processed competition test frame before predicting on it.
mov_test.head()

In [None]:
# Split the competition test frame by position: column 0 (expected to be the
# `s_no` identifier, which later becomes the submission index) is kept for the
# output, and columns 1..27 are the candidate features.
# Both pieces are explicit copies because later cells assign new values into
# them; assigning into a bare slice of mov_test triggers pandas'
# SettingWithCopyWarning.
x_test_given = mov_test.iloc[:, 1:28].copy()
y_pred1 = mov_test.iloc[:, 0:1].copy()
print(x_test_given.shape, y_pred1.shape)

In [None]:
# Rescale the competition test features with the scaler fitted on the training
# split (transform only).
num_vars = ['director_facebook_likes', 'actor_3_facebook_likes', 'actor_1_facebook_likes', 'gross', 'budget', 'title_year', 'actor_2_facebook_likes', 'aspect_ratio', 'movie_facebook_likes', 'Profit', 'Profit_Percentage', 'critic_review_ratio']
# x_test_given originates from a positional slice of mov_test; copy it so the
# assignment writes to an independent frame and does not trigger pandas'
# SettingWithCopyWarning.
x_test_given = x_test_given.copy()
x_test_given[num_vars] = scaler.transform(x_test_given[num_vars])


In [None]:
# Keep exactly the features the second OLS model was fitted on, in the same
# order as the training feature matrix. Selecting by name from the training
# features replaces a hand-maintained list of columns to drop: it does not fail
# when a dummy column is absent from (or extra in) the test data, and it
# guarantees the column order matches the fitted model.
x_test_given = x_test_given[x_train_new.columns]

In [None]:
# Add the constant column to the test features, as was done for the training design matrix.
x_test_given = sm.add_constant(x_test_given)

In [None]:
# Attach the predicted imdb_score to the identifier frame. `assign` returns a
# new frame, which avoids writing into what may be a slice of mov_test
# (SettingWithCopyWarning) and keeps the cell safe to re-run.
y_pred1 = y_pred1.assign(imdb_score=lm2.predict(x_test_given))

In [None]:
# Inspect the identifier / predicted-score frame before writing the submission.
y_pred1

In [None]:
# Index the predictions by serial number and write the submission file.
submission = y_pred1.set_index('s_no')
submission.to_csv('output_submission.csv')