<a href="https://colab.research.google.com/github/NIkson14/Study/blob/main/Nikita/Copy_of_movie_success_prediction_system_nikita.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Success Prediction System
The main aim of this project is to predict the potential success or failure of a movie based on a wide range of features.

Our main dataset comes from IMDb itself, which provides open access to its non-commercial datasets at

https://developer.imdb.com/non-commercial-datasets/

In this project we will be going through every step of the Machine Learning Pipeline, from the loading and cleaning up of our dataset to training and tuning our model to make predictions on new data.

## Data Preparation

Our data comes directly from IMDb and consists of movie information ranging from as early as the 1800s to upcoming movies in the near future.

We have written a separate script that builds a single dataset from 6 of the 7 datasets published by IMDb. We now focus on loading that dataset and cleaning it up further for use with our Machine Learning models.

In [1]:
import warnings, requests, gzip
warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *


from joblib import Parallel, delayed
from tqdm import tqdm
from math import floor, ceil
import os, pickle

from sklearn.cluster import KMeans
from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder

%matplotlib inline

In [2]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime so the dataset CSV stored there
# can be read like a local file.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the pre-built IMDb movie dataset from the mounted Google Drive folder.
dataset_csv_path = "/content/drive/MyDrive/drive-download-20231202T144505Z-001/imdb_movie_dataset.csv"
movie_dataset = pd.read_csv(dataset_csv_path)

From the dataset that was created, we now drop the 'production_designer' column, as more than 400k of its values were null. We also impute the values in the 'Short' column based on runtime, since the general standard for a short movie is a runtime of up to 50 minutes. Next, we remove the '\N' placeholder string from the region values so that they can be split into multiple columns.

In [4]:
# Drop the mostly-empty production_designer column. errors='ignore' keeps the cell
# re-runnable: a second run no longer raises KeyError once the column is gone.
movie_dataset = movie_dataset.drop(columns='production_designer', errors='ignore')

# Flag short films: 1 when the runtime is 50 minutes or less, 0 otherwise.
# NOTE(review): rows with runtimeMinutes == 0 (which look like missing runtimes in the
# displayed sample) are therefore also flagged as short - confirm this is intended.
movie_dataset['Short'] = np.where(movie_dataset['runtimeMinutes'] > 50, 0, 1)

# Strip the '\N' placeholder from the comma-separated region codes.
region = movie_dataset['region'].convert_dtypes(convert_string=True)
region = region.str.replace(r"\N,", '', regex=False)
region = region.str.replace(r",\N", '', regex=False)
# A value that is only the placeholder (or collapses to it, e.g. '\N,\N') contains no
# comma any more, so the two literal replacements above cannot remove it; blank it here.
region = region.str.replace(r"^\\N$", '', regex=True)
movie_dataset['region'] = region
movie_dataset

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,isAdult,releaseYear,runtimeMinutes,Action,Adult,Adventure,Animation,...,actress,cinematographer,composer,director,editor,producer,self,writer,averageRating,numVotes
0,0,tt0000502,Bohemios,False,1905,100,0,0,0,0,...,,,,nm0063413,,,,"nm0675388,nm0063413,nm0657268",4.1,15
1,1,tt0000574,The Story of the Kelly Gang,False,1906,70,1,0,1,0,...,nm0846887,nm0675239,nm2421834,nm0846879,,"nm0317210,nm0425854,nm0846911",,nm0846879,6.0,855
2,2,tt0000591,The Prodigal Son,False,1907,90,0,0,0,0,...,"nm1323543,nm1759558",,,nm0141150,,,,nm0141150,5.0,21
3,3,tt0000615,Robbery Under Arms,False,1907,0,0,0,0,0,...,nm0218953,nm0167619,,nm0533958,,,,"nm0533958,nm0092809",4.3,25
4,4,tt0000630,Hamlet,False,1908,0,0,0,0,0,...,nm0624446,,,nm0143333,,nm0209738,,nm0000636,2.9,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440879,570470,tt9916538,Kuambil Lagi Hatiku,False,2019,123,0,0,0,0,...,"nm8678236,nm1417182,nm1266058",,nm4700236,nm4457074,,nm1290982,,"nm4900525,nm4843252,nm2679404",8.6,7
440880,570471,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,False,2015,57,0,0,0,0,...,,"nm9272492,nm9272489,nm8349149,nm9275317",,"nm9272491,nm9272490",,,"nm10538557,nm10538558,nm10538556","nm9272491,nm9272490",0.0,0
440881,570472,tt9916680,De la ilusión al desconcierto: cine colombiano...,False,2007,100,0,0,0,0,...,,"nm10538579,nm10538578,nm10538577",,nm0652213,nm4762061,,"nm0033355,nm0127882,nm0133349,nm10503634","nm0652213,nm10538576",0.0,0
440882,570474,tt9916730,6 Gunn,False,2017,116,0,0,0,0,...,,nm1957275,,nm10538612,nm9785908,"nm10538614,nm10538613",,nm10538612,7.6,11


In [5]:
# Replace missing regions with an empty string. The result is assigned back to the
# column instead of calling fillna(..., inplace=True) on movie_dataset["region"]:
# that chained in-place form is deprecated in pandas 2.x and does not update the
# DataFrame when Copy-on-Write is enabled.
movie_dataset["region"] = movie_dataset["region"].fillna("")
# Sanity check: number of missing regions left (expected to be 0).
movie_dataset["region"].isna().sum()

0

In [6]:
def check_region(region, target):
    """Return 1 if ``target`` is one of the region codes in ``region``, else 0.

    Parameters
    ----------
    region : str or other container
        Comma-separated region codes (e.g. ``"US,AU"``). Non-string containers
        are searched as-is; values that do not support membership tests
        (``None``, NaN, ``pd.NA``) yield 0.
    target : str
        The region code to look for.

    Returns
    -------
    int
        1 when ``target`` matches a whole code, 0 otherwise.
    """
    if isinstance(region, str):
        # Compare against whole codes: a plain substring test would also match a
        # longer code that merely contains the target (e.g. 'AU' inside 'XAU').
        codes = [code.strip() for code in region.split(',')]
    else:
        codes = region
    try:
        return 1 if target in codes else 0
    except TypeError:
        # Missing values (None / NaN / pd.NA) are not iterable.
        return 0

In [7]:
# Regions that get a dedicated 0/1 indicator column; the list order fixes the column order.
# NOTE(review): every displayed row has region_UK == 0 - the source data may encode the
# United Kingdom as 'GB' rather than 'UK'; confirm against the actual region values.
tracked_regions = ['US', 'UK', 'AU', 'IN', 'JP']
for code in tracked_regions:
    movie_dataset[f'region_{code}'] = movie_dataset["region"].apply(lambda x: check_region(x, code))

# region_other is 1 when the movie lists at least one region code outside tracked_regions.
# The region string has to be split into codes first: iterating over the string itself
# yields single characters, which never equal a two-letter code, so every non-empty
# region was flagged as "other". Empty tokens and the '\N' placeholder are not regions.
movie_dataset['region_other'] = movie_dataset['region'].apply(
    lambda x: isinstance(x, str) and any(
        code.strip() not in tracked_regions
        for code in x.split(',')
        if code.strip() not in ('', r'\N')
    )
).astype(int)
movie_dataset

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,isAdult,releaseYear,runtimeMinutes,Action,Adult,Adventure,Animation,...,self,writer,averageRating,numVotes,region_US,region_UK,region_AU,region_IN,region_JP,region_other
0,0,tt0000502,Bohemios,False,1905,100,0,0,0,0,...,,"nm0675388,nm0063413,nm0657268",4.1,15,0,0,0,0,0,1
1,1,tt0000574,The Story of the Kelly Gang,False,1906,70,1,0,1,0,...,,nm0846879,6.0,855,1,0,1,0,0,1
2,2,tt0000591,The Prodigal Son,False,1907,90,0,0,0,0,...,,nm0141150,5.0,21,1,0,0,0,0,1
3,3,tt0000615,Robbery Under Arms,False,1907,0,0,0,0,0,...,,"nm0533958,nm0092809",4.3,25,0,0,1,0,0,1
4,4,tt0000630,Hamlet,False,1908,0,0,0,0,0,...,,nm0000636,2.9,27,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440879,570470,tt9916538,Kuambil Lagi Hatiku,False,2019,123,0,0,0,0,...,,"nm4900525,nm4843252,nm2679404",8.6,7,0,0,0,0,0,1
440880,570471,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,False,2015,57,0,0,0,0,...,"nm10538557,nm10538558,nm10538556","nm9272491,nm9272490",0.0,0,0,0,0,0,0,1
440881,570472,tt9916680,De la ilusión al desconcierto: cine colombiano...,False,2007,100,0,0,0,0,...,"nm0033355,nm0127882,nm0133349,nm10503634","nm0652213,nm10538576",0.0,0,0,0,0,0,0,1
440882,570474,tt9916730,6 Gunn,False,2017,116,0,0,0,0,...,,nm10538612,7.6,11,0,0,0,1,0,1


In [8]:
# Summarise column dtypes and non-null counts to check for remaining missing values
# (movie_dataset.isna().sum() would give the per-column null counts instead).
movie_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440884 entries, 0 to 440883
Data columns (total 53 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       440884 non-null  int64  
 1   tconst           440884 non-null  object 
 2   primaryTitle     440884 non-null  object 
 3   isAdult          440884 non-null  bool   
 4   releaseYear      440884 non-null  int64  
 5   runtimeMinutes   440884 non-null  int64  
 6   Action           440884 non-null  int64  
 7   Adult            440884 non-null  int64  
 8   Adventure        440884 non-null  int64  
 9   Animation        440884 non-null  int64  
 10  Biography        440884 non-null  int64  
 11  Comedy           440884 non-null  int64  
 12  Crime            440884 non-null  int64  
 13  Documentary      440884 non-null  int64  
 14  Drama            440884 non-null  int64  
 15  Family           440884 non-null  int64  
 16  Fantasy          440884 non-null  int6

Next, we need to split the actors into multiple columns, one person ID per column. We also need to do this for actresses, writers, cinematographers, self, producers, composers, etc.

In [9]:
# Split the comma-separated actor IDs into one column per actor and keep the first five.
# Selecting by position (instead of renaming ten columns and dropping actors_6..actors_10)
# does not depend on how many columns the split happens to produce: missing columns are
# added as NaN and any extra ones are discarded.
max_actors = 5
actor_split = (
    movie_dataset['actor']
    .str.split(',', expand=True)
    .reindex(columns=range(max_actors))
)
actor_split.columns = [f'actors_{position}' for position in range(1, max_actors + 1)]
actor_split

Unnamed: 0,actors_1,actors_2,actors_3,actors_4,actors_5
0,nm0215752,nm0252720,,,
1,nm0846894,nm1431224,nm3002376,,
2,nm0906197,nm0332182,,,
3,nm3071427,nm0581353,nm0888988,nm0240418,nm0346387
4,,,,,
...,...,...,...,...,...
440879,nm10041459,,,,
440880,nm9272513,,,,
440881,,,,,
440882,nm6096005,nm0059461,nm13233318,nm4852679,


In [10]:
# Split the comma-separated actress IDs into one column per actress and keep the first five.
# Selecting by position (instead of renaming ten columns and dropping actress_6..actress_10)
# does not depend on how many columns the split happens to produce: missing columns are
# added as NaN and any extra ones are discarded.
max_actresses = 5
actress_split = (
    movie_dataset['actress']
    .str.split(',', expand=True)
    .reindex(columns=range(max_actresses))
)
actress_split.columns = [f'actress_{position}' for position in range(1, max_actresses + 1)]
actress_split

Unnamed: 0,actress_1,actress_2,actress_3,actress_4,actress_5
0,,,,,
1,nm0846887,,,,
2,nm1323543,nm1759558,,,
3,nm0218953,,,,
4,nm0624446,,,,
...,...,...,...,...,...
440879,nm8678236,nm1417182,nm1266058,,
440880,,,,,
440881,,,,,
440882,,,,,


In [11]:
# Keep at most the first three writers per movie, one person ID per column.
writer_column_names = {0: 'writer_1', 1: 'writer_2', 2: 'writer_3'}
writer_split = (
    movie_dataset['writer']
    .str.split(',', expand=True)
    .iloc[:, :3]
    .rename(columns=writer_column_names)
)
writer_split

Unnamed: 0,writer_1,writer_2,writer_3
0,nm0675388,nm0063413,nm0657268
1,nm0846879,,
2,nm0141150,,
3,nm0533958,nm0092809,
4,nm0000636,,
...,...,...,...
440879,nm4900525,nm4843252,nm2679404
440880,nm9272491,nm9272490,
440881,nm0652213,nm10538576,
440882,nm10538612,,


In [12]:
# Keep at most the first three cinematographers per movie, one person ID per column.
cinematographer_column_names = {0: 'cinematographer_1', 1: 'cinematographer_2', 2: 'cinematographer_3'}
cinema_split = (
    movie_dataset['cinematographer']
    .str.split(',', expand=True)
    .iloc[:, :3]
    .rename(columns=cinematographer_column_names)
)
cinema_split

Unnamed: 0,cinematographer_1,cinematographer_2,cinematographer_3
0,,,
1,nm0675239,,
2,,,
3,nm0167619,,
4,,,
...,...,...,...
440879,,,
440880,nm9272492,nm9272489,nm8349149
440881,nm10538579,nm10538578,nm10538577
440882,nm1957275,,


In [13]:
# Merge the main dataset with the four split frames on their shared index (inner join).
split_frames = [actor_split, actress_split, writer_split, cinema_split]
# Drop split columns left over from a previous run of this cell, so that re-running it
# does not merge them a second time and produce suffixed duplicates
# (e.g. actors_1_x / actors_1_y).
split_columns = [column for frame in split_frames for column in frame.columns]
merged = movie_dataset.drop(columns=split_columns, errors='ignore')
for frame in split_frames:
    merged = merged.merge(frame, left_index=True, right_index=True)
# Rebind only after every merge succeeded, so a failure leaves movie_dataset untouched.
movie_dataset = merged
movie_dataset

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,isAdult,releaseYear,runtimeMinutes,Action,Adult,Adventure,Animation,...,actress_2,actress_3,actress_4,actress_5,writer_1,writer_2,writer_3,cinematographer_1,cinematographer_2,cinematographer_3
0,0,tt0000502,Bohemios,False,1905,100,0,0,0,0,...,,,,,nm0675388,nm0063413,nm0657268,,,
1,1,tt0000574,The Story of the Kelly Gang,False,1906,70,1,0,1,0,...,,,,,nm0846879,,,nm0675239,,
2,2,tt0000591,The Prodigal Son,False,1907,90,0,0,0,0,...,nm1759558,,,,nm0141150,,,,,
3,3,tt0000615,Robbery Under Arms,False,1907,0,0,0,0,0,...,,,,,nm0533958,nm0092809,,nm0167619,,
4,4,tt0000630,Hamlet,False,1908,0,0,0,0,0,...,,,,,nm0000636,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440879,570470,tt9916538,Kuambil Lagi Hatiku,False,2019,123,0,0,0,0,...,nm1417182,nm1266058,,,nm4900525,nm4843252,nm2679404,,,
440880,570471,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,False,2015,57,0,0,0,0,...,,,,,nm9272491,nm9272490,,nm9272492,nm9272489,nm8349149
440881,570472,tt9916680,De la ilusión al desconcierto: cine colombiano...,False,2007,100,0,0,0,0,...,,,,,nm0652213,nm10538576,,nm10538579,nm10538578,nm10538577
440882,570474,tt9916730,6 Gunn,False,2017,116,0,0,0,0,...,,,,,nm10538612,,,nm1957275,,
