In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score


LOADING DATA

In [4]:
# Load the dataset CSV into a DataFrame.
# Forward slashes replace the original backslash path: in a normal (non-raw)
# string "\D", "\d" and "\m" are invalid escape sequences that trigger a
# SyntaxWarning, and backslash separators only resolve on Windows. A
# forward-slash relative path works on Windows and POSIX alike.
movies_df = pd.read_csv('./Dataset/data/merged_dataset_cleaned.csv')

  movies_df = pd.read_csv('.\Dataset\data\merged_dataset_cleaned.csv')


In [5]:
# Dimensions (rows, columns) of the freshly loaded frame.
movies_df.shape

(1118499, 22)

In [15]:
# Preview the first rows to sanity-check how the columns were parsed.
movies_df.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,...,original_title,popularity,genres,production_companies,production_countries,spoken_languages,keywords,directors,writers,primary_director
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,160000000,...,Inception,83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",Christopher Nolan,Christopher Nolan,Christopher Nolan
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,165000000,...,Interstellar,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan",Christopher Nolan
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,185000000,...,The Dark Knight,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",Christopher Nolan
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,237000000,...,Avatar,79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",James Cameron,James Cameron,James Cameron
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,220000000,...,The Avengers,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",Joss Whedon,"Joss Whedon, Zak Penn",Joss Whedon


In [6]:
# Count missing values per column to decide how to handle nulls.
movies_df.isna().sum()

id                           0
title                       13
vote_average                 0
vote_count                   0
status                       0
release_date            178749
revenue                      0
runtime                      0
adult                        0
budget                       0
imdb_id                 520576
original_language            0
original_title              13
popularity                   0
genres                  445365
production_companies    611555
production_countries    492098
spoken_languages        473656
keywords                813093
directors               854197
writers                 879182
primary_director        854197
dtype: int64

DATA PREPROCESSING

In [7]:
# Keep only rows that have a value in every column.
# NOTE(review): this also discards rows whose only missing values are in
# columns that are dropped further down (e.g. imdb_id) -- confirm that this
# loss of rows is intended.
movies_df = movies_df.dropna()

In [10]:
# Rows/columns remaining after removing every row with a missing value;
# the author judges this to still be enough data to move ahead.
movies_df.shape

(82703, 22)

In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
movies_df.describe()

Unnamed: 0,id,vote_average,vote_count,revenue,runtime,budget,popularity
count,82703.0,82703.0,82703.0,82703.0,82703.0,82703.0,82703.0
mean,281081.7,5.185295,230.055935,8360483.0,92.847333,3083847.0,5.890876
std,267136.3,2.309306,1156.86833,58681790.0,30.269645,15521160.0,24.375741
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,57238.0,4.6645,2.0,0.0,82.0,0.0,1.162
50%,212836.0,5.833,9.0,0.0,93.0,0.0,2.253
75%,421814.0,6.647,49.0,0.0,105.0,0.0,6.038
max,1370747.0,10.0,34495.0,2923706000.0,1265.0,460000000.0,2994.357


In [13]:
# Column dtypes and non-null counts after the null rows were removed.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82703 entries, 0 to 1118455
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    82703 non-null  int64  
 1   title                 82703 non-null  object 
 2   vote_average          82703 non-null  float64
 3   vote_count            82703 non-null  int64  
 4   status                82703 non-null  object 
 5   release_date          82703 non-null  object 
 6   revenue               82703 non-null  int64  
 7   runtime               82703 non-null  int64  
 8   adult                 82703 non-null  bool   
 9   budget                82703 non-null  int64  
 10  imdb_id               82703 non-null  object 
 11  original_language     82703 non-null  object 
 12  original_title        82703 non-null  object 
 13  popularity            82703 non-null  float64
 14  genres                82703 non-null  object 
 15  production_companies  

In [14]:
# Some other preprocessing and type conversions are required before we can proceed with the model building process. We will convert the genre column into a list of genres and then convert them into dummy variables. We will also convert the rating column into a binary column where 1 indicates a hit and 0 indicates a flop.
# Other preprocessing steps:
# 1. Remove the dollar sign in Budget and convert the column to a numerical field.
# 2. Convert the release date to a datetime field.
# 3. Convert the runtime to a time field (minutes).
# 4. Convert rating count to a numerical field.
# 5. Make bins of the rating field to create the target variable field of HIT, AVERAGE, FLOP.
# 6. Remove the unnamed field from the beginning of the dataset.

# and name the final dataset as movies_cleaned.csv

## But, before all this, we will remove the non-significant columns from the dataset.

We drop the movie title, original title, ID, imdb_id, status and directors (keeping only the primary director), since these should not affect whether a movie is a hit, average or flop.

NOTE: This assumption might be wrong, or some extra features could be removed as well. 
We will update the features as we move ahead.

In [None]:
# Remove identifier/metadata columns that will not be used as features.
# errors='ignore' makes the cell safe to re-run: once the columns are gone a
# second execution would otherwise raise KeyError. The result is reassigned
# instead of using inplace=True, and the redundant axis=1 is omitted because
# `columns=` already names the axis.
# NOTE(review): release_date is dropped here although the preprocessing plan
# lists converting it to datetime -- confirm which is intended.
movies_df = movies_df.drop(
    columns=['imdb_id', 'original_title', 'release_date', 'status', 'id'],
    errors='ignore',
)

In [21]:
# Remove the title column as well; it is not used as a feature.
# errors='ignore' keeps the cell re-runnable (no KeyError when the column has
# already been removed); the redundant axis=1 is omitted because `columns=`
# already names the axis.
movies_df = movies_df.drop(columns=['title'], errors='ignore')

In [22]:
# Preview the frame again to confirm the dropped columns are gone.
movies_df.head()

Unnamed: 0,vote_average,vote_count,revenue,runtime,adult,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,keywords,directors,writers,primary_director
0,8.364,34495,825532764,148,False,160000000,en,83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",Christopher Nolan,Christopher Nolan,Christopher Nolan
1,8.417,32571,701729206,169,False,165000000,en,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan",Christopher Nolan
2,8.512,30619,1004558444,152,False,185000000,en,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",Christopher Nolan
3,7.573,29815,2923706026,162,False,237000000,en,79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",James Cameron,James Cameron,James Cameron
4,7.71,29166,1518815515,143,False,220000000,en,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",Joss Whedon,"Joss Whedon, Zak Penn",Joss Whedon


In [23]:
# Re-check the remaining columns, their dtypes and non-null counts.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82703 entries, 0 to 1118455
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   vote_average          82703 non-null  float64
 1   vote_count            82703 non-null  int64  
 2   revenue               82703 non-null  int64  
 3   runtime               82703 non-null  int64  
 4   adult                 82703 non-null  bool   
 5   budget                82703 non-null  int64  
 6   original_language     82703 non-null  object 
 7   popularity            82703 non-null  float64
 8   genres                82703 non-null  object 
 9   production_companies  82703 non-null  object 
 10  production_countries  82703 non-null  object 
 11  spoken_languages      82703 non-null  object 
 12  keywords              82703 non-null  object 
 13  directors             82703 non-null  object 
 14  writers               82703 non-null  object 
 15  primary_director      

In [25]:
# The data objects are properly formatted now.
# We can move forward with the visualisation of the data.

DATA VISUALISATION