In [1]:
#Importing necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import re

In [2]:
#Loading Movies Dataset
data = pd.read_csv("movies_metadata.csv",usecols=['id','genres','budget','revenue', 'release_date'])
new_cols = ['id','genres','budget','revenue', 'release_date']
data=data[new_cols]
data

Unnamed: 0,id,genres,budget,revenue,release_date
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",30000000,373554033.0,1995-10-30
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",65000000,262797249.0,1995-12-15
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",0,0.0,1995-12-22
3,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",16000000,81452156.0,1995-12-22
4,11862,"[{'id': 35, 'name': 'Comedy'}]",0,76578911.0,1995-02-10
...,...,...,...,...,...
45461,439050,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",0,0.0,
45462,111109,"[{'id': 18, 'name': 'Drama'}]",0,0.0,2011-11-17
45463,67758,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",0,0.0,2003-08-01
45464,227506,[],0,0.0,1917-10-21


In [3]:
#Identifying Null Values
data.isna().sum()

id               0
genres           0
budget           0
revenue          6
release_date    87
dtype: int64

In [4]:
#Eliminating Null Values and duplicates
data = data.dropna().reset_index(drop=True)
data = data.drop_duplicates().reset_index(drop=True)

In [5]:
#Cleaning the genres column and keeping only one primary genre
keywords_path = "/kaggle/input/the-movies-dataset/keywords.csv" #loading keywords path for cleaning the genres
genres = data[["id", "genres"]]
genres["genres"] = genres["genres"].apply(lambda x: re.findall(r"'name': '(.*?)'}", x))
genres = genres.explode("genres")
genres.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres["genres"] = genres["genres"].apply(lambda x: re.findall(r"'name': '(.*?)'}", x))


Unnamed: 0,id,genres
0,862,Animation
0,862,Comedy
0,862,Family
1,8844,Adventure
1,8844,Fantasy


In [6]:
data.drop('genres', axis=1, inplace=True)#dropping the initial genres column in the original data
data.head()

Unnamed: 0,id,budget,revenue,release_date
0,862,30000000,373554033.0,1995-10-30
1,8844,65000000,262797249.0,1995-12-15
2,15602,0,0.0,1995-12-22
3,31357,16000000,81452156.0,1995-12-22
4,11862,0,76578911.0,1995-02-10


In [7]:
data = data.merge(genres,on='id')#merging the cleaned genres column with original data
data

Unnamed: 0,id,budget,revenue,release_date,genres
0,862,30000000,373554033.0,1995-10-30,Animation
1,862,30000000,373554033.0,1995-10-30,Comedy
2,862,30000000,373554033.0,1995-10-30,Family
3,8844,65000000,262797249.0,1995-12-15,Adventure
4,8844,65000000,262797249.0,1995-12-15,Fantasy
...,...,...,...,...,...
93336,67758,0,0.0,2003-08-01,Action
93337,67758,0,0.0,2003-08-01,Drama
93338,67758,0,0.0,2003-08-01,Thriller
93339,227506,0,0.0,1917-10-21,


In [8]:
data['month'] = pd.DatetimeIndex(data['release_date']).month#cleaning the release date column and keeoing only month 
data

Unnamed: 0,id,budget,revenue,release_date,genres,month
0,862,30000000,373554033.0,1995-10-30,Animation,10
1,862,30000000,373554033.0,1995-10-30,Comedy,10
2,862,30000000,373554033.0,1995-10-30,Family,10
3,8844,65000000,262797249.0,1995-12-15,Adventure,12
4,8844,65000000,262797249.0,1995-12-15,Fantasy,12
...,...,...,...,...,...,...
93336,67758,0,0.0,2003-08-01,Action,8
93337,67758,0,0.0,2003-08-01,Drama,8
93338,67758,0,0.0,2003-08-01,Thriller,8
93339,227506,0,0.0,1917-10-21,,10


In [9]:
data.drop('release_date', axis=1, inplace=True)#dropping the unwanted original release date column
data.head()

Unnamed: 0,id,budget,revenue,genres,month
0,862,30000000,373554033.0,Animation,10
1,862,30000000,373554033.0,Comedy,10
2,862,30000000,373554033.0,Family,10
3,8844,65000000,262797249.0,Adventure,12
4,8844,65000000,262797249.0,Fantasy,12


In [10]:
data.drop(data.index[data['revenue'] == 0], inplace=True)#eliminating zeores in revenue column
data.drop(data.index[data['budget'] == 0], inplace=True) #eliminating zeroes in budget column

In [11]:
print(data.dtypes)#checking the datatypes of the columns

id          object
budget      object
revenue    float64
genres      object
month        int64
dtype: object


In [12]:
data['budget'] = data['budget'].astype(int)#converting budget to int
data['profit'] = data.revenue - data.budget#calculating proft
data

Unnamed: 0,id,budget,revenue,genres,month,profit
0,862,30000000,373554033.0,Animation,10,343554033.0
1,862,30000000,373554033.0,Comedy,10,343554033.0
2,862,30000000,373554033.0,Family,10,343554033.0
3,8844,65000000,262797249.0,Adventure,12,197797249.0
4,8844,65000000,262797249.0,Fantasy,12,197797249.0
...,...,...,...,...,...,...
93234,63281,2000000,1268793.0,Drama,9,-731207.0
93251,63898,5000000,1413000.0,Action,9,-3587000.0
93252,63898,5000000,1413000.0,Comedy,9,-3587000.0
93253,63898,5000000,1413000.0,Crime,9,-3587000.0


In [13]:
data['success_percentage'] = data.profit/data.budget*100#calculating success percentage
data

Unnamed: 0,id,budget,revenue,genres,month,profit,success_percentage
0,862,30000000,373554033.0,Animation,10,343554033.0,1145.18011
1,862,30000000,373554033.0,Comedy,10,343554033.0,1145.18011
2,862,30000000,373554033.0,Family,10,343554033.0,1145.18011
3,8844,65000000,262797249.0,Adventure,12,197797249.0,304.30346
4,8844,65000000,262797249.0,Fantasy,12,197797249.0,304.30346
...,...,...,...,...,...,...,...
93234,63281,2000000,1268793.0,Drama,9,-731207.0,-36.56035
93251,63898,5000000,1413000.0,Action,9,-3587000.0,-71.74000
93252,63898,5000000,1413000.0,Comedy,9,-3587000.0,-71.74000
93253,63898,5000000,1413000.0,Crime,9,-3587000.0,-71.74000


In [14]:
#assigning a success index value based on success percentage
data.loc[data['success_percentage'].between(-100, 0), 'sidx_range'] = 1
data.loc[data['success_percentage'].between(0, 100), 'sidx_range'] = 2
data.loc[data['success_percentage'].between(100, 200), 'sidx_range'] = 3
data.loc[data['success_percentage'].between(200, 300), 'sidx_range'] = 4
data.loc[data['success_percentage'].between(300, 400), 'sidx_range'] = 5
data.loc[data['success_percentage'].between(400, 500), 'sidx_range'] = 6
data.loc[data['success_percentage'].between(500, 600), 'sidx_range'] = 7
data.loc[data['success_percentage'].between(600, 700), 'sidx_range'] = 8
data.loc[data['success_percentage'].between(700, 800), 'sidx_range'] = 9
data.loc[data['success_percentage'].between(800, 900), 'sidx_range'] = 10
data.loc[data['success_percentage'].between(900, 1000), 'sidx_range'] = 11
data.loc[data['success_percentage'].between(1000, 1500), 'sidx_range'] = 12
data.loc[data['success_percentage'].between(1500, 2000), 'sidx_range'] = 13
data.loc[data['success_percentage'].between(2000, 2500), 'sidx_range'] = 14
data.loc[data['success_percentage'].between(2500, 3000), 'sidx_range'] = 15
data.loc[data['success_percentage'].between(3000, 3500), 'sidx_range'] = 16
data.loc[data['success_percentage'].between(3500, 4000), 'sidx_range'] = 17
data.loc[data['success_percentage'].between(4000, 4500), 'sidx_range'] = 18
data.loc[data['success_percentage'].between(5000, 5500), 'sidx_range'] = 19
data.loc[data['success_percentage'].between(5500, 20000), 'sidx_range'] = 20
data

Unnamed: 0,id,budget,revenue,genres,month,profit,success_percentage,sidx_range
0,862,30000000,373554033.0,Animation,10,343554033.0,1145.18011,12.0
1,862,30000000,373554033.0,Comedy,10,343554033.0,1145.18011,12.0
2,862,30000000,373554033.0,Family,10,343554033.0,1145.18011,12.0
3,8844,65000000,262797249.0,Adventure,12,197797249.0,304.30346,5.0
4,8844,65000000,262797249.0,Fantasy,12,197797249.0,304.30346,5.0
...,...,...,...,...,...,...,...,...
93234,63281,2000000,1268793.0,Drama,9,-731207.0,-36.56035,1.0
93251,63898,5000000,1413000.0,Action,9,-3587000.0,-71.74000,1.0
93252,63898,5000000,1413000.0,Comedy,9,-3587000.0,-71.74000,1.0
93253,63898,5000000,1413000.0,Crime,9,-3587000.0,-71.74000,1.0


In [15]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)#replacing infinite values with NaN
data.dropna(inplace=True)#dropping NaN values
data

Unnamed: 0,id,budget,revenue,genres,month,profit,success_percentage,sidx_range
0,862,30000000,373554033.0,Animation,10,343554033.0,1145.18011,12.0
1,862,30000000,373554033.0,Comedy,10,343554033.0,1145.18011,12.0
2,862,30000000,373554033.0,Family,10,343554033.0,1145.18011,12.0
3,8844,65000000,262797249.0,Adventure,12,197797249.0,304.30346,5.0
4,8844,65000000,262797249.0,Fantasy,12,197797249.0,304.30346,5.0
...,...,...,...,...,...,...,...,...
93234,63281,2000000,1268793.0,Drama,9,-731207.0,-36.56035,1.0
93251,63898,5000000,1413000.0,Action,9,-3587000.0,-71.74000,1.0
93252,63898,5000000,1413000.0,Comedy,9,-3587000.0,-71.74000,1.0
93253,63898,5000000,1413000.0,Crime,9,-3587000.0,-71.74000,1.0


In [17]:
#converting success index into a human language and labelling
data["labels"] = data["sidx_range"].map({1: "There are very high chances for the movie to be a disaster", 
                                    2: "Break even is assured", 
                                    3: "Decent box office hit with assured profits",
                                    4: "There are chances that the movie would be a super hit",
                                    5: "Very high chances to be a Super duper hit",
                                    6: "The movie would be Block Buster hit",
                                    7: "Assured Block buster",
                                    8: "Assured Block buster",
                                    9: "Assured Mega Block buster",
                                    10: "Assured Mega Block buster",
                                    11: "This could break few of the industry collection records",
                                    12: "This could break few of the industry collection records" ,
                                    13: "This could break few of the industry collection records",
                                    14: "This could break few of the industry collection records",
                                    15: "This could break few of the industry collection records",
                                    16: "World wide hit with record breaking collections",
                                    17: "World wide hit with record breaking collections",
                                    18: "World wide hit with record breaking collections",
                                    19: "World wide hit with record breaking collections",
                                    20: "You would create a world sensation with magnanimous collections"})
print(data.head())

     id    budget      revenue     genres  month       profit  \
0   862  30000000  373554033.0  Animation     10  343554033.0   
1   862  30000000  373554033.0     Comedy     10  343554033.0   
2   862  30000000  373554033.0     Family     10  343554033.0   
3  8844  65000000  262797249.0  Adventure     12  197797249.0   
4  8844  65000000  262797249.0    Fantasy     12  197797249.0   

   success_percentage  sidx_range  \
0          1145.18011        12.0   
1          1145.18011        12.0   
2          1145.18011        12.0   
3           304.30346         5.0   
4           304.30346         5.0   

                                              labels  
0  This could break few of the industry collectio...  
1  This could break few of the industry collectio...  
2  This could break few of the industry collectio...  
3          Very high chances to be a Super duper hit  
4          Very high chances to be a Super duper hit  


In [18]:
data['month'] = data['month'].astype(str)#converting month column to object for merging
#merging genres and month column and making it as a input for the predictor model
data["X"] = data['genres'] +"-"+ data["month"]
print(data)

          id    budget      revenue     genres month       profit  \
0        862  30000000  373554033.0  Animation    10  343554033.0   
1        862  30000000  373554033.0     Comedy    10  343554033.0   
2        862  30000000  373554033.0     Family    10  343554033.0   
3       8844  65000000  262797249.0  Adventure    12  197797249.0   
4       8844  65000000  262797249.0    Fantasy    12  197797249.0   
...      ...       ...          ...        ...   ...          ...   
93234  63281   2000000    1268793.0      Drama     9    -731207.0   
93251  63898   5000000    1413000.0     Action     9   -3587000.0   
93252  63898   5000000    1413000.0     Comedy     9   -3587000.0   
93253  63898   5000000    1413000.0      Crime     9   -3587000.0   
93254  63898   5000000    1413000.0    Foreign     9   -3587000.0   

       success_percentage  sidx_range  \
0              1145.18011        12.0   
1              1145.18011        12.0   
2              1145.18011        12.0   
3      

In [19]:
#applying count vectorizer and using decision tree classifier for the prediction model
x = np.array(data["X"])
y = np.array(data["labels"])

cv = CountVectorizer()
X = cv.fit_transform(x) # Fit the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)

In [20]:
#giving a sample and predicting the outcome
sample = "Animation-12"#sample should be in genre-month number format
data = cv.transform([sample]).toarray()
print(clf.predict(data))

['There are very high chances for the movie to be a disaster']


In [21]:
sample = "Animation-11"
data = cv.transform([sample]).toarray()
print(clf.predict(data))

['Decent box office hit with assured profits']


In [22]:
sample = "Family-1"
data = cv.transform([sample]).toarray()
print(clf.predict(data))

['Break even is assured']


In [23]:
sample = "Family-11"
data = cv.transform([sample]).toarray()
print(clf.predict(data))

['Decent box office hit with assured profits']
