In [305]:
#ALL IMPORTS
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [306]:
data = pd.read_csv("movie_metadata.csv", header=0)

In [307]:
data.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [308]:
data.shape

(5043, 28)

# PREPROCESSING DATASET

Dropping Duplicate Records

In [309]:
data.duplicated().any()

True

In [310]:
data=data.drop_duplicates()
data.shape

(4998, 28)

Dropping Correlated Columns

In [311]:
data.drop(["director_name","actor_2_name","genres","movie_title","actor_1_name","actor_3_name","language","country","plot_keywords","movie_imdb_link","cast_total_facebook_likes"],axis=1, inplace = True)

In [312]:
data.shape

(4998, 17)

HANDLING NULL VALUES

In [313]:
data.isnull().sum()

color                       19
num_critic_for_reviews      49
duration                    15
director_facebook_likes    103
actor_3_facebook_likes      23
actor_1_facebook_likes       7
gross                      874
num_voted_users              0
facenumber_in_poster        13
num_user_for_reviews        21
content_rating             301
budget                     487
title_year                 107
actor_2_facebook_likes      13
imdb_score                   0
aspect_ratio               327
movie_facebook_likes         0
dtype: int64

In [314]:
data = data.dropna()

In [315]:
data.shape

(3738, 17)

In [316]:
data.isnull().sum()

color                      0
num_critic_for_reviews     0
duration                   0
director_facebook_likes    0
actor_3_facebook_likes     0
actor_1_facebook_likes     0
gross                      0
num_voted_users            0
facenumber_in_poster       0
num_user_for_reviews       0
content_rating             0
budget                     0
title_year                 0
actor_2_facebook_likes     0
imdb_score                 0
aspect_ratio               0
movie_facebook_likes       0
dtype: int64

Converting Categorical Values

In [317]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3738 entries, 0 to 5042
Data columns (total 17 columns):
color                      3738 non-null object
num_critic_for_reviews     3738 non-null float64
duration                   3738 non-null float64
director_facebook_likes    3738 non-null float64
actor_3_facebook_likes     3738 non-null float64
actor_1_facebook_likes     3738 non-null float64
gross                      3738 non-null float64
num_voted_users            3738 non-null int64
facenumber_in_poster       3738 non-null float64
num_user_for_reviews       3738 non-null float64
content_rating             3738 non-null object
budget                     3738 non-null float64
title_year                 3738 non-null float64
actor_2_facebook_likes     3738 non-null float64
imdb_score                 3738 non-null float64
aspect_ratio               3738 non-null float64
movie_facebook_likes       3738 non-null int64
dtypes: float64(13), int64(2), object(2)
memory usage: 525.7+ KB


In [318]:
data["gross"].value_counts()

8000000.0      3
177343675.0    3
218051260.0    3
26505000.0     2
10499968.0     2
              ..
28747570.0     1
102055.0       1
25863915.0     1
92969824.0     1
162.0          1
Name: gross, Length: 3653, dtype: int64

Preparing Target Variables for Classification

Converting IMDB SCORE into 3 classes

In [319]:
data["imdb_score"].unique()

array([7.9, 7.1, 6.8, 8.5, 6.6, 6.2, 7.8, 7.5, 6.9, 6.1, 6.7, 7.3, 6.5,
       7.2, 8.1, 7. , 7.7, 8.2, 5.9, 6. , 5.7, 6.4, 6.3, 5.6, 8.3, 8. ,
       8.4, 5.8, 5.4, 9. , 4.8, 5.2, 7.6, 5.5, 8.6, 8.8, 5.1, 7.4, 4.2,
       5. , 4.9, 3.7, 5.3, 4.3, 3.8, 4.4, 3.3, 2.2, 8.9, 8.7, 4.6, 2.4,
       3.4, 4.1, 4.7, 4.5, 3. , 3.6, 3.5, 4. , 2.7, 9.3, 2.9, 2.8, 3.9,
       2.3, 1.9, 3.1, 1.6, 2.5, 2.1, 9.2, 2. , 3.2])

In [320]:
data["imdb_score"] = data["imdb_score"].apply(float)

In [321]:
data.loc[data['imdb_score'].between(8,10), 'imdb_score'] = 100.0
data.loc[data['imdb_score'].between(5,7.99), 'imdb_score'] = 50.0
data.loc[data['imdb_score'].between(0,4.992), 'imdb_score'] = 30.0
data["imdb_score"] = data["imdb_score"].apply(str)
data.loc[data['imdb_score'] == "100.0", 'imdb_score'] = "BEST"
data.loc[data['imdb_score'] == "50.0", 'imdb_score'] = "AVERAGE"
data.loc[data['imdb_score'] == "30.0", 'imdb_score'] = "BAD"

In [322]:
data["imdb_score"].unique()

array(['AVERAGE', 'BEST', 'BAD'], dtype=object)

Converting Content Ratings into 3 classes

In [323]:
ratings = data["content_rating"].unique()
ratings = np.delete(ratings, [0,3])
ratings

array(['PG', 'G', 'Approved', 'NC-17', 'X', 'Not Rated', 'Unrated', 'M',
       'GP', 'Passed'], dtype=object)

In [324]:
data["content_rating"].value_counts()

R            1696
PG-13        1295
PG            564
G              87
Not Rated      34
Unrated        23
Approved       17
X              10
NC-17           6
Passed          3
M               2
GP              1
Name: content_rating, dtype: int64

In [325]:
for rate in ratings:
    data.loc[data['content_rating'] == rate, 'content_rating'] = "Other"

In [326]:
data["content_rating"].value_counts()

R        1696
PG-13    1295
Other     747
Name: content_rating, dtype: int64

Converting gross into 2 classes

In [327]:
data["gross"].value_counts()

8000000.0      3
177343675.0    3
218051260.0    3
26505000.0     2
10499968.0     2
              ..
28747570.0     1
102055.0       1
25863915.0     1
92969824.0     1
162.0          1
Name: gross, Length: 3653, dtype: int64

In [328]:
data.loc[data['gross'].between(0,30000000.0), 'gross'] = 100.0
data.loc[data['gross'].between(3000000.0,762000000.0), 'gross'] = 30.0

In [329]:
data["gross"].value_counts()

100.0    1873
30.0     1865
Name: gross, dtype: int64

In [330]:
data.loc[data['gross'] == 100.0, 'gross'] = "Above Average"
data.loc[data['gross'] == 30.0, 'gross'] = "Below AVerage"

In [331]:
data["gross"].value_counts()

Above Average    1873
Below AVerage    1865
Name: gross, dtype: int64

In [333]:
target1 = data["imdb_score"]
target2 = data["content_rating"]
target3 = data["gross"]

In [334]:
X = data.drop(["imdb_score", "content_rating", "gross"],axis=1)