### Selecting columns, viualizing

In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import itertools

In [2]:
data = pd.read_csv("../dat/data_clean_new.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110155 entries, 0 to 110154
Data columns (total 20 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   tconst                       110155 non-null  object 
 1   titleType                    110155 non-null  object 
 2   primaryTitle                 110155 non-null  object 
 3   originalTitle                110155 non-null  object 
 4   isAdult                      110155 non-null  int64  
 5   startYear                    110155 non-null  object 
 6   endYear                      110155 non-null  object 
 7   runtimeMinutes               110155 non-null  object 
 8   genres                       110155 non-null  object 
 9   averageRating                110155 non-null  float64
 10  numVotes                     110155 non-null  int64  
 11  Budget                       26636 non-null   float64
 12  Gross US & Canada            18139 non-null   float64
 13 

In [4]:
data["Critic reviews"] = data["Critic reviews"].fillna(0)
data["User reviews"] = data["User reviews"].fillna(0)

data["isAdult2"] = data.apply(lambda row: int("Adult" in row["genres"]), axis=1)
print(len(data[data["isAdult"] != data["isAdult2"]]))  # We use the one based on the genres

# tconst was only required for joins
# titleType is only films for us, we filtered them
# we do not use the titles as predictors
# endYear is None for all films
# isAdult will be added back in a consistent format later on
data = data.drop(columns=["tconst", "titleType", "primaryTitle", "originalTitle", "endYear", "isAdult", "isAdult2"])

52


In [5]:
genre_list = data["genres"].unique().tolist()
for i, entry in enumerate(genre_list):
    genre_list[i] = entry.split(",")

genre_set = set(itertools.chain(*genre_list))
print(genre_set)
# Here short is a weird category, as there is also a short category in the titleType column, where we only
# kept films. Here the IMDb database is a bit inconsistent.

{'Family', 'Thriller', 'Music', 'Documentary', 'Biography', '\\N', 'Adventure', 'Horror', 'News', 'Crime', 'Drama', 'Mystery', 'Adult', 'Western', 'Reality-TV', 'Sport', 'Musical', 'Talk-Show', 'Sci-Fi', 'Action', 'Short', 'Romance', 'Film-Noir', 'Fantasy', 'Comedy', 'War', 'Animation', 'History'}


In [6]:
for genre in genre_set:
    print(genre, end=" ")
    data[f"is{genre}"] = data.apply(lambda row: int(genre in row["genres"]), axis=1)

Family Thriller Music Documentary Biography \N Adventure Horror News Crime Drama Mystery Adult Western Reality-TV Sport Musical Talk-Show Sci-Fi Action Short Romance Film-Noir Fantasy Comedy War Animation History 

In [7]:
for genre in genre_set:
    print(genre, data[f"is{genre}"].sum())

Family 4924
Thriller 13390
Music 5055
Documentary 9102
Biography 4000
\N 45
Adventure 8810
Horror 11282
News 152
Crime 13252
Drama 55816
Mystery 6228
Adult 842
Western 2141
Reality-TV 8
Sport 1679
Musical 2314
Talk-Show 1
Sci-Fi 4113
Action 14954
Short 4
Romance 16347
Film-Noir 700
Fantasy 4486
Comedy 34408
War 2741
Animation 2603
History 3462


In [8]:
# We drop categories that do not have enough samples to be representative
# We draw the line at 100
# We also drop writers and directors. These are interesting features,
# but having them as binary columns would be infeasible.
# Genres are added as binary predictors, thus the genres column is no longer used.
data = data.drop(columns=["isShort", "isReality-TV", "is\\N", "isTalk-Show", "directors", "writers", "genres"])

In [9]:
for rating in data.Rating.unique():
    print(rating, end=" ")
    data[f"is{rating}"] = data.apply(lambda row: int(row["Rating"] == rating), axis=1)

Not Rated nan Unrated TV-PG Passed Approved TV-14 TV-G PG-13 G TV-MA PG Open GP R M/PG TV-Y7 M X TV-13 TV-Y7-FV NC-17 AO (Banned) E 12 TV-Y 18 E10+ MA-17 

In [10]:
for rating in data.Rating.unique():
    print(rating, data[f"is{rating}"].sum())
    
# Not needed anymore, we have the binary version
data = data.drop(columns=["Rating"])

Not Rated 17643
nan 0
Unrated 2354
TV-PG 711
Passed 3356
Approved 4590
TV-14 1414
TV-G 238
PG-13 4854
G 1187
TV-MA 1724
PG 4469
Open 4
GP 185
R 15231
M/PG 53
TV-Y7 56
M 78
X 693
TV-13 3
TV-Y7-FV 9
NC-17 60
AO 1
(Banned) 1
E 1
12 1
TV-Y 17
18 1
E10+ 2
MA-17 1


In [11]:
# Again, we drop ratings that do not have enough samples to be representative
# We draw the line at 100
data = data.drop(columns=[
    "isnan", "isOpen", "isM/PG", "isTV-Y7", "isM", "isTV-13",
    "isTV-Y7-FV", "isNC-17", "isAO", "is(Banned)", "isE",
    "is12", "isTV-Y", "is18", "isE10+", "isMA-17"]
)

In [12]:
def clean_unknowns(row, column):
    if row[column] == "\\N":
        return None
    else:
        return row[column]

def clean_reviews(row, column):
    if isinstance(row[column], str) and "K" in row[column]:
        # print(row[column], end=" -> ")
        if "." in row[column]:
            # print(int(row[column][:-3]) * 1000 + int(row[column][-2]) * 100)
            return int(row[column][:-3]) * 1000 + int(row[column][-2]) * 100
        else:
            # print(int(row[column][:-1]) * 1000)
            return int(row[column][:-1]) * 1000
    else:
        return row[column]

# Just an example of problematic data types
# print("Problematic form")
# print(data.startYear.unique())
# print(data.runtimeMinutes.unique())
# print(data["User reviews"].unique())
# print(data["Critic reviews"].unique())

data["startYear"] = data.apply(lambda row: clean_unknowns(row, "startYear"), axis=1)
data["runtimeMinutes"] = data.apply(lambda row: clean_unknowns(row, "runtimeMinutes"), axis=1)
data["User reviews"] = data.apply(lambda row: clean_reviews(row, "User reviews"), axis=1)
data["Critic reviews"] = data.apply(lambda row: clean_reviews(row, "Critic reviews"), axis=1)

for column in ["startYear", "runtimeMinutes", "User reviews", "Critic reviews"]:
    data[column] = pd.to_numeric(data[column])

# print("Resolved form")
# print(data.startYear.unique())
# print(data.runtimeMinutes.unique())
# print(data["User reviews"].unique())
# print(data["Critic reviews"].unique())

In [13]:
print("All features present:")
filtered = data.dropna()
print(len(filtered))
print()

print("Opening weekend not present:")
filtered2 = data.drop(columns=["Opening weekend US & Canada"]).dropna()
print(len(filtered2))
print()

print("Opening weekend + Gross not present:")
filtered3 = data.drop(columns=["Opening weekend US & Canada", "Gross US & Canada"]).dropna()
print(len(filtered3))
print()

All features present:
8073

Opening weekend not present:
8620

Opening weekend + Gross not present:
13624



In [14]:
filtered.corr()

Unnamed: 0,startYear,runtimeMinutes,averageRating,numVotes,Budget,Gross US & Canada,Opening weekend US & Canada,Gross worldwide,User reviews,Critic reviews,...,isApproved,isTV-14,isTV-G,isPG-13,isG,isTV-MA,isPG,isGP,isR,isX
startYear,1.0,0.001003,-0.08072,0.04243,0.0149,-0.179838,0.072059,-0.03676,0.12859,0.35217,...,-0.132782,0.032058,0.014647,0.117452,-0.11165,0.044498,-0.128233,-0.050251,-0.066547,-0.01621
runtimeMinutes,0.001003,1.0,0.337309,0.300885,0.304281,0.194497,0.170161,0.245731,0.297022,0.229912,...,0.049866,-0.028034,-0.006791,0.068264,-0.110018,0.016865,-0.08811,0.12478,0.013923,0.003302
averageRating,-0.08072,0.337309,1.0,0.408141,0.074693,0.203811,0.089628,0.213532,0.268807,0.33186,...,0.034135,0.026646,0.005729,-0.068296,0.035584,-0.006205,-0.056195,0.020887,0.065647,0.01044
numVotes,0.04243,0.300885,0.408141,1.0,0.466785,0.509877,0.53977,0.595695,0.739408,0.614465,...,0.005069,-0.019538,-0.011442,0.117439,0.021562,-0.023005,-0.025498,-0.00762,0.0125,-0.000122
Budget,0.0149,0.304281,0.074693,0.466785,1.0,0.486335,0.709083,0.61211,0.471817,0.43152,...,0.00289,-0.031423,-0.017393,0.252904,0.075439,-0.035387,0.13647,0.000345,-0.182905,0.000374
Gross US & Canada,-0.179838,0.194497,0.203811,0.509877,0.486335,1.0,0.639254,0.913438,0.423301,0.322847,...,0.128511,-0.018916,-0.010181,0.107749,0.139601,-0.025607,0.122569,-0.007094,-0.130442,-0.001229
Opening weekend US & Canada,0.072059,0.170161,0.089628,0.53977,0.709083,0.639254,1.0,0.734415,0.554511,0.48481,...,0.003987,-0.022199,-0.01197,0.231436,0.040319,-0.030013,0.090342,-0.008373,-0.154345,-0.001679
Gross worldwide,-0.03676,0.245731,0.213532,0.595695,0.61211,0.913438,0.734415,1.0,0.520211,0.425544,...,0.055115,-0.018238,-0.009868,0.145622,0.102943,-0.011154,0.104445,-0.006932,-0.150617,-0.003081
User reviews,0.12859,0.297022,0.268807,0.739408,0.471817,0.423301,0.554511,0.520211,1.0,0.615209,...,0.000998,-0.020127,-0.012103,0.143163,-0.020022,-0.017723,-0.048276,-0.007274,0.01467,7.6e-05
Critic reviews,0.35217,0.229912,0.33186,0.614465,0.43152,0.322847,0.48481,0.425544,0.615209,1.0,...,-0.006163,-0.028731,-0.020148,0.152053,-0.033897,-0.031785,-0.083308,-0.01035,0.070951,0.002136


In [15]:
filtered3.isAdult.unique()

array([0], dtype=int64)