### Selecting columns, viualizing

In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import itertools

In [2]:
data = pd.read_csv("../dat/data_clean_new.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273543 entries, 0 to 273542
Data columns (total 20 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   tconst                       273543 non-null  object 
 1   titleType                    273543 non-null  object 
 2   primaryTitle                 273543 non-null  object 
 3   originalTitle                273543 non-null  object 
 4   isAdult                      273543 non-null  int64  
 5   startYear                    273543 non-null  object 
 6   endYear                      273543 non-null  object 
 7   runtimeMinutes               273543 non-null  object 
 8   genres                       273543 non-null  object 
 9   averageRating                273543 non-null  float64
 10  numVotes                     273543 non-null  int64  
 11  Budget                       49881 non-null   float64
 12  Gross US & Canada            18982 non-null   float64
 13 

In [4]:
data["Critic reviews"] = data["Critic reviews"].fillna(0)
data["User reviews"] = data["User reviews"].fillna(0)

data["isAdult2"] = data.apply(lambda row: int("Adult" in row["genres"]), axis=1)
print(len(data[data["isAdult"] != data["isAdult2"]]))  # We use the one based on the genres

# tconst was only required for joins
# titleType is only films for us, we filtered them
# we do not use the titles as predictors
# endYear is None for all films
# isAdult will be added back in a consistent format later on

# We drop writers and directors. These are interesting features,
# but having them as binary columns would be infeasible.
data = data.drop(columns=[
    "tconst", "titleType", "primaryTitle", "originalTitle", "endYear",
    "isAdult", "isAdult2", "Gross US & Canada", "Opening weekend US & Canada",
    "writers", "directors"])
data = data.dropna()

318


In [5]:
genre_list = data["genres"].unique().tolist()
for i, entry in enumerate(genre_list):
    genre_list[i] = entry.split(",")

genre_set = set(itertools.chain(*genre_list))
print(genre_set)

{'Horror', 'Music', 'Drama', 'Biography', 'Action', 'History', 'Thriller', 'Mystery', 'Sci-Fi', 'Musical', 'Crime', 'Fantasy', 'News', 'Film-Noir', 'Adventure', 'Family', 'Animation', 'Western', 'Documentary', 'Romance', 'Comedy', 'Sport', 'War'}


In [6]:
# News - History - Biography - Documentary --> Documentary
# Film-Noir - Crime --> Crime
# Western - Action --> Action
genre_set.difference_update(["News", "History", "Biography", "Film-Noir", "Western"])
transformation_dict = {
    "Documentary":  ["News", "History", "Biography", "Documentary"],
    "Crime": ["Film-Noir", "Crime"],
    "Action": ["Western", "Action"]
}
for genre in genre_set:
    print(genre, end=" ")
    if genre not in transformation_dict:
        transformation_dict[genre] = [genre]
    data[f"is{genre}"] = data.apply(lambda row: int(any(g in row["genres"] for g in transformation_dict[genre])), axis=1)

Horror Music Drama Action Thriller Mystery Sci-Fi Musical Crime Fantasy Adventure Family Animation Documentary Romance Comedy Sport War 

In [7]:
results = []
for genre in genre_set:
    results.append(data[f"is{genre}"].sum())

sum_results = sum(results)
for genre in genre_set:
    print(genre, data[f"is{genre}"].sum() / sum_results * 100, "% - ", data[f"is{genre}"].sum())

Horror 4.348486574759789 % -  1145
Music 1.7659792639854164 % -  465
Drama 21.920929702631877 % -  5772
Action 9.843910219892901 % -  2592
Thriller 6.63096730090008 % -  1746
Mystery 3.4901826744141884 % -  919
Sci-Fi 2.529338042611371 % -  666
Musical 0.5089058524173028 % -  134
Crime 7.569025103490183 % -  1993
Fantasy 2.8179712126390943 % -  742
Adventure 6.7107212031445815 % -  1767
Family 2.2938741407466483 % -  604
Animation 1.993847556112567 % -  525
Documentary 4.564961452280582 % -  1202
Romance 6.737305837226083 % -  1774
Comedy 14.488625574417988 % -  3815
Sport 0.9190687782461737 % -  242
War 0.8658995100831719 % -  228


In [8]:
# Genres are added as binary predictors, thus the genres column is no longer used.
data = data.drop(columns=["genres"])  # "isMusical", "isFilm-Noir", "isNews", "isSport", "genres"])

def unrated_to_not_rated(row):
    if row["Rating"] == "Unrated":
        return "Not Rated"
    else:
        return row["Rating"]

data["Rating"] = data.apply(unrated_to_not_rated, axis=1)

In [9]:
data[f"isRated"] = data.apply(lambda row: int(row["Rating"] != "Not Rated"), axis=1)

In [10]:
data = data.drop(columns=["Rating"])

In [11]:
def clean_unknowns(row, column):
    if row[column] == "\\N":
        return None
    else:
        return row[column]

def clean_reviews(row, column):
    if isinstance(row[column], str) and "K" in row[column]:
        # print(row[column], end=" -> ")
        if "." in row[column]:
            # print(int(row[column][:-3]) * 1000 + int(row[column][-2]) * 100)
            return int(row[column][:-3]) * 1000 + int(row[column][-2]) * 100
        else:
            # print(int(row[column][:-1]) * 1000)
            return int(row[column][:-1]) * 1000
    else:
        return row[column]

# Just an example of problematic data types
# print("Problematic form")
# print(data.startYear.unique())
# print(data.runtimeMinutes.unique())
# print(data["User reviews"].unique())
# print(data["Critic reviews"].unique())

data["startYear"] = data.apply(lambda row: clean_unknowns(row, "startYear"), axis=1)
data["runtimeMinutes"] = data.apply(lambda row: clean_unknowns(row, "runtimeMinutes"), axis=1)
data["User reviews"] = data.apply(lambda row: clean_reviews(row, "User reviews"), axis=1)
data["Critic reviews"] = data.apply(lambda row: clean_reviews(row, "Critic reviews"), axis=1)

for column in ["startYear", "runtimeMinutes", "User reviews", "Critic reviews"]:
    data[column] = pd.to_numeric(data[column])

# print("Resolved form")
# print(data.startYear.unique())
# print(data.runtimeMinutes.unique())
# print(data["User reviews"].unique())
# print(data["Critic reviews"].unique())

In [12]:
data["isFlop"] = (data["Gross worldwide"] < data["Budget"]).astype(float)
data = data.drop(columns=["Gross worldwide", "Budget"])
filtered = data.dropna()  # data.dropna()
print(len(filtered))
print()

10714



In [13]:
test_indices = np.random.choice(len(filtered), replace=False, size=int(len(filtered) / 10))
test_set = filtered.iloc[test_indices]
test_set, test_targets = test_set.drop("isFlop", axis=1).to_numpy(), test_set["isFlop"].to_numpy()
train_set = filtered.iloc[~test_indices]
train_set, train_targets = train_set.drop("isFlop", axis=1).to_numpy(), train_set["isFlop"].to_numpy()

In [14]:
filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10714 entries, 1031 to 273540
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   startYear       10714 non-null  int64  
 1   runtimeMinutes  10714 non-null  float64
 2   averageRating   10714 non-null  float64
 3   numVotes        10714 non-null  int64  
 4   Critic reviews  10714 non-null  float64
 5   User reviews    10714 non-null  float64
 6   isHorror        10714 non-null  int64  
 7   isMusic         10714 non-null  int64  
 8   isDrama         10714 non-null  int64  
 9   isAction        10714 non-null  int64  
 10  isThriller      10714 non-null  int64  
 11  isMystery       10714 non-null  int64  
 12  isSci-Fi        10714 non-null  int64  
 13  isMusical       10714 non-null  int64  
 14  isCrime         10714 non-null  int64  
 15  isFantasy       10714 non-null  int64  
 16  isAdventure     10714 non-null  int64  
 17  isFamily        10714 non-n

In [15]:
# from sklearn.linear_model import LinearRegression

# lr = LinearRegression().fit(train_set, train_targets)
# print(lr.predict(train_set) * 9 + 1)
# print(train_targets * 9 + 1)
# # print(lr.score(test_set, test_targets))

### Logistic Regression, BCE

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import ToTensor

test_set = torch.from_numpy(test_set)
test_set_normalized = (test_set - test_set.mean(dim=0, keepdims=True)) / test_set.std(dim=0, keepdims=True)
test_set_normalized = torch.nan_to_num(test_set_normalized, nan=0)
test_targets = torch.from_numpy(test_targets)

train_set = torch.from_numpy(train_set)
train_set_normalized = (train_set - train_set.mean(dim=0, keepdims=True)) / train_set.std(dim=0, keepdims=True)
train_set_normalized = torch.nan_to_num(train_set_normalized, nan=0)
train_targets = torch.from_numpy(train_targets)

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(25, 1, dtype=torch.double)

    def forward(self, x):
        return torch.sigmoid(self.layer1(x))

In [17]:
loss_fn = nn.BCELoss()

model = Model()
opt = torch.optim.SGD(model.parameters(), lr=0.001)

for epoch in range(2000):
    pred = model(train_set_normalized)
    loss = loss_fn(pred.squeeze(), train_targets)
    if epoch % 100 == 99:
        print(f"Epoch {epoch + 1}, Training Loss (BCE) {loss.item():.4f}", end=", ")
        with torch.no_grad():
            pred = model(test_set_normalized)
            binary_pred = pred.round().squeeze()
            acc = (binary_pred == test_targets).sum() / len(test_targets)
            
            val_loss = loss_fn(pred.squeeze(), test_targets)
            print(f"Validation Loss (BCE) {val_loss.item():.4f} Accuracy {acc.item()}")
        print()

    opt.zero_grad()
    loss.backward()
    opt.step()

Epoch 100, Training Loss (BCE) 0.6762, Validation Loss (BCE) 0.6582 Accuracy 0.597572386264801

Epoch 200, Training Loss (BCE) 0.6674, Validation Loss (BCE) 0.6504 Accuracy 0.608776867389679

Epoch 300, Training Loss (BCE) 0.6593, Validation Loss (BCE) 0.6433 Accuracy 0.6181139349937439

Epoch 400, Training Loss (BCE) 0.6520, Validation Loss (BCE) 0.6369 Accuracy 0.6237161755561829

Epoch 500, Training Loss (BCE) 0.6453, Validation Loss (BCE) 0.6310 Accuracy 0.6321195363998413

Epoch 600, Training Loss (BCE) 0.6391, Validation Loss (BCE) 0.6256 Accuracy 0.6535947918891907

Epoch 700, Training Loss (BCE) 0.6334, Validation Loss (BCE) 0.6206 Accuracy 0.6591970324516296

Epoch 800, Training Loss (BCE) 0.6281, Validation Loss (BCE) 0.6160 Accuracy 0.6657329797744751

Epoch 900, Training Loss (BCE) 0.6233, Validation Loss (BCE) 0.6118 Accuracy 0.6685341000556946

Epoch 1000, Training Loss (BCE) 0.6188, Validation Loss (BCE) 0.6079 Accuracy 0.6676003932952881

Epoch 1100, Training Loss (BCE)

### Large Model

In [19]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(25, 25, dtype=torch.double)
        self.layer2 = nn.Linear(25, 25, dtype=torch.double)
        self.layer3 = nn.Linear(25, 1, dtype=torch.double)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return torch.sigmoid(self.layer3(x))

In [20]:
loss_fn = nn.BCELoss()

model = Model()
opt = torch.optim.SGD(model.parameters(), lr=0.001)

for epoch in range(2000):
    pred = model(train_set_normalized)
    loss = loss_fn(pred.squeeze(), train_targets)
    if epoch % 100 == 99:
        print(f"Epoch {epoch + 1}, Training Loss (BCE) {loss.item():.4f}", end=", ")
        with torch.no_grad():
            pred = model(test_set_normalized)
            binary_pred = pred.round().squeeze()
            acc = (binary_pred == test_targets).sum() / len(test_targets)
            
            val_loss = loss_fn(pred.squeeze(), test_targets)
            print(f"Validation Loss (BCE) {val_loss.item():.4f} Accuracy {acc.item()}")
        print()

    opt.zero_grad()
    loss.backward()
    opt.step()

Epoch 100, Training Loss (BCE) 0.7042, Validation Loss (BCE) 0.7107 Accuracy 0.5004668831825256

Epoch 200, Training Loss (BCE) 0.7031, Validation Loss (BCE) 0.7093 Accuracy 0.5004668831825256

Epoch 300, Training Loss (BCE) 0.7020, Validation Loss (BCE) 0.7081 Accuracy 0.5004668831825256

Epoch 400, Training Loss (BCE) 0.7010, Validation Loss (BCE) 0.7070 Accuracy 0.5004668831825256

Epoch 500, Training Loss (BCE) 0.7001, Validation Loss (BCE) 0.7059 Accuracy 0.5004668831825256

Epoch 600, Training Loss (BCE) 0.6992, Validation Loss (BCE) 0.7049 Accuracy 0.5004668831825256

Epoch 700, Training Loss (BCE) 0.6983, Validation Loss (BCE) 0.7040 Accuracy 0.5004668831825256

Epoch 800, Training Loss (BCE) 0.6976, Validation Loss (BCE) 0.7031 Accuracy 0.5004668831825256

Epoch 900, Training Loss (BCE) 0.6968, Validation Loss (BCE) 0.7023 Accuracy 0.5004668831825256

Epoch 1000, Training Loss (BCE) 0.6961, Validation Loss (BCE) 0.7015 Accuracy 0.5004668831825256

Epoch 1100, Training Loss (BC