# Import Data

In [1]:
#Import packages
import pandas as pd
import numpy as np

import re

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score

from sklearn.metrics import classification_report

from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'xgboost'

In [None]:
!pip install imblearn
from imblearn.over_sampling import SMOTE

In [None]:
imdb1 = pd.read_csv('IMDbScrapeFull.csv')
imdb2 = pd.read_csv('IMDbScrape2.csv')

In [None]:
imdbfull = pd.concat([imdb1,imdb2], axis=0)
movie = imdbfull.drop_duplicates()

In [None]:
movie

# Begin Cleaning
    - Drop domestic gross profit and use worldwide gross profit
    - Clean worldwide gross profit
    - Clean budget

In [None]:
if 'DomesticGross' in movie.columns:
    
    flag = True # used in next block to ensure rerunning block doesn't throw error
    
    movie = movie.iloc[89:,:] # drop movies made in 2021 since incomplete data
    movie = movie.drop(labels = ['DomesticGross'], axis = 1) # drop domestic, worldwide is more important
    movie['WorldwideGross'] = movie['WorldwideGross'].str.replace(',','')
    movie['Budget'] = movie['Budget'].str.replace(',','')
    movie['Budget'] = movie['Budget'].str.split(' ').str[0]
    movie = movie.dropna(subset = ['WorldwideGross'], axis = 0)
    movie = movie.reset_index(drop = True)

    for i in range(movie['WorldwideGross'].shape[0]): 
        if (movie['WorldwideGross'].loc[i] == 'Non') | (movie['WorldwideGross'].loc[i][0] == '<'):
            movie = movie.drop(index = i)
        else:
            continue
            
    movie = movie.reset_index(drop = True)

            
    for i in range(movie['Budget'].shape[0]): 
        if (movie['Budget'].loc[i] == 'Non') | (movie['Budget'].loc[i][0] == '<'):
            movie = movie.drop(index = i)
        else:
            continue
            
    movie = movie.reset_index(drop = True)
    
    

movie['WorldwideGross'] = movie['WorldwideGross'].astype(int)
movie['Budget'] = movie['Budget'].astype(int)
movie

# Organizing, reshuffling, renaming etc. 

In [None]:
if flag: 
    flag = False
    movie = movie.rename(columns = {'Release_Date': 'Year','ViewRating':'Rating'})
    movie['Revenue-Budget-Ratio'] = movie['WorldwideGross'] / movie['Budget']
    movie['Year'] = movie['Year'].astype(int)
    movie['Runtime'] = movie['Runtime'].astype(int)
    
    movie['Rating'] = movie['Rating'].fillna('Unknown')

# VISUALIZATION: 

print(movie['Revenue-Budget-Ratio'].describe()) # aggregate statistics
print('')
print('Median:',movie['Revenue-Budget-Ratio'].median()) 


target = movie['Revenue-Budget-Ratio']
bins = np.arange(0,20,1)
plt.figure(figsize = (10,6))
plt.hist(target, bins = bins)
plt.title('Distribution of Revenue-Budget-Ratio')
plt.xlabel('Revenue-Budget-Ratio')
plt.ylabel('Number of movies')

plt.figure(figsize = (10,6))
target.plot(kind = 'kde', color = 'orange' )
plt.title('KDE')
plt.xlabel('Revenue-Budget-Ratio')
plt.xlim(-1,20)

plt.figure(figsize = (10,6))
plt.boxplot(target)
plt.title('Boxplot')
plt.ylabel('Revenue-Budget-Ratio')
plt.ylim(-1,35)


# Distribution is clearly exponential or poisson - I THINK??
Analysis: 

This makes sense. A larger ratio is a financially successful movie. Many movies are not successful, and in fact for every ~10 movies only about 3 are successful, paying for the 7 unsuccessful ones. Subsequently, it makes perfect sense that the distribution of this revenue/budget ratio is skewed heavily towards the lower end. 

A ratio of 1 means that the company actually lost money on the movie, since revenue does not take into account costs, so the profit would be less than the budget (i.e. a loss). 

The median of the data is 1.29 and the mean is 2.48. Therefore, while we don't have a lot of data, it does seem to be representative of the overall larger distribution of movies. The median revenue/budget ratio is a financial loss, and the mean is a bit better since it includes the outliers (the successful movies). 

While there are a lot of outliers here (as seen on the boxplot), these are very imporatnt to include since ultimately when a movie studio makes a movie, they want to be the outlier themselves. (outlier = highly profitable)

In [None]:
movie = movie.drop(labels = ['Budget','WorldwideGross'], axis = 1) # dropping these since our target is a function of these two features

# Ordinal Encoding Ratings

In [None]:
for i in range(movie.shape[0]):
    if (movie['Rating'].loc[i]) == '':
        continue
    elif (movie['Rating'].loc[i]) == 'TV-PG':
        movie['Rating'].loc[i] = 'PG'
    elif (movie['Rating'].loc[i]) == 'Unrated':
        movie['Rating'].loc[i] = 'Not Rated'
    elif (movie['Rating'].loc[i]) == 'TV-14':
        movie['Rating'].loc[i] = 'PG-13'
    elif (movie['Rating'].loc[i]) == 'TV-MA':
        movie['Rating'].loc[i] = 'NC-17'
    elif (movie['Rating'].loc[i]) == 'X':
        movie['Rating'].loc[i] = 'NC-17'
    elif (movie['Rating'].loc[i]) == 'GP':
        movie['Rating'].loc[i] = 'PG'
    elif (movie['Rating'].loc[i]) == 'Approved':
        movie['Rating'].loc[i] = 'R'
        
# ordinal encoding myself

ratings = {'G':0.0, 'PG':1.0, 'PG-13':2.0,'R':3.0,'NC-17':4.0, 'Not Rated':5.0, 'Unknown':6.0} # dictionary mapping
movie['Rating'] = movie['Rating'].replace(ratings) # replace categories with ordinally encoded values

# One-Hot Encoding Genre

In [None]:
movie['Genres'] = movie['Genres'].str.split(' ').str[0] # grab first genre in list and assign this as the movie's single genre
movie = movie.rename(columns = {'Genres':'Genre'}) # rename to singular form
flag = True

In [None]:
if flag:
    flag = False
    enc = OneHotEncoder(handle_unknown='ignore') # one-hot encoder
    enc.fit(np.array(movie['Genre']).reshape(-1,1))

    columns = list(enc.get_feature_names())

    for i in range(len(columns)):
        columns[i] = columns[i].split('_')[-1]

    enc_vals = enc.transform(np.array(movie['Genre']).reshape(-1,1))
    enc_df = pd.DataFrame(enc_vals.toarray(), columns = columns)

    targets = movie['Revenue-Budget-Ratio']
    movie = pd.concat([movie.drop(labels = ['Revenue-Budget-Ratio','Genre'], axis = 1),enc_df,targets], axis = 1)


# Remove additional duplicates that Maria found

In [None]:
movie = movie[~movie.duplicated()]
movie = movie.reset_index(drop = True)

for i in range(movie.shape[0]):
    movie['Title'].loc[i] = re.sub(r'[^\w\s]','', movie['Title'].loc[i]) # strip inconsistent punctuation to make merging easier

movie # this is final, pre-merge movie dataframe

# Merge with cleaned book data from Miranda

In [None]:
book = pd.read_csv('mm_LibraryThingFull.csv')
book = book.drop(labels = ['Link1','Link2'],axis = 1)
book['SearchTerm'] = book['SearchTerm'].str.split(' \[').str[0]

book['Year'] = book['Year'].str.split('[').str[-1].str.split(' film').str[0]
book['author_x'] = book['author_x'].str.split('by ').str[-1]
book = book.rename(columns = {'SearchTerm':'movie_title','Year':'movie_year' ,'AdaptationOf':'book_title','published_x':'book_year','author_x':'author','series_x':'num_books_in_series','rating_x':'book_rating','charnum_x':'charnum','awardnum_x':'awardnum'})

books_and_movies = pd.merge(movie,book, how = 'inner', left_on = 'Title', right_on = 'movie_title')
books_and_movies = books_and_movies.drop_duplicates(subset = ['Title','Year']) # drop any duplicate entries in the dataframe
books_and_movies = books_and_movies.reset_index(drop = True)

test = pd.merge(movie,book, how = 'inner', left_on = 'Title', right_on = 'book_title')
test = test.drop_duplicates(subset = ['Title','Year']) # drop any duplicate entries in the dataframe
test = test.reset_index(drop = True)

books_and_movies

# Analysis of books_and_movies
- Data distribution still representative of overall distribution of movies, as you can tell by the shape of the histogram

In [None]:
books_and_movies = books_and_movies.drop(labels = ['movie_title','book_title','movie_year'], axis = 1)


target = books_and_movies['Revenue-Budget-Ratio']
print(target.describe())
print('')
print('Median:',target.median())

bins = np.arange(0,25,1)
plt.figure(figsize = (10,6))
plt.hist(target, bins = bins)


books_and_movies['book_rating'] = books_and_movies['book_rating'].fillna(books_and_movies['book_rating'].median())
books_and_movies['book_rating'] = (books_and_movies['book_rating'].astype(float) * -1)


# Import NYT Bestsellers

In [None]:
nytimes = pd.read_csv('NYCBestsellers.csv')
best_authors = nytimes['author'].drop_duplicates().reset_index(drop = True)
our_authors = books_and_movies['author'].drop_duplicates().reset_index(drop = True)

# best_authors
# our_authors
test = pd.merge(best_authors,our_authors, how = 'inner')

books_and_movies['NYTBestsellingAuthor'] = 0

for i in range(books_and_movies.shape[0]):
    if (books_and_movies['author'].loc[i] in list(test['author'])):
        books_and_movies['NYTBestsellingAuthor'].loc[i] = 1

# Select things made in or after 2008

In [None]:
books_and_movies = books_and_movies[books_and_movies['Year']>=2008]
books_and_movies

# Feature correlation analysis with target: 

In [None]:
targ = books_and_movies['Revenue-Budget-Ratio']
books_and_movies = pd.concat([targ,books_and_movies.drop(labels = ['Revenue-Budget-Ratio'], axis = 1)],axis = 1)
correlation = books_and_movies.corr()
correlation.iloc[1:,:1]

# Prepare for modeling

In [None]:
targets = books_and_movies['Revenue-Budget-Ratio']
books_and_movies = books_and_movies.drop(labels = ['Title','Description','Keywords','author','Revenue-Budget-Ratio'], axis = 1)

In [None]:
books_and_movies['num_books_in_series'] = books_and_movies['num_books_in_series'].fillna(1)
books_and_movies['charnum'] = books_and_movies['charnum'].fillna(books_and_movies['charnum'].median())
books_and_movies['awardnum'] = books_and_movies['awardnum'].fillna(0)
books_and_movies['book_year'] = books_and_movies['book_year'].fillna(books_and_movies['book_year'].median()) # this is a questionable decision?


# Convert continuous output to categorical output

In [None]:
X = books_and_movies
y = targets

scaler = StandardScaler()
X = scaler.fit_transform(X)


for i in range(len(y)):
    if (y.iloc[i] < 5):
        y.iloc[i] = 0
    else:
        y.iloc[i] = 1
        
print(y.value_counts())

X_train_full, X_test, y_train_full, y_test = train_test_split(X,y,stratify=y, test_size = 0.20)

plt.figure(figsize = (10,6))
plt.bar([0,1],height = [y.value_counts()[0.0],y.value_counts()[1.0]])
plt.xticks([0,1])
plt.title('Class Imbalance Visualization')

# Model creation / validation

In [None]:
def get_average_auroc(X,y,estimator,num_iter):
    
    auroc = []
    
    for n in range(num_iter):
        X_train, X_val, y_train, y_val = train_test_split(X,y,stratify = y,test_size = 0.20)
        
        sm = SMOTE(random_state = 42) # SMOTE
        X_train, y_train = sm.fit_resample(X_train, y_train)
        
        estimator.fit(X_train,y_train)
        preds = estimator.predict(X_val)
        auroc.append(roc_auc_score(y_val,preds))
    
    return np.round(np.mean(auroc),3)

In [None]:
lg = get_average_auroc(X_train_full,y_train_full,LogisticRegression(),100)
print('Average logistic regression AUROC:',lg)

mlp = get_average_auroc(X_train_full,y_train_full,MLPClassifier(),25)
print('Average mlp AUROC:',mlp)

rf = get_average_auroc(X_train_full,y_train_full,RandomForestClassifier(max_depth=4, n_estimators=100),25)
print('Average rf AUROC:',rf)

knn = get_average_auroc(X_train_full,y_train_full,KNeighborsClassifier(),100)
print('Average knn AUROC:',knn)

# We are choosing logistic regression as it is best performing model

# Evaluate final model performance on the test set

In [None]:
auroc = []

final_model = LogisticRegression()
final_model.fit(X_train_full,y_train_full)
preds = final_model.predict(X_test)
auroc.append(roc_auc_score(y_test,preds))
    
print('AUROC on single trial:',np.round(np.mean(auroc),3))

print(classification_report(y_test,preds))

auroc = []
precision = []

for _ in range(150):
    X_train_full, X_test, y_train_full, y_test = train_test_split(X,y,stratify=y, test_size = 0.20)
    final_model = LogisticRegression()
    final_model.fit(X_train_full,y_train_full)
    preds = final_model.predict(X_test)
    auroc.append(roc_auc_score(y_test,preds))
    precision.append(classification_report(y_test,preds, output_dict = True)['1.0']['precision'])

print('Average AUROC over 100 trials:',np.round(np.mean(auroc),3))
print('Average precision for 1.0 class over 100 trials:',np.round(np.mean(precision),3))



# Logistic Regression averaging 0.577 AUROC over 100 trials