In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import os

In [2]:
netflix = pd.read_csv('../data/processed/final_df_all_media.csv')
votes = pd.read_csv('../data/interim/showsWithPrincipalVotes.csv')
netflix = netflix.merge(votes[['combined_title', 'director_votes', 'writer_votes', 'actor_votes']], on='combined_title')

In [3]:
from sklearn.model_selection import train_test_split

movies = netflix[~netflix['is_tv_show']]

drop_cols = ['combined_title', 'release_date', 'lower_title', 'season', 'is_tv_show', 'tconst', 'director_rank', 'writer_rank', 'actor_rank']

X_train, X_test, y_train, y_test = train_test_split(movies.drop(drop_cols + ['is_top10'], axis=1), movies['is_top10'], test_size=0.2, random_state=42, stratify=movies['is_top10'])

In [4]:
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

cvec = CountVectorizer(min_df=0.01)

cvec.fit(train_data['genres'])

train_data = pd.concat([train_data.drop(columns=['genres']), pd.DataFrame(cvec.transform(train_data['genres']).todense(), columns="genre_"+ cvec.get_feature_names_out())], axis=1)
test_data = pd.concat([test_data.drop(columns=['genres']), pd.DataFrame(cvec.transform(test_data['genres']).todense(), columns="genre_"+ cvec.get_feature_names_out())], axis=1)

In [5]:
#Try lemmatizing and BOW
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
        
class StemTokenizer(object):
    def __init__(self):
        self.stemmer = SnowballStemmer('english')
    def __call__(self, articles):
        return [self.stemmer.stem(t) for t in word_tokenize(articles)]


tf_vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(),
                       strip_accents = 'unicode',
                       stop_words = 'english',
                       lowercase = True,
                        min_df=0.015
                    )
tf_vectorizer.fit(train_data['summary'])


train_data = pd.concat([train_data, pd.DataFrame(tf_vectorizer.transform(train_data['summary']).todense(), columns="summary_"+ tf_vectorizer.get_feature_names_out())], axis=1)
test_data = pd.concat([test_data, pd.DataFrame(tf_vectorizer.transform(test_data['summary']).todense(), columns="summary_"+ tf_vectorizer.get_feature_names_out())], axis=1)



In [8]:
save_path = '../models/agModels-predicttop10_votes'

predictor = TabularPredictor(label='is_top10', path=save_path, eval_metric='roc_auc').fit(train_data.drop(columns=['numVotes', 'averageRating']), presets='best_quality', time_limit=3600)
#predictor = TabularPredictor.load(save_path)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "../models/agModels-predicttop10_votes/"
AutoGluon Version:  0.6.0
Python Version:     3.9.15
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Aug 7 18:08:02 UTC 2019
Train Data Rows:    1192
Train Data Columns: 37
Label Column: is_top10
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1.0, 0.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...


In [10]:
predictor.feature_metadata.get_features()

['release_year',
 'runtimeMinutes',
 'release_date_quarter',
 'release_date_month',
 'release_date_week',
 'director_rank',
 'writer_rank',
 'actor_rank',
 'genre_action',
 'genre_adventure',
 'genre_animation',
 'genre_biography',
 'genre_comedy',
 'genre_crime',
 'genre_documentary',
 'genre_drama',
 'genre_family',
 'genre_fantasy',
 'genre_fi',
 'genre_history',
 'genre_horror',
 'genre_music',
 'genre_musical',
 'genre_mystery',
 'genre_romance',
 'genre_sci',
 'genre_short',
 'genre_sport',
 'genre_thriller',
 'titleType',
 'nudity',
 'violence',
 'profanity',
 'alcohol',
 'frightening',
 'language',
 'summary',
 'summary.char_count',
 'summary.word_count',
 'summary.capital_ratio',
 'summary.lower_ratio',
 'summary.digit_ratio',
 'summary.special_ratio',
 'summary.symbol_count.?',
 'summary.symbol_ratio.?',
 'summary.symbol_count..',
 'summary.symbol_ratio..',
 'summary.symbol_count.:',
 'summary.symbol_ratio.:',
 'summary.symbol_ratio. ',
 'summary.symbol_count.-',
 'summary.sy

In [7]:
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L1,0.840791,0.810295,1.608817,0.322723,26.647933,1.608817,0.322723,26.647933,1,True,7
1,WeightedEnsemble_L2,0.825521,0.823407,3.341426,1.545058,108.311694,0.017689,0.001154,0.599551,2,True,14
2,XGBoost_BAG_L1,0.809777,0.799611,0.443781,0.127034,5.713532,0.443781,0.127034,5.713532,1,True,11
3,LightGBM_BAG_L1,0.807528,0.795923,0.256027,0.133713,14.242337,0.256027,0.133713,14.242337,1,True,4
4,RandomForestGini_BAG_L1,0.801136,0.769365,0.205764,0.213605,0.789561,0.205764,0.213605,0.789561,1,True,5
5,LightGBMLarge_BAG_L1,0.79072,0.78364,0.529327,0.087111,46.846742,0.529327,0.087111,46.846742,1,True,13
6,LightGBMXT_BAG_L1,0.788352,0.768172,7.533465,0.170271,14.967913,7.533465,0.170271,14.967913,1,True,3
7,RandomForestEntr_BAG_L1,0.785926,0.771927,0.170168,0.15701,0.7999,0.170168,0.15701,0.7999,1,True,6
8,NeuralNetFastAI_BAG_L1,0.773911,0.748646,8.763353,1.058547,12.063956,8.763353,1.058547,12.063956,1,True,10
9,ExtraTreesGini_BAG_L1,0.770597,0.752461,0.207429,0.195045,0.689928,0.207429,0.195045,0.689928,1,True,8


In [219]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

def get_results_model(model, X_test=X_test, y_test=y_test, name=""):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test).iloc[:,1]
    print(f"{name} AUC: %0.5f" % roc_auc_score(y_test, y_pred_proba))
    print(f"{name} F1: %0.5f" % f1_score(y_test, y_pred))
    print(f"{name} Accuracy: %0.5f" % accuracy_score(y_test, y_pred))
    return y_pred, y_pred_proba

def get_results_preds(y_labels, y_preds_proba, y_preds,  name=""):

    print(f"{name} AUC: %0.5f" % roc_auc_score(y_labels, y_preds_proba))
    print(f"{name} F1: %0.5f" % f1_score(y_labels, y_preds))
    print(f"{name} Accuracy: %0.5f" % accuracy_score(y_labels, y_preds))

In [221]:
y_pred, y_pred_proba = get_results_model(predictor, test_data, test_data['is_top10'], name="Ensemble Model")

Ensemble Model AUC: 0.80007
Ensemble Model F1: 0.75862
Ensemble Model Accuracy: 0.73370


In [72]:
save_path = "../models/multimodal"
predictor2 = TabularPredictor(label='is_top10', path=save_path, eval_metric='roc_auc', )
predictor2.fit(train_data.drop(columns=['numVotes', 'averageRating']), presets='best_quality')

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=7, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "../models/multimodal/"
AutoGluon Version:  0.6.0
Python Version:     3.9.15
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Aug 7 18:08:02 UTC 2019
Train Data Rows:    733
Train Data Columns: 37
Label Column: is_top10
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    6311

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2aac3c4ca580>

In [74]:
predictor2.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L1,0.840791,0.810295,0.138211,0.582095,114.730787,0.138211,0.582095,114.730787,1,True,7
1,WeightedEnsemble_L2,0.830137,0.82017,16.917832,2.665285,508.584302,0.007935,0.000591,0.630841,2,True,14
2,XGBoost_BAG_L1,0.809777,0.799611,0.15447,0.179758,23.26207,0.15447,0.179758,23.26207,1,True,11
3,LightGBM_BAG_L1,0.807528,0.795923,0.087427,0.147339,42.29684,0.087427,0.147339,42.29684,1,True,4
4,RandomForestGini_BAG_L1,0.801136,0.769365,0.169875,0.166012,0.989191,0.169875,0.166012,0.989191,1,True,5
5,LightGBMLarge_BAG_L1,0.79072,0.78364,0.183929,0.14095,166.475157,0.183929,0.14095,166.475157,1,True,13
6,LightGBMXT_BAG_L1,0.788352,0.768172,0.12191,0.158409,43.211514,0.12191,0.158409,43.211514,1,True,3
7,RandomForestEntr_BAG_L1,0.785926,0.771927,0.250696,0.165139,0.911539,0.250696,0.165139,0.911539,1,True,6
8,NeuralNetFastAI_BAG_L1,0.773911,0.748646,15.654653,0.747402,18.472049,15.654653,0.747402,18.472049,1,True,10
9,ExtraTreesGini_BAG_L1,0.770597,0.752461,0.344684,0.171485,0.807926,0.344684,0.171485,0.807926,1,True,8


In [84]:
#Try lemmatizing and BOW
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.stem import SnowballStemmer
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
        
class StemTokenizer(object):
    def __init__(self):
        self.stemmer = SnowballStemmer('english')
    def __call__(self, articles):
        return [self.stemmer.stem(t) for t in word_tokenize(articles)]


tf_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(),
                       strip_accents = 'unicode',
                       stop_words = 'english',
                       lowercase = True,
                       min_df=0.04
                    )
tf_vectorizer.fit(train_data['summary'])



In [87]:
save_path = "../models/tfidf"
predictor2 = TabularPredictor(label='is_top10', path=save_path, eval_metric='roc_auc', )
predictor2.fit(train_data.drop(columns=['numVotes', 'averageRating']))

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=7, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "../models/tfidf/"
AutoGluon Version:  0.6.0
Python Version:     3.9.15
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Aug 7 18:08:02 UTC 2019
Train Data Rows:    733
Train Data Columns: 58
Label Column: is_top10
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    62833.39 

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2aab7bb83a90>

In [88]:
predictor2.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L1,0.845289,0.807208,0.105034,0.542356,119.745394,0.105034,0.542356,119.745394,1,True,7
1,WeightedEnsemble_L2,0.834635,0.821266,2.196087,2.705328,509.994133,0.008023,0.000723,0.639256,2,True,14
2,LightGBM_BAG_L1,0.807528,0.795923,0.214329,0.151193,47.043171,0.214329,0.151193,47.043171,1,True,4
3,XGBoost_BAG_L1,0.805161,0.795633,0.154573,0.13575,23.602797,0.154573,0.13575,23.602797,1,True,11
4,RandomForestGini_BAG_L1,0.794626,0.758487,0.164542,0.166851,0.960546,0.164542,0.166851,0.960546,1,True,5
5,RandomForestEntr_BAG_L1,0.793265,0.765151,0.360969,0.16504,1.003458,0.360969,0.16504,1.003458,1,True,6
6,LightGBMLarge_BAG_L1,0.791075,0.789107,0.149165,0.142471,173.960655,0.149165,0.142471,173.960655,1,True,13
7,LightGBMXT_BAG_L1,0.790009,0.771714,0.138928,0.116605,42.886098,0.138928,0.116605,42.886098,1,True,3
8,ExtraTreesEntr_BAG_L1,0.776278,0.76172,0.62248,0.168473,0.851739,0.62248,0.168473,0.851739,1,True,9
9,NeuralNetFastAI_BAG_L1,0.770478,0.75861,0.811756,0.838664,19.28525,0.811756,0.838664,19.28525,1,True,10
