In [265]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Accumulators for the cross-country comparison: a table of test AUCs and a
# feature-by-country table of importances (indexed by transformed feature name).
test_auc = pd.DataFrame(columns=['Model', 'Test AUC'])
# NOTE(review): `bst_feature_names` is only assigned further down, from
# get_feature_names(...), so on a fresh kernel this line raises NameError.
# Re-running this cell also resets both tables — confirm the intended run order.
c_importance = pd.DataFrame(index=bst_feature_names)


## <u>Import Datasets and Train Test Split</u>

In [429]:
# Raw inputs: the country file (supplies cumulative_weeks_in_top_10), the tagged
# title metadata, the plot summaries and the director / writer / actor rankings.
df = pd.read_csv('/Users/zachwayne/Downloads/South_Korea.csv')
withTags = pd.read_csv('/Users/zachwayne/Dropbox (MIT)/Netflix - Seize the Data/feed_into_pipeline_data_withTags.csv')
summaries = pd.read_csv(
    '/Users/zachwayne/Dropbox (MIT)/Netflix - Seize the Data/GlobalOriginalsWithSummary.csv'
)[['combined_title', 'summary']]
people_rankings = pd.read_csv(
    '/Users/zachwayne/Dropbox (MIT)/Netflix - Seize the Data/showsWithPrinicpalRankings.csv'
)[['combined_title', 'director_rank', 'writer_rank', 'actor_rank']]

# Left-join the top-10 weeks onto every tagged title, then inner-join the
# summaries and rankings, and finally remove exact duplicate rows.
df = (
    withTags
    .merge(df[['combined_title', 'cumulative_weeks_in_top_10']], on='combined_title', how='left')
    .merge(summaries, on='combined_title')
    .merge(people_rankings, on='combined_title')
    .drop_duplicates()
)

# Binary target: 1 when the title spent any week in the top 10, else 0
# (titles missing from the country file compare False and therefore get 0).
df['is_top10'] = df['cumulative_weeks_in_top_10'].gt(0).astype('int64')
df = df.drop(columns='cumulative_weeks_in_top_10')

# Only TV shows are modelled from here on.
tv_shows = df[df['is_tv_show'].eq(True)]

In [430]:
# Keep only rows with season < 24 (season == 24 and missing seasons are dropped
# too, since the comparison is strict), ordered by title.
tv_shows = tv_shows.loc[tv_shows['season'].lt(24)].sort_values(by='combined_title')

# Identifier, raw-date and constant columns are not used as model features.
tv_shows = tv_shows.drop(
    columns=['combined_title', 'release_date', 'lower_title', 'is_tv_show', 'tconst']
)
tv_shows.shape

(1302, 21)

In [431]:
# Columns left after the drop above: the is_top10 target plus the model features.
tv_shows.columns

Index(['release_year', 'is_top10', 'season', 'averageRating', 'numVotes',
       'titleType', 'runtimeMinutes', 'genres', 'release_date_quarter',
       'release_date_month', 'release_date_week', 'nudity', 'violence',
       'profanity', 'alcohol', 'frightening', 'language', 'summary',
       'director_rank', 'writer_rank', 'actor_rank'],
      dtype='object')

In [432]:
# Stratified 80/20 hold-out split on the is_top10 target (fixed seed).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    tv_shows.drop(columns='is_top10'),
    tv_shows['is_top10'],
    test_size=0.2,
    stratify=tv_shows['is_top10'],
    random_state=42,
)

# A further train/validation split was sketched here but is disabled.

In [433]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.stem import SnowballStemmer
class LemmaTokenizer:
    """Callable tokenizer: word_tokenize the text, then lemmatize every token."""

    def __init__(self):
        # One lemmatizer instance is kept and reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens found in the text ``articles``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(articles)))
        
class StemTokenizer:
    """Callable tokenizer: word_tokenize the text, then Snowball-stem every token."""

    def __init__(self):
        # English Snowball stemmer, built once and reused for every call.
        self.stemmer = SnowballStemmer('english')

    def __call__(self, articles):
        """Return the list of stemmed tokens found in the text ``articles``."""
        stems = []
        for token in word_tokenize(articles):
            stems.append(self.stemmer.stem(token))
        return stems


# Stand-alone bag-of-words vectorizer built on the lemma tokenizer.
# NOTE(review): CountVectorizer is not imported in the visible cells, and
# tf_vectorizer is not referenced again in the visible part of the notebook.
tf_vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    tokenizer=LemmaTokenizer(),
)

In [434]:
from sklearn.feature_extraction import text

# Stemmed form of every built-in English stop word. Note these are Snowball
# *stems*, not lemmas — presumably so that stop-word filtering can still match
# tokens after they have been stemmed (TODO confirm intended use).
_stop_word_stemmer = SnowballStemmer('english')  # built once, not once per word
custom_stop_words = [_stop_word_stemmer.stem(word) for word in text.ENGLISH_STOP_WORDS]

In [435]:
# Built-in English stop words extended with the entries of custom_stop_words.
# NOTE(review): new_stop_words is not passed to any vectorizer in the visible
# cells (they all use stop_words='english') — confirm whether it should be.
new_stop_words = text.ENGLISH_STOP_WORDS.union(custom_stop_words)

## <u>Lemma Tokenizer</u>

### <u>XGBoost : Summary (BoW)</u>

In [436]:
# Ranking columns: fill missing ranks with KNN imputation, then apply log(1 + x).
# Disabled alternatives kept for reference:
#   SimpleImputer(strategy='mean'), IterativeImputer()
rankings_pipeline = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('logger', FunctionTransformer(np.log1p)),
])

# Column-wise preprocessing for the XGBoost model. The order of the entries is
# kept unchanged because it determines the order of the output feature columns.
#
# NOTE(review): the summary vectorizer combines stop_words='english' with
# LemmaTokenizer; the fit below emits scikit-learn's "stop_words may be
# inconsistent with your preprocessing" warning because of that pairing.
#
# Disabled alternatives kept for reference:
#   ("tfidf", TfidfVectorizer(stop_words='english', min_df=0.02), 'summary')
#   ("sentence_transformer", embedder, 'summary')
#   ("imputer", SimpleImputer(strategy='mean'), ['averageRating', 'numVotes', 'runtimeMinutes'])
xgb_transformer = ColumnTransformer(
    transformers=[
        # categorical title type and discrete release year
        ("onehot_titleType", OneHotEncoder(handle_unknown='ignore'), ['titleType']),
        ("onehot_release_year", OneHotEncoder(handle_unknown='ignore'), ['release_year']),
        # bag-of-words token counts of the lemmatized summary text
        ("bagofwords", CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', lowercase=True), 'summary'),
        # token counts of the genres string, ignoring tokens in fewer than 1% of rows
        ("genres_count", CountVectorizer(min_df=0.01), 'genres'),
        # imputed + log-transformed people rankings
        ("rankings_pipeline", rankings_pipeline, ['director_rank', 'actor_rank', 'writer_rank']),
        # one encoder per content-advisory column: onehot_v, onehot_n, onehot_p, onehot_a, onehot_f
        *[
            (f"onehot_{advisory[0]}", OneHotEncoder(handle_unknown='ignore'), [advisory])
            for advisory in ('violence', 'nudity', 'profanity', 'alcohol', 'frightening')
        ],
        # languages seen in fewer than 1% of rows are pooled into one "infrequent" category
        ("onehot_l", OneHotEncoder(min_frequency=0.01, handle_unknown='infrequent_if_exist'), ['language']),
    ],
    remainder='passthrough',
)

In [437]:
# Preprocessing followed by an XGBoost classifier, wrapped as one estimator so
# the grid search can tune it through the 'xgb__' parameter prefix.
# (A Normalizer step between the two is disabled.)
xgb_pipe = Pipeline([
    ('xgb_transformer', xgb_transformer),
    ('xgb', XGBClassifier()),
])

# 5-fold grid search over the XGBoost hyper-parameters. Three metrics are
# tracked; the best configuration is chosen, and the pipeline refit, by AUC.
xgb_gs_bow_lemma = GridSearchCV(
    estimator=xgb_pipe,
    param_grid=dict(
        xgb__max_depth=[3, 5, 7],
        xgb__learning_rate=[0.01, 0.05, 0.1],
        xgb__n_estimators=[100, 200],
        xgb__subsample=[0.8, 0.85, 0.9],
    ),
    scoring=dict(AUC="roc_auc", F1="f1", Accuracy="accuracy"),
    refit="AUC",
    cv=5,
    n_jobs=-1,
    verbose=0,
)

In [438]:
# Run the 5-fold grid search on the training split; because refit="AUC", the
# best-AUC configuration is then refit and exposed as best_estimator_.
xgb_gs_bow_lemma.fit(X_train, y_train)

Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.


In [439]:
# Hyper-parameters of the configuration selected by the grid search (refit="AUC").
xgb_gs_bow_lemma.best_params_

{'xgb__learning_rate': 0.01,
 'xgb__max_depth': 5,
 'xgb__n_estimators': 100,
 'xgb__subsample': 0.8}

In [440]:
# Keep the refit best pipeline under its own name for the cells below.
best_xgb_gs_bow_lemma = xgb_gs_bow_lemma.best_estimator_

from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

# Score the held-out test set once and reuse the predictions for every metric
# (previously predict() was run twice on the same data).
y_test_proba = best_xgb_gs_bow_lemma.predict_proba(X_test)[:, 1]
y_test_pred = best_xgb_gs_bow_lemma.predict(X_test)

# best_score_ is the cross-validated score of the refit metric (AUC) for the
# selected configuration — it is not a training-set or test-set score.
print("XGBoost Bag of Words, Best score: %0.5f" % xgb_gs_bow_lemma.best_score_)

# Held-out test-set AUC, F1 and accuracy.
print("XGBoost Bag of Words, Test set AUC: %0.5f" % roc_auc_score(y_test, y_test_proba))
print("XGBoost Bag of Words, Test set F1: %0.5f" % f1_score(y_test, y_test_pred))
print("XGBoost Bag of Words, Test set Accuracy: %0.5f" % accuracy_score(y_test, y_test_pred))

XGBoost Bag of Words, Best score: 0.77522
XGBoost Bag of Words, Test set AUC: 0.80627
XGBoost Bag of Words, Test set F1: 0.65000
XGBoost Bag of Words, Test set Accuracy: 0.94636


In [441]:
# Record this country's test AUC in the cross-country summary table.
# DataFrame.append is deprecated (and removed in pandas 2.0), so the row is
# added with pandas.concat instead.
_south_korea_row = pd.DataFrame([{
    'Model': 'South Korea TV',
    'Test AUC': roc_auc_score(y_test, best_xgb_gs_bow_lemma.predict_proba(X_test)[:, 1]),
}])
# Drop any earlier 'South Korea TV' row first so re-running this cell replaces
# the entry instead of duplicating it.
_other_rows = test_auc[test_auc['Model'] != 'South Korea TV']
test_auc = (
    pd.concat([_other_rows, _south_korea_row], ignore_index=True)
    if len(_other_rows)
    else _south_korea_row  # nothing to concatenate with yet
)

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


In [442]:
# Test AUC per country collected so far.
test_auc

Unnamed: 0,Model,Test AUC
0,Brazil TV,0.563959
1,France TV,0.889143
2,South Africa TV,0.818607
3,US TV,0.78383
4,Australia TV,0.809665
5,South Korea TV,0.806271


In [443]:
def get_feature_names(estimator, numeric_features=['release_year', 'titleType', 'genres', 'summary', 'nudity', 'violence', 'profanity', 'alcohol', 'frightening', 'language', 'director_rank', 'actor_rank', 'writer_rank']):
    """Return the output feature names of a fitted column transformer.

    Names are collected in the order of ``estimator.transformers_`` and the
    untouched (passthrough) input columns are appended last.

    Parameters
    ----------
    estimator : object
        Fitted transformer exposing ``transformers_`` (``(name, transformer,
        columns)`` triples) and ``feature_names_in_``.
    numeric_features : list of str
        Despite its name, this is the list of input columns that ARE consumed
        by a transformer; every other input column is treated as passthrough.

    Returns
    -------
    numpy.ndarray
        Object array of feature names.

    Notes
    -----
    * The ``'rankings_pipeline'`` entry is reported under its three hard-coded
      input column names.
    * ``CountVectorizer`` (and subclass) tokens are prefixed with
      ``"<column>_"``.
    * String entries such as ``'passthrough'`` / ``'drop'`` are skipped.
    * A transformer that cannot report names (``AttributeError``) is skipped
      with a warning, because the result then no longer lines up with the
      transformed matrix. Any other exception propagates instead of being
      silently swallowed.
    """
    import warnings

    name_groups = []
    for name, transformer, columns in estimator.transformers_:
        # The remainder entry holds the string 'passthrough' (or 'drop'), not an
        # estimator; passthrough columns are handled after the loop.
        if isinstance(transformer, str):
            continue
        try:
            if name == 'rankings_pipeline':
                names = ['director_rank', 'actor_rank', 'writer_rank']
            elif isinstance(transformer, CountVectorizer):
                names = [columns + "_" + token for token in transformer.get_feature_names_out()]
            else:
                names = transformer.get_feature_names_out()
        except AttributeError:
            warnings.warn(
                "get_feature_names: transformer %r does not expose feature names and was skipped; "
                "the returned names may not align with the transformed columns." % (name,)
            )
            continue
        name_groups.append(np.asarray(names, dtype=object))

    # Input columns not consumed by any transformer pass through unchanged.
    unchanged_features = [feature for feature in estimator.feature_names_in_ if feature not in numeric_features]
    name_groups.append(np.asarray(unchanged_features, dtype=object))
    # There is always at least one group here, so concatenate cannot fail on
    # an empty list.
    return np.concatenate(name_groups)

In [444]:
# Feature names for the fitted preprocessing step of the best pipeline.
bst_feature_names = get_feature_names(best_xgb_gs_bow_lemma['xgb_transformer'])

In [445]:
# Pair every transformed feature name with the importance the fitted XGBoost
# step assigned to it.
feature_importance = pd.DataFrame({
    'Feature': bst_feature_names,
    'Importance': best_xgb_gs_bow_lemma['xgb'].feature_importances_,
})
# Keep only the features whose name does not contain "summary"; this removes
# the bag-of-words token columns.
feature_importance_df = feature_importance.loc[
    lambda frame: ~frame['Feature'].str.contains('summary')
]


In [446]:
# Add this country's column to the cross-country importance table: every
# feature row gets its importance, and features absent from this model stay 0.
# One index lookup replaces the row-by-row loop, and the column is float from
# the start (writing floats into an integer 0 column is deprecated in recent
# pandas).
c_importance['South Korea TV'] = (
    feature_importance_df
    .drop_duplicates(subset='Feature', keep='last')  # same "last write wins" as the loop
    .set_index('Feature')['Importance']
    .astype('float64')
    .reindex(c_importance.index)
    .fillna(0.0)
    .to_numpy()  # assign by position; c_importance's index is left untouched
)
    

In [447]:
# Feature-importance table: one row per feature, one column per country.
c_importance

Unnamed: 0,Brazil TV,France TV,South Africa TV,US TV,Australia TV,South Korea TV
titleType_tvMiniSeries,0.004443,0.006159,0.003342,0.002887,0.003083,0.000000
titleType_tvSeries,0.005783,0.004222,0.000000,0.000000,0.000000,0.000000
titleType_tvSpecial,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
release_year_2020,0.004036,0.007935,0.008606,0.006683,0.007164,0.025923
release_year_2021,0.003679,0.004421,0.008283,0.002516,0.004734,0.010477
...,...,...,...,...,...,...
numVotes,0.004830,0.011005,0.009347,0.005660,0.008184,0.021203
runtimeMinutes,0.004352,0.005627,0.005612,0.003872,0.005244,0.005919
release_date_quarter,0.003310,0.004625,0.004308,0.001878,0.003048,0.008013
release_date_month,0.004458,0.004117,0.004889,0.003204,0.003836,0.004753


In [448]:
# Write the importance table to CSV; the relative path resolves against the
# current working directory.
c_importance.to_csv('c_importance.csv')

In [159]:
# graph of feature importance
# NOTE(review): xgb_feature_importance_df (lower-case 'feature' / 'importance'
# columns) is not defined in the visible cells, and nothing is drawn on this
# figure — the barplot call is disabled, so the output is an empty figure.
plt.figure(figsize=(10, 10))
# Ten most important features whose name contains "language".
top_10_features = (
    xgb_feature_importance_df
    .loc[lambda frame: frame['feature'].str.contains('language')]
    .sort_values(by='importance', ascending=False)
    .head(10)
)
# disabled: sns.barplot(data=xgb_feature_importance_df.sort_values(by='importance', ascending=False).head(20), x='importance', y='feature')

<Figure size 720x720 with 0 Axes>

In [160]:
# Show the ten "language" features selected in the previous cell.
top_10_features

Unnamed: 0,feature,importance
5121,language_French,0.011658
5129,language_Spanish,0.011163
5128,language_Portuguese,0.006389
5123,language_Hindi,0.006228
5120,language_English,0.004691
5131,language_infrequent_sklearn,0.004266
2634,summary_language,0.0
5122,language_German,0.0
5124,language_Italian,0.0
5125,language_Japanese,0.0
