# Importing Libraries

In [361]:
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Import Data

In [362]:
with open('new_eda_data.pkl', 'rb') as file:
    df = pickle.load(file)

In [363]:
# adjust display options to show all columns
pd.set_option('display.max_columns', None)

In [364]:
df.reset_index(drop=True, inplace=True)

In [365]:
df.shape

(8504, 24)

# Import Models

In [366]:
# import nlp with logreg model for title column
with open('nlp_title_model.pkl', 'rb') as file:
    nlp_title_model = pickle.load(file)

In [367]:
# import nlp with logreg model for description column
with open('nlp_description_model.pkl', 'rb') as file:
    nlp_description_model = pickle.load(file)

In [368]:
# import logreg model for numerical columns
with open('numerical_data_model.pkl', 'rb') as file:
    numerical_data_model = pickle.load(file)

In [369]:
# import rf classifier model for numerical columns
with open('numerical_rf.pkl', 'rb') as file:
    num_rf = pickle.load(file)

In [370]:
# import nlp with ensemble(nb multinomial, nb bernoulli, logreg) model for title column
with open('ensemble_title_model.pkl', 'rb') as file:
    ensemble_title = pickle.load(file)

In [371]:
# training data for title column ensemble model
with open('ensemble_title_train.pkl', 'rb') as file:
    ensemble_title_train = pickle.load(file)

In [372]:
# test data for title column ensemble model
with open('ensemble_title_test.pkl', 'rb') as file:
    ensemble_title_test = pickle.load(file)

# Train, Test, Split & Feature Engineering

### y_train and y_test for Ensemble

In [373]:
# seperate feature and target columns
X = df.drop(columns=['target'])

y = df['target']

_, _, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

### Title

In [374]:
# seperate feature and target columns
X_title = df['title']

y_title = df['target']

X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(X_title, y_title, test_size=0.2, random_state=21)

In [375]:
X_train_title.shape

(6803,)

In [376]:
# check index between X_test and y_test match
(X_train_title.index == y_train_title.index).sum()

6803

In [377]:
# create vectorizer object
vectorizer_title = TfidfVectorizer(stop_words='english', lowercase=True, max_features=449)

# fit vectorizer with training data
vectorizer_title.fit(X_train_title)

# transform train data
X_train_vec_title = vectorizer_title.transform(X_train_title)

# transform test data
X_test_vec_title = vectorizer_title.transform(X_test_title)

### Description

In [378]:
# seperate feature and target columns
X_description = df['description']

y_description = df['target']

X_train_description, X_test_description, y_train_description, y_test_description = train_test_split(X_description, y_description, test_size=0.2, random_state=21)

In [379]:
X_train_description.shape

(6803,)

In [380]:
# check index between X_train and y_train match
(X_train_description.index == y_train_description.index).sum()

6803

In [381]:
# create vectorizer object
vectorizer_description = TfidfVectorizer(stop_words='english', lowercase=True, max_features=931)

# fit vectorizer with training data
vectorizer_description.fit(X_train_description)

# transform train data
X_train_vec_des = vectorizer_description.transform(X_train_description)

# transform test data
X_test_vec_des = vectorizer_description.transform(X_test_description)

### Numerical Data

In [382]:
# select columns for numerical model
df_numerical = df[['licensed_content', 'subscribers', 'total_views', 'total_videos', 'category', 'no_of_tags', 'title_length',
       'description_length', 'target', 'age', 'duration_minutes']]

In [383]:
# seperate features and target columns
X_numerical = df.drop(columns=['target'])

y_numerical = df['target']

X_train_numerical, X_test_numerical, y_train_numerical, y_test_numerical = train_test_split(X_numerical, y_numerical, test_size=0.2, random_state=21)

In [384]:
X_train_numerical.shape

(6803, 23)

In [385]:
(X_train_numerical.index == y_train_numerical.index).sum()

6803

In [386]:
# create function to one hot encode the train and test features respectively
def ohe(train, test):

    # map licensed content to binary for train
    train['licensed_map'] = train.licensed_content.apply(lambda x: 1 if x == True else 0)

    # map licensed content to binary for test
    test['licensed_map'] = test.licensed_content.apply(lambda x: 1 if x == True else 0)

    # store unique categories
    unique_categories = set()

    # iterate through unique categories
    for category in train['category']:
        unique_categories.add(category)

    # sort unique categories alphabetically
    unique_categories = sorted(unique_categories)

    # create a binary mapping of all unique categories in train
    for column in unique_categories:
        train[column] = train['category'].apply(lambda x: 1 if column == x else 0)

    # create a binary mapping of all unique categories in test
    for column in unique_categories:
        test[column] = test['category'].apply(lambda x: 1 if column == x else 0)
    
    # drop columns
    train.drop(columns=['category', 'licensed_content'], inplace=True)
    test.drop(columns=['category', 'licensed_content'], inplace=True)

    return train, test

In [387]:
def rob_scaler(train, test, target_train, target_test, columns_to_scale, other_columns):

    # train
    train_scale = train[columns_to_scale] ## columns to scale
    train_not_scale = train[other_columns] ## remaining columns

    rob = RobustScaler() ## scaler object
    rob.fit(train_scale) ## fit the scaler with train data

    train_rob = pd.DataFrame(rob.transform(train_scale), columns=train_scale.columns) ## create df with transformed training data

    train_not_scale.reset_index(drop=True, inplace=True) ## Reset X_train index
    target_train.reset_index(drop=True, inplace=True) ## Reset y_train index

    train_fe_rob = pd.concat([train_not_scale, train_rob], axis=1) ## concatenate scaled data with remaining columns

    # test
    test_scale = test[columns_to_scale] ## columns to scale
    test_not_scale = test[other_columns] ## remaining columns

    test_rob = pd.DataFrame(rob.transform(test_scale), columns=test_scale.columns) ## create df with transformed test data

    test_not_scale.reset_index(drop=True, inplace=True) ## Reset X_test index
    target_test.reset_index(drop=True, inplace=True) ## Reset y_test index

    test_fe_rob = pd.concat([test_not_scale, test_rob], axis=1) ## concatenate scaled data with remaining columns

    return train_fe_rob, test_fe_rob, target_train, target_test

In [388]:
# list of numerical columns to scale
scale_columns = ['subscribers', 'total_views', 'total_videos', 'no_of_tags', 'title_length', 'description_length', 'age', 'duration_minutes']

# list of remaining columns
non_scale_columns = ['licensed_map', 'Education', 'Entertainment', 'Film & Animation', 'Gaming','Howto & Style',
                     'Music', 'People & Blogs', 'Science & Technology', 'Sports', 'Travel & Events']

In [389]:
# one hot encode / map selected categorical features
X_train_fe, X_test_fe = ohe(X_train_numerical, X_test_numerical)

In [390]:
# scale selected continuous features
X_train_fe_rob, X_test_fe_rob, y_train_fe, y_test_fe = rob_scaler(X_train_fe, X_test_fe, y_train_numerical, y_test_numerical, scale_columns, non_scale_columns)

# Ensemble Modelling

In [391]:
# function to calculate the accuracy, precision and recall
def apr(y_pred, y_real):       
    
    """ Calculates accuracy, precision, recall
        Requires predicted value first, and then the real value
    """
    accuracy = accuracy_score(y_real, y_pred)
    precision = precision_score(y_real, y_pred)
    recall = recall_score(y_real, y_pred)
    f1 = f1_score(y_real, y_pred)

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")

In [392]:
# function to calculate cross-validation scoring for all discrete metrics
def cv_scorer(model, X_train, X_test, y_train, y_test):

    # define the scoring functions
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score),
        'f1': make_scorer(f1_score)
    }

    # concat train and test splits back for cross-validation
    X_all = pd.concat([X_train, X_test])

    y_all = pd.concat([y_train, y_test])

    # perform cross-validation
    cv_results = cross_validate(model, X_all, y_all, cv=5, scoring=scoring)

    # calculate mean scores across cross-validation folds
    accuracy_mean = cv_results['test_accuracy'].mean()
    precision_mean = cv_results['test_precision'].mean()
    recall_mean = cv_results['test_recall'].mean()
    f1_mean = cv_results['test_f1'].mean()

    # print mean scores
    print(f'Mean Accuracy: {accuracy_mean}')
    print(f'Mean Precision: {precision_mean}')
    print(f'Mean Recall: {recall_mean}')
    print(f'Mean F1: {f1_mean}')

In [393]:
# create a df of all predicted probabilities of all models, on train
title_train_pred = pd.DataFrame(nlp_title_model.predict_proba(X_train_vec_title))
description_train_pred = pd.DataFrame(nlp_description_model.predict_proba(X_train_vec_des))
numerical_train_pred = pd.DataFrame(numerical_data_model.predict_proba(X_train_fe_rob))

# concat all predict_proba dataframes
X_train_ensemble_1 = pd.concat([title_train_pred, description_train_pred, numerical_train_pred], axis=1)

# create ensemble model object
ensemble_clf_1 = VotingClassifier(estimators=[
    ('nlp_model1', nlp_title_model),
    ('nlp_model2', nlp_description_model),
    ('numeric_model', numerical_data_model),
], voting='soft')  # use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# fit ensemble model on train data
ensemble_clf_1.fit(X_train_ensemble_1, y_train)


In [394]:
# create a df of all predicted probabilities of all models, on test
title_test_pred = pd.DataFrame(nlp_title_model.predict_proba(X_test_vec_title))
description_test_pred = pd.DataFrame(nlp_description_model.predict_proba(X_test_vec_des))
numerical_test_pred = pd.DataFrame(numerical_data_model.predict_proba(X_test_fe_rob))

# concat all predict_proba dataframes
X_test_ensemble_1 = pd.concat([title_test_pred, description_test_pred, numerical_test_pred], axis=1)

In [395]:
# predict on train data
y_train_pred_ensemble_1 = ensemble_clf_1.predict(X_train_ensemble_1)

# check train data metric scores
apr(y_train_pred_ensemble_1, y_train)

Accuracy:0.8247831838894605
Precision:0.8182606183184051
Recall:0.8341678939617084
F1:0.8261376896149359


In [396]:
# predict on test data
y_test_pred_ensemble_1 = ensemble_clf_1.predict(X_test_ensemble_1)

# check test data metric scores
apr(y_test_pred_ensemble_1, y_test)

Accuracy:0.8089359200470312
Precision:0.7972972972972973
Recall:0.8300117233294255
F1:0.813325674899483


In [397]:
# calculate overfitting
of_1 = accuracy_score(y_train, y_train_pred_ensemble_1) - accuracy_score(y_test, y_test_pred_ensemble_1)
of_1

0.015847263842429293

In [398]:
# create ensemble model object
ensemble_clf_1_cv = VotingClassifier(estimators=[
    ('nlp_model1', nlp_title_model),
    ('nlp_model2', nlp_description_model),
    ('numeric_model', numerical_data_model),
], voting='soft')  # use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# cross-validation scores
cv_scorer(ensemble_clf_1_cv, X_train_ensemble_1, X_test_ensemble_1, y_train, y_test)

Mean Accuracy: 0.8210239651416122
Mean Precision: 0.8136442044544803
Mean Recall: 0.8326266195524145
Mean F1: 0.8229686323379146


# Ensemble with RF Num

In [399]:
# create a df of all predicted probabilities of all models, on train
title_train_pred = pd.DataFrame(nlp_title_model.predict_proba(X_train_vec_title))
description_train_pred = pd.DataFrame(nlp_description_model.predict_proba(X_train_vec_des))
numerical_train_pred = pd.DataFrame(num_rf.predict_proba(X_train_fe_rob))

# concat all predict_proba dataframes
X_train_ensemble_2 = pd.concat([title_train_pred, description_train_pred, numerical_train_pred], axis=1)

# create ensemble model object
ensemble_clf_2 = VotingClassifier(estimators=[
    ('nlp_model1', nlp_title_model),
    ('nlp_model2', nlp_description_model),
    ('numeric_model', num_rf),
], voting='soft')  # use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# fit ensemble model on train data
ensemble_clf_2.fit(X_train_ensemble_2, y_train)

In [400]:
# create a df of all predicted probabilities of all models, on test
title_test_pred = pd.DataFrame(nlp_title_model.predict_proba(X_test_vec_title))
description_test_pred = pd.DataFrame(nlp_description_model.predict_proba(X_test_vec_des))
numerical_test_pred = pd.DataFrame(num_rf.predict_proba(X_test_fe_rob))

# concat all predict_proba dataframes
X_test_ensemble_2 = pd.concat([title_test_pred, description_test_pred, numerical_test_pred], axis=1)

In [401]:
# predict on train data
y_train_pred_ensemble_2 = ensemble_clf_2.predict(X_train_ensemble_2)

# check train data metric scores
apr(y_train_pred_ensemble_2, y_train)

Accuracy:0.8475672497427605
Precision:0.8403579676674365
Recall:0.8574374079528718
F1:0.848811780142878


In [402]:
# predict on test data
y_test_pred_ensemble_2 = ensemble_clf_2.predict(X_test_ensemble_2)

# check test data metric scores
apr(y_test_pred_ensemble_2, y_test)

Accuracy:0.8324514991181657
Precision:0.8169642857142857
Recall:0.8581477139507621
F1:0.8370497427101201


In [403]:
of_2 = accuracy_score(y_train, y_train_pred_ensemble_2) - accuracy_score(y_test, y_test_pred_ensemble_2)
of_2

0.015115750624594804

In [404]:
# create ensemble model object
ensemble_clf_2_cv = VotingClassifier(estimators=[
    ('nlp_model1', nlp_title_model),
    ('nlp_model2', nlp_description_model),
    ('numeric_model', num_rf),
], voting='soft')  # use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# cross-validation scores
cv_scorer(ensemble_clf_2_cv, X_train_ensemble_2, X_test_ensemble_2, y_train, y_test)

Mean Accuracy: 0.8405446623093683
Mean Precision: 0.8326147710845646
Mean Recall: 0.8524013025704982
Mean F1: 0.8423203810104841


# Ensemble with RF Num and Ensemble Title

In [405]:
# create a df of all predicted probabilities of all models, on train
title_train_pred = pd.DataFrame(ensemble_title.predict_proba(ensemble_title_train))
description_train_pred = pd.DataFrame(nlp_description_model.predict_proba(X_train_vec_des))
numerical_train_pred = pd.DataFrame(num_rf.predict_proba(X_train_fe_rob))

# concat all predict_proba dataframes
X_train_ensemble_3 = pd.concat([title_train_pred, description_train_pred, numerical_train_pred], axis=1)

# create ensemble model object
ensemble_clf_3 = VotingClassifier(estimators=[
    ('nlp_model1', ensemble_title),
    ('nlp_model2', nlp_description_model),
    ('numeric_model', num_rf),
], voting='soft')  # use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# fit ensemble model on train data
ensemble_clf_3.fit(X_train_ensemble_3, y_train)

In [406]:
# create a df of all predicted probabilities of all models, on test
title_test_pred = pd.DataFrame(ensemble_title.predict_proba(ensemble_title_test))
description_test_pred = pd.DataFrame(nlp_description_model.predict_proba(X_test_vec_des))
numerical_test_pred = pd.DataFrame(num_rf.predict_proba(X_test_fe_rob))

# concat all predict_proba dataframes
X_test_ensemble_3 = pd.concat([title_test_pred, description_test_pred, numerical_test_pred], axis=1)

In [407]:
# predict on train data
y_train_pred_ensemble_3 = ensemble_clf_3.predict(X_train_ensemble_3)

# check train data metric scores
apr(y_train_pred_ensemble_3, y_train)

Accuracy:0.8468322798765251
Precision:0.8383664078228358
Recall:0.8586156111929307
F1:0.8483701979045402


In [408]:
# predict on test data
y_test_pred_ensemble_3 = ensemble_clf_3.predict(X_test_ensemble_3)

# check test data metric scores
apr(y_test_pred_ensemble_3, y_test)

Accuracy:0.831275720164609
Precision:0.8158482142857143
Recall:0.8569753810082064
F1:0.8359062321326473


In [409]:
of_3 = accuracy_score(y_train, y_train_pred_ensemble_3) - accuracy_score(y_test, y_test_pred_ensemble_3)
of_3

0.015556559711916051

In [410]:
# create ensemble model object
ensemble_clf_3_cv = VotingClassifier(estimators=[
    ('nlp_model1', ensemble_title),
    ('nlp_model2', nlp_description_model),
    ('numeric_model', num_rf),
], voting='soft')  # use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# cross-validation scores
cv_scorer(ensemble_clf_3_cv, X_train_ensemble_3, X_test_ensemble_3, y_train, y_test)

Mean Accuracy: 0.8378401632257841
Mean Precision: 0.8282475227664874
Mean Recall: 0.8523999168572022
Mean F1: 0.8400695124364639


# Pickle Best Model

In [412]:
cv_scorer(ensemble_clf_1_cv, X_train_ensemble_1, X_test_ensemble_1, y_train, y_test)

Mean Accuracy: 0.8210239651416122
Mean Precision: 0.8136442044544803
Mean Recall: 0.8326266195524145
Mean F1: 0.8229686323379146


In [413]:
cv_scorer(ensemble_clf_2_cv, X_train_ensemble_2, X_test_ensemble_2, y_train, y_test)

Mean Accuracy: 0.8405446623093683
Mean Precision: 0.8326147710845646
Mean Recall: 0.8524013025704982
Mean F1: 0.8423203810104841


In [358]:
cv_scorer(ensemble_clf_3_cv, X_train_ensemble_3, X_test_ensemble_3, y_train, y_test)

Mean Accuracy: 0.8378401632257841
Mean Precision: 0.8282475227664874
Mean Recall: 0.8523999168572022
Mean F1: 0.8400695124364639


In [414]:
# compare overfitting
print(of_1)
print(of_2)
print(of_3)

0.015847263842429293
0.015115750624594804
0.015556559711916051


In [126]:
# save best ensemble model as pkl file for deployment
# with open('ensemble_clf_2.pkl', 'wb') as file:
#     pickle.dump(ensemble_clf_3, file)