# Importing Libraries

In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Import Data

In [102]:
with open('new_eda_data.pkl', 'rb') as file:
    df = pickle.load(file)

In [103]:
# adjust display options to show all columns
pd.set_option('display.max_columns', None)

In [104]:
df.reset_index(drop=True, inplace=True)

In [105]:
df.shape

(8504, 24)

# Import Models

In [106]:
with open('nlp_title_model.pkl', 'rb') as file:
    nlp_title_model = pickle.load(file)

In [107]:
with open('nlp_description_model.pkl', 'rb') as file:
    nlp_description_model = pickle.load(file)

In [108]:
with open('numerical_data_model.pkl', 'rb') as file:
    numerical_data_model = pickle.load(file)

# Train, Test, Split & Feature Engineering

### Title

In [109]:
# seperate feature and target columns
X = df['title']

y = df['target']

X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(X, y, test_size=0.2, random_state=21)

In [110]:
X_train_title.shape

(6803,)

In [111]:
# check index between X_test and y_test match
(X_train_title.index == y_train_title.index).sum()

6803

In [112]:
# create vectorizer object
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=449)

# fit vectorizer with training data and transform
X_train_vec_title = vectorizer.fit_transform(X_train_title)

# transform test data
X_test_vec_title = vectorizer.transform(X_test_title)

### Description

In [113]:
# seperate feature and target columns
X = df['description']

y = df['target']

X_train_description, X_test_description, y_train_description, y_test_description = train_test_split(X, y, test_size=0.2, random_state=21)

In [114]:
X_train_description.shape

(6803,)

In [115]:
# check index between X_train and y_train match
(X_train_description.index == y_train_description.index).sum()

6803

In [116]:
# create vectorizer object
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=931)

# fit vectorizer with training data and transform
X_train_vec_des = vectorizer.fit_transform(X_train_description)

# transform test data
X_test_vec_des = vectorizer.transform(X_test_description)

### Numerical Data

In [117]:
# select columns for numerical model
df_numerical = df[['licensed_content', 'subscribers', 'total_views', 'total_videos', 'category', 'no_of_tags', 'title_length',
       'description_length', 'target', 'age', 'duration_minutes']]

In [118]:
# features
X = df.drop(columns=['target'])
# target
y = df['target']

X_train_numerical, X_test_numerical, y_train_numerical, y_test_numerical = train_test_split(X, y, test_size=0.2, random_state=21)

In [119]:
X_train_numerical.shape

(6803, 23)

In [120]:
(X_train_numerical.index == y_train_numerical.index).sum()

6803

In [121]:
# create function to one hot encode the train and test features respectively
def ohe(train, test):

    # map licensed content to binary for train
    train['licensed_map'] = train.licensed_content.apply(lambda x: 1 if x == True else 0)

    # map licensed content to binary for test
    test['licensed_map'] = test.licensed_content.apply(lambda x: 1 if x == True else 0)

    # store unique categories
    unique_categories = set()

    # iterate through unique categories
    for category in train['category']:
        unique_categories.add(category)

    # sort unique categories alphabetically
    unique_categories = sorted(unique_categories)

    # create a binary mapping of all unique categories in train
    for column in unique_categories:
        train[column] = train['category'].apply(lambda x: 1 if column == x else 0)

    # create a binary mapping of all unique categories in test
    for column in unique_categories:
        test[column] = test['category'].apply(lambda x: 1 if column == x else 0)
    
    # drop columns
    train.drop(columns=['category', 'licensed_content'], inplace=True)
    test.drop(columns=['category', 'licensed_content'], inplace=True)

    return train, test

In [122]:
def rob_scaler(train, test, target_train, target_test, columns_to_scale, other_columns):

    # train
    train_scale = train[columns_to_scale] ## columns to scale
    train_not_scale = train[other_columns] ## remaining columns

    rob = RobustScaler() ## scaler object
    rob.fit(train_scale) ## fit the scaler with train data

    train_rob = pd.DataFrame(rob.transform(train_scale), columns=train_scale.columns) ## create df with transformed training data

    train_not_scale.reset_index(drop=True, inplace=True) ## Reset X_train index
    target_train.reset_index(drop=True, inplace=True) ## Reset y_train index

    train_fe_rob = pd.concat([train_not_scale, train_rob], axis=1) ## concatenate scaled data with remaining columns

    # test
    test_scale = test[columns_to_scale] ## columns to scale
    test_not_scale = test[other_columns] ## remaining columns

    test_rob = pd.DataFrame(rob.transform(test_scale), columns=test_scale.columns) ## create df with transformed test data

    test_not_scale.reset_index(drop=True, inplace=True) ## Reset X_test index
    target_test.reset_index(drop=True, inplace=True) ## Reset y_test index

    test_fe_rob = pd.concat([test_not_scale, test_rob], axis=1) ## concatenate scaled data with remaining columns

    return train_fe_rob, test_fe_rob, target_train, target_test

In [123]:
# list of numerical columns to scale
scale_columns = ['subscribers', 'total_views', 'total_videos', 'no_of_tags', 'title_length', 'description_length', 'age', 'duration_minutes']

# list of remaining columns
non_scale_columns = ['licensed_map', 'Education', 'Entertainment', 'Film & Animation', 'Gaming','Howto & Style',
                     'Music', 'People & Blogs', 'Science & Technology', 'Sports', 'Travel & Events']

In [124]:
# one hot encode / map selected categorical features
X_train_fe, X_test_fe = ohe(X_train_numerical, X_test_numerical)

In [125]:
# scale selected continuous features
X_train_fe_rob, X_test_fe_rob, y_train_fe, y_test_fe = rob_scaler(X_train_fe, X_test_fe, y_train_numerical, y_test_numerical, scale_columns, non_scale_columns)

# Ensemble Modelling

In [126]:
# create a df of all predicted probabilities of all models, on train
title_train_pred = pd.DataFrame(nlp_title_model.predict_proba(X_train_vec_title))
description_train_pred = pd.DataFrame(nlp_description_model.predict_proba(X_train_vec_des))
numerical_train_pred = pd.DataFrame(numerical_data_model.predict_proba(X_train_fe_rob))

# concat all predict_proba dataframes
X_train_ensemble = pd.concat([title_train_pred, description_train_pred, numerical_train_pred], axis=1)

# create ensemble model object
ensemble_clf = VotingClassifier(estimators=[
    ('nlp_model1', nlp_title_model),
    ('nlp_model2', nlp_description_model),
    ('numeric_model', numerical_data_model),
], voting='soft')  # Use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# fit ensemble model on train data
ensemble_clf.fit(X_train_ensemble, y_train_title)


In [127]:
# create a df of all predicted probabilities of all models, on test
title_test_pred = pd.DataFrame(nlp_title_model.predict_proba(X_test_vec_title))
description_test_pred = pd.DataFrame(nlp_description_model.predict_proba(X_test_vec_des))
numerical_test_pred = pd.DataFrame(numerical_data_model.predict_proba(X_test_fe_rob))

# concat all predict_proba dataframes
X_test_ensemble = pd.concat([title_test_pred, description_test_pred, numerical_test_pred], axis=1)

In [128]:
# function to calculate the accuracy, precision and recall
def apr(y_pred, y_real):       
    
    """ Calculates accuracy, precision, recall
        Requires predicted value first, and then the real value
    """
    accuracy = metrics.accuracy_score(y_real, y_pred)
    precision = metrics.precision_score(y_real, y_pred)
    recall = metrics.recall_score(y_real, y_pred)
    f1 = metrics.f1_score(y_real, y_pred)

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")

In [129]:
# predict on train data
y_train_pred_ensemble = ensemble_clf.predict(X_train_ensemble)

# check train data metric scores
apr(y_train_pred_ensemble, y_train_title)

Accuracy:0.8247831838894605
Precision:0.8182606183184051
Recall:0.8341678939617084
F1:0.8261376896149359


In [130]:
# predict on test data
y_test_pred_ensemble = ensemble_clf.predict(X_test_ensemble)

# check test data metric scores
apr(y_test_pred_ensemble, y_test_title)

Accuracy:0.8089359200470312
Precision:0.7972972972972973
Recall:0.8300117233294255
F1:0.813325674899483


# Ensemble with RF Num

In [131]:
with open('numerical_rf.pkl', 'rb') as file:
    num_rf = pickle.load(file)

In [132]:
# create a df of all predicted probabilities of all models, on train
title_train_pred = pd.DataFrame(nlp_title_model.predict_proba(X_train_vec_title))
description_train_pred = pd.DataFrame(nlp_description_model.predict_proba(X_train_vec_des))
numerical_train_pred = pd.DataFrame(num_rf.predict_proba(X_train_fe_rob))

# concat all predict_proba dataframes
X_train_ensemble = pd.concat([title_train_pred, description_train_pred, numerical_train_pred], axis=1)

# create ensemble model object
ensemble_clf = VotingClassifier(estimators=[
    ('nlp_model1', nlp_title_model),
    ('nlp_model2', nlp_description_model),
    ('numeric_model', num_rf),
], voting='soft')  # Use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# fit ensemble model on train data
ensemble_clf.fit(X_train_ensemble, y_train_title)

In [133]:
# create a df of all predicted probabilities of all models, on test
title_test_pred = pd.DataFrame(nlp_title_model.predict_proba(X_test_vec_title))
description_test_pred = pd.DataFrame(nlp_description_model.predict_proba(X_test_vec_des))
numerical_test_pred = pd.DataFrame(num_rf.predict_proba(X_test_fe_rob))

# concat all predict_proba dataframes
X_test_ensemble = pd.concat([title_test_pred, description_test_pred, numerical_test_pred], axis=1)

In [134]:
# predict on train data
y_train_pred_ensemble = ensemble_clf.predict(X_train_ensemble)

# check train data metric scores
apr(y_train_pred_ensemble, y_train_title)

Accuracy:0.848302219608996
Precision:0.8394139615053146
Recall:0.8606774668630339
F1:0.8499127399650961


In [135]:
# predict on test data
y_test_pred_ensemble = ensemble_clf.predict(X_test_ensemble)

# check test data metric scores
apr(y_test_pred_ensemble, y_test_title)

Accuracy:0.8289241622574955
Precision:0.8122222222222222
Recall:0.8569753810082064
F1:0.8339988590986879


# Ensemble with RF Num and Ensemble Title

In [138]:
with open('ensemble_title_model.pkl', 'rb') as file:
    ensemble_title = pickle.load(file)

In [139]:
with open('ensemble_title_train.pkl', 'rb') as file:
    ensemble_train = pickle.load(file)

In [140]:
with open('ensemble_title_test.pkl', 'rb') as file:
    ensemble_test = pickle.load(file)

In [141]:
# create a df of all predicted probabilities of all models, on train
title_train_pred = pd.DataFrame(ensemble_title.predict_proba(ensemble_train))
description_train_pred = pd.DataFrame(nlp_description_model.predict_proba(X_train_vec_des))
numerical_train_pred = pd.DataFrame(num_rf.predict_proba(X_train_fe_rob))

# concat all predict_proba dataframes
X_train_ensemble = pd.concat([title_train_pred, description_train_pred, numerical_train_pred], axis=1)

# create ensemble model object
ensemble_clf = VotingClassifier(estimators=[
    ('nlp_model1', ensemble_title),
    ('nlp_model2', nlp_description_model),
    ('numeric_model', num_rf),
], voting='soft')  # Use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# fit ensemble model on train data
ensemble_clf.fit(X_train_ensemble, y_train_title)

In [142]:
# create a df of all predicted probabilities of all models, on test
title_test_pred = pd.DataFrame(ensemble_title.predict_proba(ensemble_test))
description_test_pred = pd.DataFrame(nlp_description_model.predict_proba(X_test_vec_des))
numerical_test_pred = pd.DataFrame(num_rf.predict_proba(X_test_fe_rob))

# concat all predict_proba dataframes
X_test_ensemble = pd.concat([title_test_pred, description_test_pred, numerical_test_pred], axis=1)

In [143]:
# predict on train data
y_train_pred_ensemble = ensemble_clf.predict(X_train_ensemble)

# check train data metric scores
apr(y_train_pred_ensemble, y_train_title)

Accuracy:0.8478612376892547
Precision:0.836950314106225
Recall:0.8633284241531665
F1:0.8499347542409744


In [144]:
# predict on test data
y_test_pred_ensemble = ensemble_clf.predict(X_test_ensemble)

# check test data metric scores
apr(y_test_pred_ensemble, y_test_title)

Accuracy:0.8330393885949442
Precision:0.8157602663706992
Recall:0.861664712778429
F1:0.838084378563284
