# Import Libraries

In [1]:
import pickle
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Importing Data

In [2]:
with open('new_eda_data.pkl', 'rb') as file:
    df = pickle.load(file)

In [3]:
df.columns

Index(['video_id', 'channel_id', 'title', 'description', 'tags', 'caption',
       'licensed_content', 'view_count', 'like_count', 'comment_count',
       'channel_name', 'subscribers', 'total_views', 'total_videos',
       'playlist_id', 'category', 'duration_formatted',
       'published_at_formatted', 'no_of_tags', 'title_length',
       'description_length', 'target', 'age', 'duration_minutes'],
      dtype='object')

In [4]:
# select columns for modelling
df_title = df[['title', 'target']]

In [5]:
df_title

Unnamed: 0,title,target
0,Using Code and GPT-3 to Learn Faster,1
1,Data Analyst MENTORSHIP - Q&A (while I drink ...,0
2,How Data Science ACTUALLY Works,1
3,Does Instagram think you live in an influentia...,0
4,Data Analyst MENTORSHIP - Q&A (while I drink ...,0
...,...,...
9317,Data Analyst Resume | Reviewing My Resume! | F...,1
9318,Working at a Big Company Vs Small Company | To...,1
9319,Data Analyst Salary | 100k with No Experience,1
9320,Truth About Big Companies | Told by a Fortune ...,1


In [6]:
df_title.reset_index(drop=True, inplace=True)
df_title

Unnamed: 0,title,target
0,Using Code and GPT-3 to Learn Faster,1
1,Data Analyst MENTORSHIP - Q&A (while I drink ...,0
2,How Data Science ACTUALLY Works,1
3,Does Instagram think you live in an influentia...,0
4,Data Analyst MENTORSHIP - Q&A (while I drink ...,0
...,...,...
8499,Data Analyst Resume | Reviewing My Resume! | F...,1
8500,Working at a Big Company Vs Small Company | To...,1
8501,Data Analyst Salary | 100k with No Experience,1
8502,Truth About Big Companies | Told by a Fortune ...,1


# Train, Test, Split

In [7]:
# seperate feature and target columns
X = df_title['title']

y = df_title['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [10]:
X_train.shape

(6803,)

In [11]:
# check index between X_train and y_train match
(X_train.index == y_train.index).sum()

6803

# Feature Engineering

In [8]:
# create vectorizer object
vectorizer_lr = TfidfVectorizer(stop_words='english', lowercase=True)

# fit vectorizer with training data and transform
X_train_vec_lr = vectorizer_lr.fit_transform(X_train)

# transform test data
X_test_vec_lr = vectorizer_lr.transform(X_test)

In [9]:
# create vectorizer object for Naive Bayes Multinominal
vectorizer_nbm = CountVectorizer()

# fit vectorizer with training data and transform
X_train_vec_nbm = vectorizer_nbm.fit_transform(X_train)

# transform test data
X_test_vec_nbm = vectorizer_nbm.transform(X_test)

In [10]:
# create vectorizer object for Naive Bayes Bernoulli
vectorizer_nbb = CountVectorizer(binary=True)

# fit vectorizer with training data and transform
X_train_vec_nbb = vectorizer_nbb.fit_transform(X_train)

# transform test data
X_test_vec_nbb = vectorizer_nbb.transform(X_test)

# Modelling - Logistic Regression 

In [15]:
# create logistic regression object and fit it with training data
logreg = LogisticRegression()
logreg.fit(X_train_vec_lr, y_train)

In [15]:
# function to calculate the accuracy, precision and recall
def apr(y_pred, y_real):       
    
    """ Calculates accuracy, precision, recall
        Requires predicted value first, and then the real value
    """
    accuracy = metrics.accuracy_score(y_real, y_pred)
    precision = metrics.precision_score(y_real, y_pred)
    recall = metrics.recall_score(y_real, y_pred)
    f1 = metrics.f1_score(y_real, y_pred)

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")

In [17]:
# predict on train data
y_train_pred_lr = logreg.predict(X_train_vec_lr)

# check train data metric scores
apr(y_train_pred_lr, y_train)

Accuracy:0.8243422019697192
Precision:0.8225806451612904
Recall:0.8262150220913107
F1:0.8243938280675973


In [18]:
# predict on test data
y_test_pred_lr = logreg.predict(X_test_vec_lr)

# check test data metric scores
apr(y_test_pred_lr, y_test)

Accuracy:0.7278071722516167
Precision:0.7161862527716186
Recall:0.757327080890973
F1:0.7361823361823362


## Feature Importance

In [19]:
# sanity check
len(logreg.coef_[0]) == len(vectorizer_lr.get_feature_names_out())

True

In [20]:
# vectorized feature names
feature_names = vectorizer_lr.get_feature_names_out()
# corresponding coefficients of feature names above
coef = logreg.coef_[0]

# map feature names to respective coefficients
feature_coef = list(zip(feature_names, coef))
feature_coef[:10]

[('000', -0.6147072562980712),
 ('01', 0.19552374571433892),
 ('02', -0.04358201104157373),
 ('03', 0.32050517505322995),
 ('04', 0.33164561950648613),
 ('05', 0.5162283821572962),
 ('06', 0.24022341026122826),
 ('07', 0.3976957660283289),
 ('08', -0.1702941861200822),
 ('09', 0.14544417255792563)]

In [21]:
# create dictionary from mapped list of features and coefficients
feature_dict = dict(feature_coef)

# select top 10 features by their coef
top_10 = dict(sorted(feature_dict.items(), key=lambda item: item[1], reverse=True)[:10])
# select bottom 10 features by their coef
bottom_10 = dict(sorted(feature_dict.items(), key=lambda item: item[1], reverse=True)[-10:])

In [22]:
# visualise top 10 features
for key, value in top_10.items():
    print(f'Feature: {key}, Score: {value:.5f}')

Feature: clearly, Score: 2.88747
Feature: hindi, Score: 2.64598
Feature: end, Score: 2.56185
Feature: analyst, Score: 2.15297
Feature: complete, Score: 2.04772
Feature: learn, Score: 2.01902
Feature: deep, Score: 1.95431
Feature: statquest, Score: 1.92681
Feature: iot, Score: 1.92421
Feature: 2022, Score: 1.90396


In [23]:
# visualise bottom 10 features
for key, value in bottom_10.items():
    print(f'Feature: {key}, Score: {value:.5f}')

Feature: coffee, Score: -1.85975
Feature: revelation, Score: -1.96561
Feature: impact, Score: -2.12922
Feature: accountability, Score: -2.23032
Feature: marketing, Score: -2.38663
Feature: mentorship, Score: -2.84838
Feature: study, Score: -3.14528
Feature: bootcamp, Score: -3.19707
Feature: great, Score: -3.46749
Feature: tutorial, Score: -3.53311


## Hyperparameter Tuninig

In [24]:
# create function for optimum feature selection for model
def max_features(low, high, step):

    max_list = list(range(low,(high+1),step))
    train_accuracy = []
    test_accuracy = []

    for x in max_list:
        
        vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=x)

        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        classifier = LogisticRegression()
        classifier.fit(X_train_vec, y_train)

        y_pred_train = classifier.predict(X_train_vec)
        acc = accuracy_score(y_train, y_pred_train)
        train_accuracy.append(acc)

        y_pred_test = classifier.predict(X_test_vec)
        acc2 = accuracy_score(y_test, y_pred_test)
        test_accuracy.append(acc2)

    data = {'max_features': max_list, 'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy}

    max_df = pd.DataFrame(data)

    max_df['diff'] = max_df.train_accuracy - max_df.test_accuracy

    return max_df

In [25]:
# check range of max features between 200 - 1000 incremented by 50s
max_features(200,1000,50)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,200,0.712186,0.688419,0.023767
1,250,0.721299,0.691946,0.029354
2,300,0.727032,0.698413,0.028619
3,350,0.734676,0.696061,0.038615
4,400,0.737616,0.699588,0.038027
5,450,0.743496,0.714286,0.02921
6,500,0.748052,0.712522,0.03553
7,550,0.750845,0.71017,0.040675
8,600,0.756284,0.714874,0.04141
9,650,0.759224,0.712522,0.046702


In [26]:
# check range of max features between 400 - 500 incremented in 10s
max_features(400,500,10)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,400,0.737616,0.699588,0.038027
1,410,0.740703,0.702528,0.038175
2,420,0.742908,0.706055,0.036852
3,430,0.742467,0.706643,0.035823
4,440,0.742614,0.708995,0.033619
5,450,0.743496,0.714286,0.02921
6,460,0.744524,0.708995,0.03553
7,470,0.745553,0.710758,0.034795
8,480,0.747317,0.711934,0.035383
9,490,0.748052,0.711346,0.036706


In [27]:
# check range of max features between 440 - 460 incremented in 1s
max_features(440,460,1)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,440,0.742614,0.708995,0.033619
1,441,0.742761,0.708995,0.033766
2,442,0.743349,0.71017,0.033178
3,443,0.743202,0.71017,0.033031
4,444,0.743496,0.710758,0.032737
5,445,0.743643,0.711934,0.031708
6,446,0.743496,0.711346,0.032149
7,447,0.743349,0.711934,0.031414
8,448,0.74379,0.711934,0.031855
9,449,0.743349,0.714286,0.029063


In [28]:
# find the minimum difference from the most granular test, to mitigate overfitting, by keeping diff in accuracy score below 3%
maxfe_df = max_features(440,460,1)
round(maxfe_df['diff'].min(),6)*100

2.9063

## Tuned Max Features Model

In [11]:
# create vectorizer object
vectorizer_lr = TfidfVectorizer(stop_words='english', lowercase=True, max_features=449)

# fit vectorizer with training data and transform
X_train_vec_lr = vectorizer_lr.fit_transform(X_train)

# transform test data
X_test_vec_lr = vectorizer_lr.transform(X_test)

In [12]:
# create logistic regression object and fit it with training data
logreg_tuned = LogisticRegression()
logreg_tuned.fit(X_train_vec_lr, y_train)

In [16]:
# predict on train data
y_train_pred_lr = logreg_tuned.predict(X_train_vec_lr)

# check train data metric scores
apr(y_train_pred_lr, y_train)

Accuracy:0.7433485227105688
Precision:0.7420017610801292
Recall:0.7446244477172312
F1:0.74331079094384


In [17]:
# predict on test data
y_test_pred_lr = logreg_tuned.predict(X_test_vec_lr)

# check test data metric scores
apr(y_test_pred_lr, y_test)

Accuracy:0.7142857142857143
Precision:0.7078142695356738
Recall:0.7327080890973037
F1:0.7200460829493087


## Pickle Model

In [18]:
with open ('nlp_title_model.pkl', 'wb') as file:
    pickle.dump(logreg, file)

NameError: name 'logreg' is not defined

# Modelling - Naive Bayes (Multinominal)

In [34]:
# train a naive bayes classifier
nb_m = MultinomialNB()
nb_m.fit(X_train_vec_nbm, y_train)

In [35]:
# predict on train data
y_train_pred_nbm = nb_m.predict(X_train_vec_nbm)

# check train data metric scores
apr(y_train_pred_nbm, y_train)

Accuracy:0.819491400852565
Precision:0.7959573886916144
Recall:0.858321060382916
F1:0.8259637188208616


In [36]:
# predict on test data
y_test_pred_nbm = nb_m.predict(X_test_vec_nbm)

# check test data metric scores
apr(y_test_pred_nbm, y_test)

Accuracy:0.7101704879482658
Precision:0.6829268292682927
Recall:0.7878077373974208
F1:0.7316276537833423


## Hyperparameter Tuning

In [37]:
# create function for optimum feature selection for vectorizer
def max_features_nbm(low, high, step):

    max_list = list(range(low,(high+1),step))
    train_accuracy = []
    test_accuracy = []

    for x in max_list:
        
        vectorizer = CountVectorizer(max_features=x)

        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        classifier = MultinomialNB()
        classifier.fit(X_train_vec, y_train)

        y_pred_train = classifier.predict(X_train_vec)
        acc = accuracy_score(y_train, y_pred_train)
        train_accuracy.append(acc)

        y_pred_test = classifier.predict(X_test_vec)
        acc2 = accuracy_score(y_test, y_pred_test)
        test_accuracy.append(acc2)

    data = {'max_features': max_list, 'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy}

    max_df = pd.DataFrame(data)

    max_df['diff'] = max_df.train_accuracy - max_df.test_accuracy

    return max_df

In [38]:
max_features_nbm(200,1000,50)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,200,0.694988,0.690182,0.004805
1,250,0.698809,0.687831,0.010979
2,300,0.703072,0.688419,0.014654
3,350,0.706306,0.694297,0.012009
4,400,0.710275,0.699001,0.011274
5,450,0.713509,0.698413,0.015096
6,500,0.719535,0.701352,0.018183
7,550,0.72027,0.699001,0.02127
8,600,0.725268,0.699001,0.026268
9,650,0.728355,0.699588,0.028767


In [39]:
max_features_nbm(450,550,10)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,450,0.713509,0.698413,0.015096
1,460,0.715273,0.698413,0.01686
2,470,0.715714,0.699001,0.016713
3,480,0.717184,0.699588,0.017595
4,490,0.71836,0.701352,0.017007
5,500,0.719535,0.701352,0.018183
6,510,0.719095,0.700764,0.01833
7,520,0.719389,0.700764,0.018624
8,530,0.719535,0.699588,0.019947
9,540,0.720123,0.699588,0.020535


In [40]:
max_features_nbm(480,500,1)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,480,0.717184,0.699588,0.017595
1,481,0.717184,0.699588,0.017595
2,482,0.71689,0.699588,0.017301
3,483,0.717331,0.701352,0.015978
4,484,0.717478,0.701352,0.016125
5,485,0.718066,0.701352,0.016713
6,486,0.718654,0.701352,0.017301
7,487,0.717772,0.701352,0.016419
8,488,0.71836,0.701352,0.017007
9,489,0.718654,0.702528,0.016126


In [60]:
# find the minimum difference from the most granular test, to mitigate overfitting, by keeping diff in accuracy score below 3%
maxfe_df = max_features_nbm(480,500,1)
round(maxfe_df['diff'].min(),6)*100

1.5977999999999999

## Tuned Max Features Model

In [19]:
# create vectorizer object for Naive Bayes Multinominal
vectorizer_nbm = CountVectorizer(max_features=491)

# fit vectorizer with training data and transform
X_train_vec_nbm = vectorizer_nbm.fit_transform(X_train)

# transform test data
X_test_vec_nbm = vectorizer_nbm.transform(X_test)

In [20]:
# train a naive bayes classifier
nb_m_tuned = MultinomialNB()
nb_m_tuned.fit(X_train_vec_nbm, y_train)

In [21]:
# predict on train data
y_train_pred_nbm = nb_m_tuned.predict(X_train_vec_nbm)

# check train data metric scores
apr(y_train_pred_nbm, y_train)

Accuracy:0.7185065412318095
Precision:0.7019650655021834
Recall:0.7575846833578792
F1:0.7287151154554469


In [22]:
# predict on test data
y_test_pred_nbm = nb_m_tuned.predict(X_test_vec_nbm)

# check test data metric scores
apr(y_test_pred_nbm, y_test)

Accuracy:0.702527924750147
Precision:0.686358754027927
Recall:0.7491207502930832
F1:0.7163677130044843


# Modelling - Naive Bayes (Bernoulli)

In [49]:
# train a naive bayes classifier
nb_b = BernoulliNB()
nb_b.fit(X_train_vec_nbb, y_train)

In [50]:
# predict on train data
y_train_pred_nbb = nb_b.predict(X_train_vec_nbb)

# check train data metric scores
apr(y_train_pred_nbb, y_train)

Accuracy:0.8161105394678818
Precision:0.8021420518602029
Recall:0.8382916053019146
F1:0.8198185222526286


In [51]:
# predict on test data
y_test_pred_nbb = nb_b.predict(X_test_vec_nbb)

# check test data metric scores
apr(y_test_pred_nbb, y_test)

Accuracy:0.7148736037624926
Precision:0.6949152542372882
Recall:0.7690504103165299
F1:0.7301057317751808


## Hyperparameter Tuning

In [52]:
# create function for optimum feature selection for vectorizer
def max_features_nbb(low, high, step):

    max_list = list(range(low,(high+1),step))
    train_accuracy = []
    test_accuracy = []

    for x in max_list:
        
        # create vectorizer object for Naive Bayes Bernoulli
        vectorizer_nbb = CountVectorizer(binary=True, max_features=x)

        # fit vectorizer with training data and transform
        X_train_vec_nbb = vectorizer_nbb.fit_transform(X_train)

        # transform test data
        X_test_vec_nbb = vectorizer_nbb.transform(X_test)

        classifier = BernoulliNB()
        classifier.fit(X_train_vec_nbb, y_train)

        y_pred_train = classifier.predict(X_train_vec_nbb)
        acc = accuracy_score(y_train, y_pred_train)
        train_accuracy.append(acc)

        y_pred_test = classifier.predict(X_test_vec_nbb)
        acc2 = accuracy_score(y_test, y_pred_test)
        test_accuracy.append(acc2)

    data = {'max_features': max_list, 'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy}

    max_df = pd.DataFrame(data)

    max_df['diff'] = max_df.train_accuracy - max_df.test_accuracy

    return max_df

In [53]:
max_features_nbb(200,1000,50)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,200,0.694106,0.679012,0.015093
1,250,0.70072,0.680776,0.019944
2,300,0.707482,0.676073,0.031409
3,350,0.711745,0.685479,0.026266
4,400,0.715714,0.69371,0.022004
5,450,0.722328,0.700764,0.021564
6,500,0.726444,0.701352,0.025092
7,550,0.73056,0.701352,0.029208
8,600,0.735264,0.702528,0.032736
9,650,0.735705,0.703704,0.032001


In [56]:
max_features_nbb(400,500,10)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,400,0.715714,0.69371,0.022004
1,410,0.717331,0.695473,0.021857
2,420,0.717478,0.694297,0.02318
3,430,0.719682,0.694885,0.024797
4,440,0.72174,0.697237,0.024503
5,450,0.722328,0.700764,0.021564
6,460,0.722475,0.698413,0.024063
7,470,0.722769,0.700176,0.022593
8,480,0.723063,0.703116,0.019948
9,490,0.724827,0.704292,0.020536


In [57]:
max_features_nbb(470,490,1)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,470,0.722769,0.700176,0.022593
1,471,0.722916,0.700764,0.022152
2,472,0.72321,0.700764,0.022446
3,473,0.723063,0.701352,0.021711
4,474,0.723651,0.703116,0.020536
5,475,0.72321,0.702528,0.020682
6,476,0.72321,0.702528,0.020682
7,477,0.723063,0.703116,0.019948
8,478,0.723063,0.703116,0.019948
9,479,0.723063,0.703116,0.019948


In [59]:
# find the minimum difference from the most granular test, to mitigate overfitting, by keeping diff in accuracy score below 3%
maxfe_df = max_features_nbb(470,490,1)
round(maxfe_df['diff'].min(),6)*100

1.9948000000000001

## Tuned Max Features Model

In [23]:
# create vectorizer object for Naive Bayes Bernoulli
vectorizer_nbb = CountVectorizer(binary=True, max_features=480)

# fit vectorizer with training data and transform
X_train_vec_nbb = vectorizer_nbb.fit_transform(X_train)

# transform test data
X_test_vec_nbb = vectorizer_nbb.transform(X_test)

In [24]:
# train a naive bayes classifier
nb_b_tuned = BernoulliNB()
nb_b_tuned.fit(X_train_vec_nbb, y_train)

In [25]:
# predict on train data
y_train_pred_nbb = nb_b_tuned.predict(X_train_vec_nbb)

# check train data metric scores
apr(y_train_pred_nbb, y_train)

Accuracy:0.7230633544024695
Precision:0.7156722809020839
Recall:0.738438880706922
F1:0.7268773557552914


In [26]:
# predict on test data
y_test_pred_nbb = nb_b_tuned.predict(X_test_vec_nbb)

# check test data metric scores
apr(y_test_pred_nbb, y_test)

Accuracy:0.7031158142269254
Precision:0.6907894736842105
Recall:0.738569753810082
F1:0.7138810198300284


# Ensemble Naive Bayes

In [27]:
# create a df of all predicted probabilities of all models, on train
nbm_train_pred = pd.DataFrame(nb_m_tuned.predict_proba(X_train_vec_nbm))
nbb_train_pred = pd.DataFrame(nb_b_tuned.predict_proba(X_train_vec_nbb))
lr_train_pred = pd.DataFrame(logreg_tuned.predict_proba(X_train_vec_lr))
# numerical_train_pred = pd.DataFrame(numerical_data_model.predict_proba(X_train_fe_rob))

# concat all predict_proba dataframes
X_train_ensemble = pd.concat([nbm_train_pred, nbb_train_pred, lr_train_pred], axis=1)

# create ensemble model object
ensemble_clf_title = VotingClassifier(estimators=[
    ('nb_model1',nb_m_tuned),
    ('nb_model2', nb_b_tuned),
    ('lr_model', logreg_tuned)
], voting='soft')  # Use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# fit ensemble model on train data
ensemble_clf_title.fit(X_train_ensemble, y_train)


In [28]:
# create a df of all predicted probabilities of all models, on train
nbm_test_pred = pd.DataFrame(nb_m_tuned.predict_proba(X_test_vec_nbm))
nbb_test_pred = pd.DataFrame(nb_b_tuned.predict_proba(X_test_vec_nbb))
lr_test_pred = pd.DataFrame(logreg_tuned.predict_proba(X_test_vec_lr))
# numerical_train_pred = pd.DataFrame(numerical_data_model.predict_proba(X_train_fe_rob))

# concat all predict_proba dataframes
X_test_ensemble = pd.concat([nbm_test_pred, nbb_test_pred, lr_test_pred], axis=1)

In [29]:
# predict on test data
y_train_pred_ensemble = ensemble_clf_title.predict(X_train_ensemble)

# check train data metric scores
apr(y_train_pred_ensemble, y_train)

Accuracy:0.7426135528443334
Precision:0.7399299474605955
Recall:0.7466863033873343
F1:0.7432927723207742


In [30]:
# predict on test data
y_test_pred_ensemble = ensemble_clf_title.predict(X_test_ensemble)

# check train data metric scores
apr(y_test_pred_ensemble, y_test)

Accuracy:0.7166372721928278
Precision:0.7096045197740113
Recall:0.7362250879249707
F1:0.722669735327963


## Pickle Model

In [31]:
with open ('ensemble_title_model.pkl', 'wb') as file:
    pickle.dump(ensemble_clf_title, file)

In [32]:
with open ('ensemble_title_train.pkl', 'wb') as file:
    pickle.dump(X_train_ensemble, file)

In [33]:
with open ('ensemble_title_test.pkl', 'wb') as file:
    pickle.dump(X_test_ensemble, file)