# Import Libraries

In [2]:
import pickle
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Importing Data

In [3]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only load 'new_eda_data.pkl' if it comes from a trusted source.
with open('new_eda_data.pkl', 'rb') as file:
    df = pickle.load(file)

In [4]:
# list the available columns before choosing the modelling inputs
df.columns

Index(['video_id', 'channel_id', 'title', 'description', 'tags', 'caption',
       'licensed_content', 'view_count', 'like_count', 'comment_count',
       'channel_name', 'subscribers', 'total_views', 'total_videos',
       'playlist_id', 'category', 'duration_formatted',
       'published_at_formatted', 'no_of_tags', 'title_length',
       'description_length', 'target', 'age', 'duration_minutes'],
      dtype='object')

In [5]:
# select columns for modelling: the video title is the only input feature
# and 'target' is the label to predict
df_title = df[['title', 'target']]

In [6]:
# preview the modelling frame (title text plus its target label)
df_title

Unnamed: 0,title,target
0,Using Code and GPT-3 to Learn Faster,1
1,Data Analyst MENTORSHIP - Q&A (while I drink ...,0
2,How Data Science ACTUALLY Works,1
3,Does Instagram think you live in an influentia...,0
4,Data Analyst MENTORSHIP - Q&A (while I drink ...,0
...,...,...
9317,Data Analyst Resume | Reviewing My Resume! | F...,1
9318,Working at a Big Company Vs Small Company | To...,1
9319,Data Analyst Salary | 100k with No Experience,1
9320,Truth About Big Companies | Told by a Fortune ...,1


# Train, Test, Split

In [7]:
# separate feature and target columns
X = df_title['title']

y = df_title['target']

# hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [8]:
# number of titles that ended up in the training split
X_train.shape

(6803,)

In [12]:
# check the indexes of X_train and y_train match: the count of matching
# positions should equal the number of training rows
(X_train.index == y_train.index).sum()

6803

# Feature Engineering

In [88]:
# create vectorizer object: TF-IDF weights over lowercased titles with
# English stop words removed
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)

# fit vectorizer with training data only and transform it, so nothing is
# learned from the test titles
X_train_vec = vectorizer.fit_transform(X_train)

# transform test data using the vectorizer fitted on the training data
X_test_vec = vectorizer.transform(X_test)

# Modelling

In [84]:
# create logistic regression object (no hyperparameters overridden) and fit
# it with the vectorized training data
logreg = LogisticRegression()
logreg.fit(X_train_vec, y_train)

In [85]:
# function to calculate the accuracy, precision and recall
def apr(y_pred, y_real):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    The predictions are passed first and the true labels second; they are
    forwarded to the ``metrics`` scorers as ``(y_real, y_pred)``. Each score
    is printed on its own line as ``<Name>:<value>`` and nothing is returned.

    Parameters
    ----------
    y_pred : labels predicted by the model.
    y_real : true labels to compare the predictions against.
    """
    truth, predicted = y_real, y_pred

    # Compute every score before printing anything, so a failing scorer
    # never leaves a partially printed report.
    acc_score = metrics.accuracy_score(truth, predicted)
    prec_score = metrics.precision_score(truth, predicted)
    rec_score = metrics.recall_score(truth, predicted)
    f1_value = metrics.f1_score(truth, predicted)

    report_lines = (
        f"Accuracy:{acc_score}",
        f"Precision:{prec_score}",
        f"Recall:{rec_score}",
        f"F1:{f1_value}",
    )
    for line in report_lines:
        print(line)

In [86]:
# predict on train data (the same rows the model was fitted on)
y_train_pred = logreg.predict(X_train_vec)

# check train data metric scores (apr takes predictions first, then true labels)
apr(y_train_pred, y_train)

Accuracy:0.8243422019697192
Precision:0.8225806451612904
Recall:0.8262150220913107
F1:0.8243938280675973


In [87]:
# predict on test data (rows held out by train_test_split)
y_test_pred = logreg.predict(X_test_vec)

# check test data metric scores (apr takes predictions first, then true labels)
apr(y_test_pred, y_test)

Accuracy:0.7278071722516167
Precision:0.7161862527716186
Recall:0.757327080890973
F1:0.7361823361823362


# Feature Importance

In [89]:
# sanity check: the model should hold exactly one coefficient per feature
# name produced by the vectorizer
len(logreg.coef_[0]) == len(vectorizer.get_feature_names_out())

True

In [90]:
# feature names produced by the fitted vectorizer
feature_names = vectorizer.get_feature_names_out()
# first row of the fitted coefficient matrix: one weight per feature name
coef = logreg.coef_[0]

# pair every feature name with its coefficient, keeping the vectorizer's order
feature_coef = [(term, weight) for term, weight in zip(feature_names, coef)]
feature_coef[:10]

[('000', -0.6147072562980712),
 ('01', 0.19552374571433892),
 ('02', -0.04358201104157373),
 ('03', 0.32050517505322995),
 ('04', 0.33164561950648613),
 ('05', 0.5162283821572962),
 ('06', 0.24022341026122826),
 ('07', 0.3976957660283289),
 ('08', -0.1702941861200822),
 ('09', 0.14544417255792563)]

In [30]:
# create dictionary from mapped list of features and coefficients
feature_dict = dict(feature_coef)

# rank the features once, from most positive to most negative coefficient,
# and reuse that ranking for both slices instead of sorting twice
_ranked = sorted(feature_dict.items(), key=lambda item: item[1], reverse=True)

# select top 10 features by their coef
top_10 = dict(_ranked[:10])
# select bottom 10 features by their coef (kept in the same descending order)
bottom_10 = dict(_ranked[-10:])

In [36]:
# print the 10 features with the largest coefficients, largest first
for key, value in top_10.items():
    print(f'Feature: {key}, Score: {value:.5f}')

Feature: clearly, Score: 2.88747
Feature: hindi, Score: 2.64598
Feature: end, Score: 2.56185
Feature: analyst, Score: 2.15297
Feature: complete, Score: 2.04772
Feature: learn, Score: 2.01902
Feature: deep, Score: 1.95431
Feature: statquest, Score: 1.92681
Feature: iot, Score: 1.92421
Feature: 2022, Score: 1.90396


In [37]:
# print the 10 features with the smallest coefficients, ending with the
# most negative one
for key, value in bottom_10.items():
    print(f'Feature: {key}, Score: {value:.5f}')

Feature: coffee, Score: -1.85975
Feature: revelation, Score: -1.96561
Feature: impact, Score: -2.12922
Feature: accountability, Score: -2.23032
Feature: marketing, Score: -2.38663
Feature: mentorship, Score: -2.84838
Feature: study, Score: -3.14528
Feature: bootcamp, Score: -3.19707
Feature: great, Score: -3.46749
Feature: tutorial, Score: -3.53311


# Hyperparameter Tuning

In [60]:
# create function for optimum feature selection for model
def max_features(low, high, step, splits=None):
    """Sweep the TF-IDF ``max_features`` setting and report accuracy for each value.

    For every value in ``range(low, high + 1, step)`` a new ``TfidfVectorizer``
    (English stop words, lowercasing) is fitted on the fitting texts, a
    ``LogisticRegression`` is trained on the result, and accuracy is measured
    on both the fitting data and the evaluation data.

    Parameters
    ----------
    low, high : int
        First and last ``max_features`` values to try; ``high`` is included
        when the step lands on it.
    step : int
        Increment between successive values.
    splits : tuple, optional
        ``(X_fit, X_eval, y_fit, y_eval)``, in the same order that
        ``train_test_split`` returns them. When omitted, the notebook-level
        ``X_train, X_test, y_train, y_test`` are used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per value with columns ``max_features``, ``train_accuracy``,
        ``test_accuracy`` (accuracy on the evaluation data) and ``diff``
        (``train_accuracy - test_accuracy``).

    Notes
    -----
    With the default ``splits`` the evaluation data is the test split, so
    picking ``max_features`` from this table tunes the model on test data.
    Pass a separate validation split to keep the test split untouched.
    """
    if splits is None:
        # Backward-compatible default: the split created earlier in the notebook.
        splits = (X_train, X_test, y_train, y_test)
    X_fit, X_eval, y_fit, y_eval = splits

    rows = []
    for n_features in range(low, high + 1, step):
        # A fresh vectorizer per setting, fitted on the fitting texts only.
        vec = TfidfVectorizer(stop_words='english', lowercase=True, max_features=n_features)
        X_fit_vec = vec.fit_transform(X_fit)
        X_eval_vec = vec.transform(X_eval)

        clf = LogisticRegression()
        clf.fit(X_fit_vec, y_fit)

        rows.append({
            'max_features': n_features,
            'train_accuracy': accuracy_score(y_fit, clf.predict(X_fit_vec)),
            'test_accuracy': accuracy_score(y_eval, clf.predict(X_eval_vec)),
        })

    # Naming the columns explicitly keeps them present when the range is empty.
    max_df = pd.DataFrame(rows, columns=['max_features', 'train_accuracy', 'test_accuracy'])

    max_df['diff'] = max_df.train_accuracy - max_df.test_accuracy

    return max_df

In [61]:
# coarse sweep: max features from 200 to 1000 (both ends included) in steps of 50
max_features(200,1000,50)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,200,0.712186,0.688419,0.023767
1,250,0.721299,0.691946,0.029354
2,300,0.727032,0.698413,0.028619
3,350,0.734676,0.696061,0.038615
4,400,0.737616,0.699588,0.038027
5,450,0.743496,0.714286,0.02921
6,500,0.748052,0.712522,0.03553
7,550,0.750845,0.71017,0.040675
8,600,0.756284,0.714874,0.04141
9,650,0.759224,0.712522,0.046702


In [62]:
# narrower sweep: max features from 400 to 500 (both ends included) in steps of 10
max_features(400,500,10)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,400,0.737616,0.699588,0.038027
1,410,0.740703,0.702528,0.038175
2,420,0.742908,0.706055,0.036852
3,430,0.742467,0.706643,0.035823
4,440,0.742614,0.708995,0.033619
5,450,0.743496,0.714286,0.02921
6,460,0.744524,0.708995,0.03553
7,470,0.745553,0.710758,0.034795
8,480,0.747317,0.711934,0.035383
9,490,0.748052,0.711346,0.036706


In [63]:
# finest sweep: every max features value from 440 to 460 (both ends included)
max_features(440,460,1)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,440,0.742614,0.708995,0.033619
1,441,0.742761,0.708995,0.033766
2,442,0.743349,0.71017,0.033178
3,443,0.743202,0.71017,0.033031
4,444,0.743496,0.710758,0.032737
5,445,0.743643,0.711934,0.031708
6,446,0.743496,0.711346,0.032149
7,447,0.743349,0.711934,0.031414
8,448,0.74379,0.711934,0.031855
9,449,0.743349,0.714286,0.029063


In [91]:
# Re-run the most granular sweep and report the smallest train/test accuracy
# gap as a percentage. The aim is to mitigate overfitting by keeping the
# difference between train and test accuracy below 3%.
maxfe_df = max_features(440,460,1)
round(maxfe_df['diff'].min(),6)*100

2.9063

# Tuned Max Features Model

In [92]:
# create vectorizer object limited to 449 features: in the 440-460 sweep
# this is the setting whose train/test accuracy gap equals the reported minimum.
# NOTE: this rebinds vectorizer, X_train_vec and X_test_vec, replacing the
# untuned versions created in the Feature Engineering section.
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=449)

# fit vectorizer with training data and transform
X_train_vec = vectorizer.fit_transform(X_train)

# transform test data with the vectorizer fitted on the training data
X_test_vec = vectorizer.transform(X_test)

In [93]:
# create a new logistic regression object and fit it with the training data;
# this rebinds `logreg`, replacing the model fitted earlier in the notebook
logreg = LogisticRegression()
logreg.fit(X_train_vec, y_train)

In [94]:
# predict on train data with the refitted model (overwrites the earlier y_train_pred)
y_train_pred = logreg.predict(X_train_vec)

# check train data metric scores (apr takes predictions first, then true labels)
apr(y_train_pred, y_train)

Accuracy:0.7433485227105688
Precision:0.7420017610801292
Recall:0.7446244477172312
F1:0.74331079094384


In [95]:
# predict on test data with the refitted model (overwrites the earlier y_test_pred)
y_test_pred = logreg.predict(X_test_vec)

# check test data metric scores (apr takes predictions first, then true labels)
apr(y_test_pred, y_test)

Accuracy:0.7142857142857143
Precision:0.7078142695356738
Recall:0.7327080890973037
F1:0.7200460829493087
