# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate

# Import Data

In [2]:
# import data
with open('new_eda_data.pkl', 'rb') as file:
    df = pickle.load(file)

In [3]:
# adjust display options to show all columns
pd.set_option('display.max_columns', None)

In [4]:
df.reset_index(drop=True, inplace=True)

In [5]:
df

Unnamed: 0,video_id,channel_id,title,description,tags,caption,licensed_content,view_count,like_count,comment_count,channel_name,subscribers,total_views,total_videos,playlist_id,category,duration_formatted,published_at_formatted,no_of_tags,title_length,description_length,target,age,duration_minutes
0,3fqTNzXY5tg,UCvZnwzmc3m1Eush-Or8Z6DA,Using Code and GPT-3 to Learn Faster,Thanks to ProjectPro.io for their support: htt...,[],True,True,6871,184,23,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA,Education,0 days 00:18:06,2023-02-19 14:00:02,0,36,795,1,336,18
1,bgVu5WVR9SE,UCvZnwzmc3m1Eush-Or8Z6DA,Data Analyst MENTORSHIP - Q&A (while I drink ...,⬇️⬇️⬇️Check here prior to asking your question...,[],True,True,3723,184,9,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA,Education,0 days 00:29:22,2022-11-04 03:32:38,0,53,1475,0,444,29
2,4we3smhjAB8,UCvZnwzmc3m1Eush-Or8Z6DA,How Data Science ACTUALLY Works,Check out Deepnote for the easiest way to prac...,[],True,True,85152,2647,136,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA,Education,0 days 00:26:50,2022-11-01 16:30:09,0,31,1205,1,446,27
3,lpF5SSgczeE,UCvZnwzmc3m1Eush-Or8Z6DA,Does Instagram think you live in an influentia...,Request this and many other datasets @: https:...,[],True,True,4470,158,12,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA,Education,0 days 01:24:08,2022-10-25 14:00:07,0,81,1059,0,453,84
4,cRVM-LTe3fI,UCvZnwzmc3m1Eush-Or8Z6DA,Data Analyst MENTORSHIP - Q&A (while I drink ...,⬇️⬇️⬇️Check here prior to asking your question...,[],True,True,3078,104,4,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA,Education,0 days 00:26:55,2022-10-07 03:36:26,0,53,1475,0,472,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8499,4rfr6A3lO-Y,UC7cs8q-gJRlGwj4A8OmCmXg,Data Analyst Resume | Reviewing My Resume! | F...,Data Analyst Resume | Reviewing My Resume! | F...,"[Data Analyst, How to become a data analyst, D...",True,True,66316,1574,64,Alex The Analyst,647000,27519398,270,UU7cs8q-gJRlGwj4A8OmCmXg,Education,0 days 00:07:33,2020-01-30 14:07:55,15,69,1875,1,1452,8
8500,OTq2NRy_AGs,UC7cs8q-gJRlGwj4A8OmCmXg,Working at a Big Company Vs Small Company | To...,Working at a Big Company Vs Small Company | To...,"[Data Analyst, How to become a Data Analyst, B...",True,True,14527,397,20,Alex The Analyst,647000,27519398,270,UU7cs8q-gJRlGwj4A8OmCmXg,People & Blogs,0 days 00:05:50,2020-01-25 16:38:39,17,78,1548,1,1457,6
8501,ya28cb3zFGE,UC7cs8q-gJRlGwj4A8OmCmXg,Data Analyst Salary | 100k with No Experience,Data Analyst Salary | 100k with No Experience ...,"[Data Analyst Salary, Data analyst with no exp...",True,True,62166,2153,227,Alex The Analyst,647000,27519398,270,UU7cs8q-gJRlGwj4A8OmCmXg,People & Blogs,0 days 00:05:03,2020-01-23 03:16:09,14,45,1979,1,1460,5
8502,Hsi2BG0SOiQ,UC7cs8q-gJRlGwj4A8OmCmXg,Truth About Big Companies | Told by a Fortune ...,Truth About Big Companies // There are a ton o...,"[Working at a big company, Big company data an...",True,True,8157,305,18,Alex The Analyst,647000,27519398,270,UU7cs8q-gJRlGwj4A8OmCmXg,People & Blogs,0 days 00:05:45,2020-01-21 03:52:15,7,62,1589,1,1462,6


# Metrics

In [6]:
# function to calculate the accuracy, precision and recall
def apr(y_pred, y_real):       
    
    """ Calculates accuracy, precision, recall
        Requires predicted value first, and then the real value
    """
    accuracy = accuracy_score(y_real, y_pred)
    precision = precision_score(y_real, y_pred)
    recall = recall_score(y_real, y_pred)
    f1 = f1_score(y_real, y_pred)

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")

In [7]:
def cv_scorer(model, X, y, cv=5):
    # define the scoring functions
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score),
        'f1': make_scorer(f1_score)
    }

    # perform cross-validation
    cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)

    # calculate mean scores across cross-validation folds
    accuracy_mean = cv_results['test_accuracy'].mean()
    precision_mean = cv_results['test_precision'].mean()
    recall_mean = cv_results['test_recall'].mean()
    f1_mean = cv_results['test_f1'].mean()

    # print mean scores
    print(f'Mean Accuracy: {accuracy_mean}')
    print(f'Mean Precision: {precision_mean}')
    print(f'Mean Recall: {recall_mean}')
    print(f'Mean F1: {f1_mean}')

# Retrain Best Model on Entire Data

### NLP Title Model

In [8]:
X_train_title = df['title'].astype(str)

y_train = df['target']

In [9]:
X_train_title.dtype

dtype('O')

In [10]:
X_train_title

0                    Using Code and GPT-3 to Learn Faster
1       Data Analyst MENTORSHIP -  Q&A (while I drink ...
2                         How Data Science ACTUALLY Works
3       Does Instagram think you live in an influentia...
4       Data Analyst MENTORSHIP -  Q&A (while I drink ...
                              ...                        
8499    Data Analyst Resume | Reviewing My Resume! | F...
8500    Working at a Big Company Vs Small Company | To...
8501        Data Analyst Salary | 100k with No Experience
8502    Truth About Big Companies | Told by a Fortune ...
8503                    Top 3 Data Analyst Skills in 2020
Name: title, Length: 8504, dtype: object

In [129]:
# Define the TF-IDF vectorizer pipeline
pipeline_title = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression())  
])

# Define the grid of hyperparameters to search over
param_grid = {
    'tfidf__max_features': range(449, 461, 1), 
    'tfidf__stop_words': ['english',],
    'tfidf__lowercase': [True,]
}

# Initialize GridSearchCV
gs_title = GridSearchCV(pipeline_title, param_grid, cv=150, scoring='accuracy', verbose=1)

# Fit GridSearchCV to the data
gs_title.fit(X_train_title, y_train)

# Print best hyperparameters and mean cross-validated score
print("Best Hyperparameters:", gs_title.best_params_)
print("Best Mean Cross-validated Score:", gs_title.best_score_)


Fitting 150 folds for each of 12 candidates, totalling 1800 fits
Best Hyperparameters: {'tfidf__lowercase': True, 'tfidf__max_features': 460, 'tfidf__stop_words': 'english'}
Best Mean Cross-validated Score: 0.6725250626566416


In [125]:
# create vectorizer object
vectorizer_title = TfidfVectorizer(stop_words='english', lowercase=True, max_features=460)

# fit vectorizer with training data and transform
X_train_vec_title = vectorizer_title.fit_transform(X_train_title)

In [159]:
title_model = LogisticRegression()
title_model.fit(X_train_vec_title, y_train)

In [190]:
# predict on train data
y_train_pred_title = title_model.predict(X_train_vec_title)

# check train data metric scores
apr(y_train_pred_title, y_train)

Accuracy:0.7408278457196613
Precision:0.7373432419879238
Recall:0.7474105461393596
F1:0.7423427636193592


In [188]:
cv_scorer(title_model, X_train_vec_title, y_train)

Mean Accuracy: 0.534578898225957
Mean Precision: 0.5626586278601265
Mean Recall: 0.5738291415506132
Mean F1: 0.5374366370091983


In [219]:
cv_scorer(title_model, X_train_title, y_train, 150)

Mean Accuracy: 0.6725250626566416
Mean Precision: 0.6808676245611353
Mean Recall: 0.6788834154351394
Mean F1: 0.6633521596136502


### NLP Description Model

In [24]:
X_train_desc = df['description'].astype(str)

In [25]:
X_train_desc.dtype

dtype('O')

In [26]:
X_train_desc

0       Thanks to ProjectPro.io for their support: htt...
1       ⬇️⬇️⬇️Check here prior to asking your question...
2       Check out Deepnote for the easiest way to prac...
3       Request this and many other datasets @: https:...
4       ⬇️⬇️⬇️Check here prior to asking your question...
                              ...                        
8499    Data Analyst Resume | Reviewing My Resume! | F...
8500    Working at a Big Company Vs Small Company | To...
8501    Data Analyst Salary | 100k with No Experience ...
8502    Truth About Big Companies // There are a ton o...
8503    Top 3 Data Analyst Skills in 2020 // There are...
Name: description, Length: 8504, dtype: object

In [127]:
# Define the TF-IDF vectorizer pipeline
pipeline_desc = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression())  
])

# Define the grid of hyperparameters to search over
param_grid = {
    'tfidf__max_features': range(900, 1001, 10),  
    'tfidf__stop_words': ['english',],
    'tfidf__lowercase': [True,]
}

# Initialize GridSearchCV
gs_desc = GridSearchCV(pipeline_desc, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit GridSearchCV to the data
gs_desc.fit(X_train_desc, y_train)  

# Print best hyperparameters and mean cross-validated score
print("Best Hyperparameters:", gs_desc.best_params_)
print("Best Mean Cross-validated Score:", gs_desc.best_score_)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
Best Hyperparameters: {'tfidf__lowercase': True, 'tfidf__max_features': 910, 'tfidf__stop_words': 'english'}
Best Mean Cross-validated Score: 0.44838655462184873


In [133]:
# create vectorizer object
vectorizer_desc = TfidfVectorizer(stop_words='english', lowercase=True, max_features=910)

# fit vectorizer with training data and transform
X_train_vec_desc = vectorizer_desc.fit_transform(X_train_desc)

In [162]:
desc_model = LogisticRegression()
desc_model.fit(X_train_vec_desc, y_train)

In [191]:
# predict on train data
y_train_pred_desc = desc_model.predict(X_train_vec_desc)

# check train data metric scores
apr(y_train_pred_desc, y_train)

Accuracy:0.8223189087488241
Precision:0.812514272664992
Recall:0.8375706214689266
F1:0.8248522081836096


In [186]:
cv_scorer(desc_model, X_train_vec_desc, y_train)

Mean Accuracy: 0.4318073105785524
Mean Precision: 0.5072302680756426
Mean Recall: 0.345506270352664
Mean F1: 0.3189289966310275


In [174]:
cv_scorer(desc_model, X_train_desc, y_train, 150)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Mean Accuracy: 0.7682999164578111
Mean Precision: 0.7776398745007811
Mean Recall: 0.7772331691297208
Mean F1: 0.7514559582988234


### Numerical Random Forest Model

In [33]:
# create function to one hot encode the train and test features respectively
def ohe(data):

    # map licensed content to binary for train
    data['licensed_map'] = data.licensed_content.apply(lambda x: 1 if x == True else 0)

    # store unique categories
    unique_categories = set()

    # iterate through unique categories
    for category in data['category']:
        unique_categories.add(category)

    # sort unique categories alphabetically
    unique_categories = sorted(unique_categories)

    # create a binary mapping of all unique categories in train
    for column in unique_categories:
        data[column] = data['category'].apply(lambda x: 1 if column == x else 0)
    
    # drop columns
    data.drop(columns=['category', 'licensed_content'], inplace=True)

    return data

In [34]:
# select columns for numerical model
X_train_numerical = df[['licensed_content', 'subscribers', 'total_views', 'total_videos', 'category', 'no_of_tags', 'title_length',
       'description_length', 'age', 'duration_minutes']].copy()

In [35]:
X_train_numerical

Unnamed: 0,licensed_content,subscribers,total_views,total_videos,category,no_of_tags,title_length,description_length,age,duration_minutes
0,True,140000,6562136,152,Education,0,36,795,336,18
1,True,140000,6562136,152,Education,0,53,1475,444,29
2,True,140000,6562136,152,Education,0,31,1205,446,27
3,True,140000,6562136,152,Education,0,81,1059,453,84
4,True,140000,6562136,152,Education,0,53,1475,472,27
...,...,...,...,...,...,...,...,...,...,...
8499,True,647000,27519398,270,Education,15,69,1875,1452,8
8500,True,647000,27519398,270,People & Blogs,17,78,1548,1457,6
8501,True,647000,27519398,270,People & Blogs,14,45,1979,1460,5
8502,True,647000,27519398,270,People & Blogs,7,62,1589,1462,6


In [36]:
X_train_num_fe = ohe(X_train_numerical)

In [37]:
X_train_num_fe

Unnamed: 0,subscribers,total_views,total_videos,no_of_tags,title_length,description_length,age,duration_minutes,licensed_map,Autos & Vehicles,Comedy,Education,Entertainment,Film & Animation,Gaming,Howto & Style,Music,People & Blogs,Science & Technology,Sports,Travel & Events
0,140000,6562136,152,0,36,795,336,18,1,0,0,1,0,0,0,0,0,0,0,0,0
1,140000,6562136,152,0,53,1475,444,29,1,0,0,1,0,0,0,0,0,0,0,0,0
2,140000,6562136,152,0,31,1205,446,27,1,0,0,1,0,0,0,0,0,0,0,0,0
3,140000,6562136,152,0,81,1059,453,84,1,0,0,1,0,0,0,0,0,0,0,0,0
4,140000,6562136,152,0,53,1475,472,27,1,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8499,647000,27519398,270,15,69,1875,1452,8,1,0,0,1,0,0,0,0,0,0,0,0,0
8500,647000,27519398,270,17,78,1548,1457,6,1,0,0,0,0,0,0,0,0,1,0,0,0
8501,647000,27519398,270,14,45,1979,1460,5,1,0,0,0,0,0,0,0,0,1,0,0,0
8502,647000,27519398,270,7,62,1589,1462,6,1,0,0,0,0,0,0,0,0,1,0,0,0


In [38]:
# RobustScaler

# list of numerical columns to scale
scale_columns = ['subscribers', 'total_views', 'total_videos', 'no_of_tags', 'title_length', 'description_length', 'age', 'duration_minutes']

# list of remaining columns
non_scale_columns = ['licensed_map', 'Autos & Vehicles', 'Comedy', 'Education', 'Entertainment', 'Film & Animation', 'Gaming', 'Howto & Style', 'Music',
                     'People & Blogs', 'Science & Technology', 'Sports', 'Travel & Events']

rob = RobustScaler() ## scaler object
rob.fit(X_train_num_fe[scale_columns]) ## fit the scaler with train data

X_train_num_fe_rob = pd.DataFrame(rob.transform(X_train_num_fe[scale_columns]), columns=scale_columns) ## create df with transformed training data

# train_not_scale.reset_index(drop=True, inplace=True) ## Reset X_train index
# target_train.reset_index(drop=True, inplace=True) ## Reset y_train index

X_train_num_fe_rob = pd.concat([X_train_num_fe_rob, X_train_num_fe[non_scale_columns]], axis=1) ## concatenate scaled data with remaining columns

In [39]:
X_train_num_fe_rob.shape

(8504, 21)

In [40]:
y_train.shape

(8504,)

In [41]:
X_train_num_fe_rob

Unnamed: 0,subscribers,total_views,total_videos,no_of_tags,title_length,description_length,age,duration_minutes,licensed_map,Autos & Vehicles,Comedy,Education,Entertainment,Film & Animation,Gaming,Howto & Style,Music,People & Blogs,Science & Technology,Sports,Travel & Events
0,-0.177370,-0.301515,-0.948454,-0.733333,-0.737589,-0.773280,-1.045020,0.118343,1,0,0,1,0,0,0,0,0,0,0,0,0
1,-0.177370,-0.301515,-0.948454,-0.733333,-0.255319,-0.388339,-0.897681,0.378698,1,0,0,1,0,0,0,0,0,0,0,0,0
2,-0.177370,-0.301515,-0.948454,-0.733333,-0.879433,-0.541183,-0.894952,0.331361,1,0,0,1,0,0,0,0,0,0,0,0,0
3,-0.177370,-0.301515,-0.948454,-0.733333,0.539007,-0.623832,-0.885402,1.680473,1,0,0,1,0,0,0,0,0,0,0,0,0
4,-0.177370,-0.301515,-0.948454,-0.733333,-0.255319,-0.388339,-0.859482,0.331361,1,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8499,0.597859,0.084060,-0.867354,0.266667,0.198582,-0.161902,0.477490,-0.118343,1,0,0,1,0,0,0,0,0,0,0,0,0
8500,0.597859,0.084060,-0.867354,0.400000,0.453901,-0.347014,0.484311,-0.165680,1,0,0,0,0,0,0,0,0,1,0,0,0
8501,0.597859,0.084060,-0.867354,0.200000,-0.482270,-0.103029,0.488404,-0.189349,1,0,0,0,0,0,0,0,0,1,0,0,0
8502,0.597859,0.084060,-0.867354,-0.266667,0.000000,-0.323804,0.491132,-0.165680,1,0,0,0,0,0,0,0,0,1,0,0,0


In [168]:
rf = RandomForestClassifier(random_state=21)

# inputting the parameter values
rf_params = {        
    'max_depth': [7, 10, 13],
    'n_estimators': [220, 225, 230, 235, 240],
    'min_samples_split': [3, 4, 6, 8],
    'min_samples_leaf': [3, 5, 7]
}

# finding the optimal parameter values for the model
gs_num = GridSearchCV(rf, param_grid=rf_params, cv=5, scoring = 'accuracy', verbose=1, n_jobs=-1) # choosing accuracy as the best metric because our train data is class balanced

# fitting the train data into the model                                                                                        
gs_num.fit(X_train_num_fe_rob, y_train)                                           

# Print best hyperparameters and mean cross-validated score
print("Best Hyperparameters:", gs_num.best_params_)
print("Best Mean Cross-validated Score:", gs_num.best_score_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best Hyperparameters: {'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 3, 'n_estimators': 230}
Best Mean Cross-validated Score: 0.4291136701594218


In [42]:
num_model = RandomForestClassifier(max_depth=7, min_samples_leaf=5, min_samples_split=3,
                                   n_estimators=230, random_state=21)
num_model.fit(X_train_num_fe_rob, y_train)

In [196]:
# predict on train data
y_train_pred_num = num_model.predict(X_train_num_fe_rob)

# check train data metric scores
apr(y_train_pred_num, y_train)

Accuracy:0.8266698024459078
Precision:0.8254340685124355
Recall:0.8281544256120528
F1:0.8267920094007051


In [194]:
cv_scorer(num_model, X_train_num_fe_rob, y_train)

Mean Accuracy: 0.4291136701594218
Mean Precision: 0.4857820488175012
Mean Recall: 0.4215528303194069
Mean F1: 0.40124354144080854


In [172]:
cv_scorer(num_model, X_train_num_fe_rob, y_train, 150)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Mean Accuracy: 0.7676127819548872
Mean Precision: 0.7774109291654838
Mean Recall: 0.7637931034482758
Mean F1: 0.7314917010208706


### Ensemble Model

In [192]:
# create a df of all predicted probabilities of all models, on train
title_train_pred = pd.DataFrame(title_model.predict_proba(X_train_vec_title))
description_train_pred = pd.DataFrame(desc_model.predict_proba(X_train_vec_desc))
numerical_train_pred = pd.DataFrame(num_model.predict_proba(X_train_num_fe_rob))

# concat all predict_proba dataframes
X_train_ensemble = pd.concat([title_train_pred, description_train_pred, numerical_train_pred], axis=1)

# create ensemble model object
ensemble_clf = VotingClassifier(estimators=[
    ('nlp_model1', title_model),
    ('nlp_model2', desc_model),
    ('numeric_model', num_model),
], voting='soft')  # use 'hard' for majority voting, 'soft' for weighted voting based on probabilities

# fit ensemble model on train data
ensemble_clf.fit(X_train_ensemble, y_train)

In [197]:
# predict on train data
y_train_pred_ensemble = ensemble_clf.predict(X_train_ensemble)

# check train data metric scores
apr(y_train_pred_num, y_train)

Accuracy:0.8266698024459078
Precision:0.8254340685124355
Recall:0.8281544256120528
F1:0.8267920094007051


In [178]:
cv_scorer(ensemble_clf, X_train_ensemble, y_train)

Mean Accuracy: 0.8183107514610782
Mean Precision: 0.8474987651410834
Mean Recall: 0.8136828102265641
Mean F1: 0.8089009996589922


In [193]:
cv_scorer(ensemble_clf, X_train_ensemble, y_train, 10)

Mean Accuracy: 0.8232570678094975
Mean Precision: 0.8449079139529054
Mean Recall: 0.832980577136515
Mean F1: 0.8201473096894404
