# Import Libraries

In [701]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Importing Data

In [702]:
# Load the pickled DataFrame (presumably produced by an earlier EDA step - confirm).
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only open pickle files that come from a trusted source.
with open('new_eda_data.pkl', 'rb') as file:
    df = pickle.load(file)

In [703]:
# adjust display options to show all columns
pd.set_option('display.max_columns', None)

# Explore numerical and binary columns

In [704]:
df.head(2)

Unnamed: 0,video_id,channel_id,title,description,tags,caption,licensed_content,view_count,like_count,comment_count,channel_name,subscribers,total_views,total_videos,playlist_id,category,duration_formatted,published_at_formatted,no_of_tags,title_length,description_length,target,age,duration_minutes
0,3fqTNzXY5tg,UCvZnwzmc3m1Eush-Or8Z6DA,Using Code and GPT-3 to Learn Faster,Thanks to ProjectPro.io for their support: htt...,[],True,True,6871,184,23,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA,Education,0 days 00:18:06,2023-02-19 14:00:02,0,36,795,1,336,18
1,bgVu5WVR9SE,UCvZnwzmc3m1Eush-Or8Z6DA,Data Analyst MENTORSHIP - Q&A (while I drink ...,⬇️⬇️⬇️Check here prior to asking your question...,[],True,True,3723,184,9,Shashank Kalanithi,140000,6562136,152,UUvZnwzmc3m1Eush-Or8Z6DA,Education,0 days 00:29:22,2022-11-04 03:32:38,0,53,1475,0,444,29


In [705]:
df.dtypes

video_id                           string
channel_id                         string
title                              string
description                        string
tags                               object
caption                              bool
licensed_content                     bool
view_count                          int64
like_count                          int64
comment_count                       int64
channel_name                       string
subscribers                         int64
total_views                         int64
total_videos                        int64
playlist_id                        string
category                           object
duration_formatted        timedelta64[ns]
published_at_formatted     datetime64[ns]
no_of_tags                          int64
title_length                        int64
description_length                  int64
target                              int64
age                                 int64
duration_minutes                  

In [706]:
df.caption.value_counts()

True    8504
Name: caption, dtype: int64

In [707]:
df.licensed_content.value_counts()

True     4448
False    4056
Name: licensed_content, dtype: int64

In [708]:
df.dtypes

video_id                           string
channel_id                         string
title                              string
description                        string
tags                               object
caption                              bool
licensed_content                     bool
view_count                          int64
like_count                          int64
comment_count                       int64
channel_name                       string
subscribers                         int64
total_views                         int64
total_videos                        int64
playlist_id                        string
category                           object
duration_formatted        timedelta64[ns]
published_at_formatted     datetime64[ns]
no_of_tags                          int64
title_length                        int64
description_length                  int64
target                              int64
age                                 int64
duration_minutes                  

In [709]:
df.columns

Index(['video_id', 'channel_id', 'title', 'description', 'tags', 'caption',
       'licensed_content', 'view_count', 'like_count', 'comment_count',
       'channel_name', 'subscribers', 'total_views', 'total_videos',
       'playlist_id', 'category', 'duration_formatted',
       'published_at_formatted', 'no_of_tags', 'title_length',
       'description_length', 'target', 'age', 'duration_minutes'],
      dtype='object')

In [710]:
# columns kept for the numerical model (features plus the 'target' label)
numerical_model_columns = [
    'licensed_content',
    'subscribers',
    'total_views',
    'total_videos',
    'category',
    'no_of_tags',
    'title_length',
    'description_length',
    'target',
    'age',
    'duration_minutes',
]
df = df[numerical_model_columns]

# Train/Test Split

In [711]:
# target: the label column
y = df['target']
# features: every remaining column
X = df.drop(columns='target')

# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [712]:
X_train.shape

(6803, 10)

In [713]:
(X_train.index == y_train.index).sum()

6803

# Feature Engineering

In [714]:
# create function to one hot encode the train and test features respectively
def ohe(train, test):
    """One-hot encode ``category`` and map ``licensed_content`` to 0/1.

    The set of categories is learned from ``train`` only and applied to both
    frames, so the two results always share the same indicator columns. A
    category that occurs only in ``test`` therefore gets all-zero indicators.

    The inputs are left untouched: encoding happens on copies, which keeps the
    calling cell re-runnable and avoids writing into slices of another frame.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames containing ``licensed_content`` and ``category``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)`` with a ``licensed_map`` column and
        one 0/1 column per training category (alphabetical order) appended,
        and ``category`` / ``licensed_content`` removed.
    """
    # vocabulary comes from the training data only, sorted alphabetically
    unique_categories = sorted(set(train['category']))

    def _encode(frame):
        # work on a copy so the caller's frame is never modified
        encoded = frame.copy()

        # map licensed content to binary
        encoded['licensed_map'] = encoded['licensed_content'].eq(True).astype('int64')

        # one 0/1 indicator column per training category
        for column in unique_categories:
            encoded[column] = encoded['category'].eq(column).astype('int64')

        return encoded.drop(columns=['category', 'licensed_content'])

    return _encode(train), _encode(test)
    

In [715]:
def rob_scaler(train, test, target_train, target_test, columns_to_scale, other_columns):
    """Scale selected columns with a RobustScaler fitted on the training data.

    The scaler is fitted on ``train[columns_to_scale]`` only and then applied
    to both frames. Each result holds ``other_columns`` (unchanged) followed
    by the scaled columns, on a fresh 0..n-1 index. The targets are returned
    with the same fresh index so they stay row-aligned with the features.

    None of the inputs is modified; re-indexed copies are returned instead.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames containing ``columns_to_scale`` and ``other_columns``.
    target_train, target_test : pandas.Series
        Labels in the same row order as ``train`` / ``test``.
    columns_to_scale : list
        Columns to pass through the scaler.
    other_columns : list
        Columns to keep as they are.

    Returns
    -------
    tuple
        ``(train_features, test_features, target_train, target_test)``.
    """
    rob = RobustScaler()  ## scaler object
    rob.fit(train[columns_to_scale])  ## fit the scaler with train data only

    def _scale(features):
        # transform() returns a plain array, so rebuild a frame with the column names
        to_scale = features[columns_to_scale]
        scaled = pd.DataFrame(rob.transform(to_scale), columns=to_scale.columns)

        # the scaled frame has a 0..n-1 index; give the untouched columns the
        # same index so concat pairs rows by position
        not_scaled = features[other_columns].reset_index(drop=True)

        return pd.concat([not_scaled, scaled], axis=1)

    return (
        _scale(train),
        _scale(test),
        target_train.reset_index(drop=True),
        target_test.reset_index(drop=True),
    )

In [716]:
# numerical columns that get scaled
scale_columns = [
    'subscribers',
    'total_views',
    'total_videos',
    'no_of_tags',
    'title_length',
    'description_length',
    'age',
    'duration_minutes',
]

# columns kept unscaled: the licensed flag plus the category indicator columns
# (these names must match the columns ohe creates from the training categories)
non_scale_columns = [
    'licensed_map',
    'Education',
    'Entertainment',
    'Film & Animation',
    'Gaming',
    'Howto & Style',
    'Music',
    'People & Blogs',
    'Science & Technology',
    'Sports',
    'Travel & Events',
]

In [717]:
# one hot encode / map selected categorical features
# (the set of category columns is derived from X_train inside ohe and applied to both frames)
X_train_fe, X_test_fe = ohe(X_train, X_test)

In [718]:
# scale selected continuous features
# (the scaler is fitted on the training features only, then applied to both sets;
#  y_train_fe / y_test_fe are the labels returned alongside the scaled features)
X_train_fe_rob, X_test_fe_rob, y_train_fe, y_test_fe = rob_scaler(X_train_fe, X_test_fe, y_train, y_test, scale_columns, non_scale_columns)

# Modelling

In [719]:
# create model object
lr = LogisticRegression(max_iter=1000)
# fit model with the scaled training features and the labels that rob_scaler
# returned row-aligned with them (y_train_fe rather than the pre-scaling y_train)
lr.fit(X_train_fe_rob, y_train_fe)

In [720]:
# function to calculate and print the accuracy, precision, recall and F1 score
def apr(y_pred, y_real):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Note the argument order: predicted labels first, true labels second.
    Nothing is returned; the four scores are written to stdout.
    """
    scorers = (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
    # every scorer takes (true labels, predicted labels)
    accuracy, precision, recall, f1 = (scorer(y_real, y_pred) for scorer in scorers)

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")

In [721]:
# predict on train data
y_train_pred = lr.predict(X_train_fe_rob)

# check train data metric scores against the labels returned by rob_scaler,
# which are row-aligned with X_train_fe_rob
apr(y_train_pred, y_train_fe)

Accuracy:0.7674555343230928
Precision:0.7831927522649172
Recall:0.738438880706922
F1:0.7601576713159491


In [722]:
# predict on test data
y_test_pred = lr.predict(X_test_fe_rob)

# check test data metric scores against the labels returned by rob_scaler,
# which are row-aligned with X_test_fe_rob
apr(y_test_pred, y_test_fe)

Accuracy:0.7677836566725456
Precision:0.7785888077858881
Recall:0.7502930832356389
F1:0.7641791044776121


# Pickle Model

In [723]:
# Persist the fitted logistic regression model.
# NOTE(review): the scaler fitted inside rob_scaler is not returned or saved, so this
# pickle alone cannot reproduce the scaling needed for new raw data.
with open('numerical_data_model.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)