# Importing Libraries

In [11]:
import pickle
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# Importing Data

In [12]:
# Load the pickled EDA dataframe from the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
with open('new_eda_data.pkl', mode='rb') as file:
    df = pickle.Unpickler(file).load()

In [13]:
# columns kept for the numerical model (features plus the 'target' label)
numerical_model_columns = [
    'licensed_content',
    'subscribers',
    'total_views',
    'total_videos',
    'category',
    'no_of_tags',
    'title_length',
    'description_length',
    'target',
    'age',
    'duration_minutes',
]

# restrict the frame to those columns, in the order listed above
df = df[numerical_model_columns]

# Train Test Split

In [14]:
# label column
y = df['target']
# feature matrix: every remaining column
X = df.drop(columns='target')

# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

# Feature Engineering

In [15]:
# one hot encode the categorical features of the train and test sets
def ohe(train, test):
    """One hot encode 'category' and binarise 'licensed_content' for both splits.

    The category columns are derived from ``train`` only and then created on
    ``test`` as well, so both outputs always share the same columns. A category
    that appears only in ``test`` gets no column of its own (every indicator is
    0 for those rows). Missing category values are ignored when the list of
    categories is built.

    Both inputs are copied before any column is added or dropped, so the
    frames passed in are left untouched and the call can be repeated safely.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing at least the 'licensed_content' and 'category'
        columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` with a 0/1 'licensed_map' column followed by one 0/1
        column per training category (sorted), and without the 'category' and
        'licensed_content' columns.
    """
    # work on copies: the inputs are typically slices of a larger frame, and
    # writing to them in place triggers SettingWithCopyWarning and makes a
    # second call fail because the source columns have already been dropped
    train = train.copy()
    test = test.copy()

    # unique categories seen in train, sorted
    unique_categories = sorted(train['category'].dropna().unique())

    encoded = []
    for frame in (train, test):
        # map licensed content to binary (anything not equal to True becomes 0)
        frame['licensed_map'] = frame['licensed_content'].eq(True).astype(int)

        # one binary indicator column per training category
        for column in unique_categories:
            frame[column] = frame['category'].eq(column).astype(int)

        # the source columns are no longer needed once encoded
        encoded.append(frame.drop(columns=['category', 'licensed_content']))

    train, test = encoded
    return train, test

In [16]:
def rob_scaler(train, test, target_train, target_test, columns_to_scale, other_columns):
    """Robust-scale selected feature columns of the train and test splits.

    One ``RobustScaler`` is fitted on the training rows only and then used to
    transform both splits. The scaled columns are joined back onto the
    untouched ``other_columns``. All four returned objects carry a fresh
    0..n-1 index so features and targets stay row-aligned.

    None of the arguments is modified: index resets are done on new objects
    rather than in place.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames containing ``columns_to_scale`` and ``other_columns``.
    target_train, target_test : pandas.Series or pandas.DataFrame
        Targets belonging to ``train`` / ``test``.
    columns_to_scale : list
        Column labels passed through the scaler.
    other_columns : list
        Column labels kept as they are.

    Returns
    -------
    tuple
        ``(train_fe_rob, test_fe_rob, target_train, target_test)``. The
        feature frames hold ``other_columns`` followed by the scaled
        ``columns_to_scale``; the targets are index-reset copies.
    """
    rob = RobustScaler()  ## scaler object
    rob.fit(train[columns_to_scale])  ## fit the scaler with train data only

    def _scale_and_join(features):
        """Transform the scaled columns of one split and re-attach the rest."""
        to_scale = features[columns_to_scale]
        # transform returns a bare array, so the frame built from it has a default index
        scaled = pd.DataFrame(rob.transform(to_scale), columns=to_scale.columns)
        # reset on a new object so the concat below lines up row by row
        untouched = features[other_columns].reset_index(drop=True)
        return pd.concat([untouched, scaled], axis=1)

    train_fe_rob = _scale_and_join(train)
    test_fe_rob = _scale_and_join(test)

    # return index-reset copies instead of mutating the caller's targets
    target_train = target_train.reset_index(drop=True)
    target_test = target_test.reset_index(drop=True)

    return train_fe_rob, test_fe_rob, target_train, target_test

In [17]:
# numerical columns that are passed through the scaler
scale_columns = [
    'subscribers',
    'total_views',
    'total_videos',
    'no_of_tags',
    'title_length',
    'description_length',
    'age',
    'duration_minutes',
]

# columns left unscaled: the binary licence flag plus the category indicator columns
# NOTE(review): the category names are hard-coded and are expected to match the
# columns created by ohe from the training data - confirm if the data changes
non_scale_columns = [
    'licensed_map',
    'Education',
    'Entertainment',
    'Film & Animation',
    'Gaming',
    'Howto & Style',
    'Music',
    'People & Blogs',
    'Science & Technology',
    'Sports',
    'Travel & Events',
]

In [18]:
# one hot encode / map selected categorical features
# copies are passed so X_train / X_test are never modified by the encoding step
# and this cell gives the same result when it is run again
X_train_fe, X_test_fe = ohe(X_train.copy(), X_test.copy())

In [19]:
# robust-scale the continuous features; the remaining columns are carried over unscaled
X_train_fe_rob, X_test_fe_rob, y_train_fe, y_test_fe = rob_scaler(
    train=X_train_fe,
    test=X_test_fe,
    target_train=y_train,
    target_test=y_test,
    columns_to_scale=scale_columns,
    other_columns=non_scale_columns,
)

# Modelling

In [10]:
# baseline random forest with default hyper-parameters
# the fixed seed makes the fitted forest identical on every run of the notebook
rf = RandomForestClassifier(random_state=21)
rf.fit(X_train_fe_rob, y_train_fe)

KeyboardInterrupt: 

In [None]:
# predictions of the fitted forest on the training features
y_train_pred = rf.predict(X=X_train_fe_rob)
# predictions of the fitted forest on the held-out test features
y_test_pred = rf.predict(X=X_test_fe_rob)

In [23]:
# report accuracy, precision, recall and F1 for a set of predictions
def apr(y_pred, y_real):
    """Print accuracy, precision, recall and F1 for the given predictions.

    Mind the argument order: the predicted labels come first, the real
    labels second. Nothing is returned; the four scores are printed one
    per line.
    """
    # all four scores are computed by sklearn.metrics with its default settings
    scores = (
        metrics.accuracy_score(y_true=y_real, y_pred=y_pred),
        metrics.precision_score(y_true=y_real, y_pred=y_pred),
        metrics.recall_score(y_true=y_real, y_pred=y_pred),
        metrics.f1_score(y_true=y_real, y_pred=y_pred),
    )
    accuracy, precision, recall, f1 = scores

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")

In [None]:
# first block of scores: train split; second block: test split
# scored against the targets that rob_scaler returned alongside the scaled features
apr(y_train_pred, y_train_fe)
apr(y_test_pred, y_test_fe)

Accuracy:0.9998530060267529
Precision:0.9997055359246172
Recall:1.0
F1:0.9998527462818436
Accuracy:0.8324514991181657
Precision:0.8349056603773585
Recall:0.8300117233294255
F1:0.8324514991181657


In [None]:
# fresh, unfitted forest used as the estimator for the grid search
# the fixed seed keeps the cross-validation scores comparable between runs
rf = RandomForestClassifier(random_state=21)

In [None]:
# parameter grid: only n_estimators is varied, the other values are fixed
rf_params = {
    'max_depth': [7],
    'n_estimators': [220, 225, 230, 235, 240],
    'min_samples_split': [6],
    'min_samples_leaf': [5],
}

# 5-fold cross-validated search for the best parameter values
# accuracy is chosen as the metric because our train data is class balanced
gs = GridSearchCV(rf, param_grid=rf_params, cv=5, scoring='accuracy')

# fit on the scaled train features and the matching targets returned by rob_scaler
gs.fit(X_train_fe_rob, y_train_fe)
gs.best_params_

{'max_depth': 7,
 'min_samples_leaf': 5,
 'min_samples_split': 6,
 'n_estimators': 220}

In [None]:
# predictions of the fitted grid search on the training features
y_pred_train = gs.predict(X=X_train_fe_rob)
# predictions of the fitted grid search on the held-out test features
y_pred_test = gs.predict(X=X_test_fe_rob)

In [None]:
# grid search scores on the train split, against the targets returned by rob_scaler
apr(y_pred_train, y_train_fe)

Accuracy:0.8291930030868735
Precision:0.831995242343146
Recall:0.8241531664212076
F1:0.8280556377626516


In [None]:
# grid search scores on the test split, against the targets returned by rob_scaler
apr(y_pred_test, y_test_fe)

Accuracy:0.8177542621987066
Precision:0.8117106773823192
Recall:0.8288393903868698
F1:0.8201856148491878


# Final Model

In [28]:
# final model, using the parameter values reported by the grid search
# the fixed seed makes the fitted forest reproducible when the notebook is re-run
rf = RandomForestClassifier(
    max_depth=7,
    min_samples_leaf=5,
    min_samples_split=6,
    n_estimators=220,
    random_state=21,
)
rf.fit(X_train_fe_rob, y_train_fe)

In [29]:
# predictions of the final model on the training features
y_train_pred = rf.predict(X=X_train_fe_rob)
# predictions of the final model on the held-out test features
y_test_pred = rf.predict(X=X_test_fe_rob)

In [30]:
# first block of scores: train split; second block: test split
# scored against the targets that rob_scaler returned alongside the scaled features
apr(y_train_pred, y_train_fe)
apr(y_test_pred, y_test_fe)

Accuracy:0.8309569307658387
Precision:0.8312186485688994
Recall:0.8297496318114875
F1:0.8304834905660378
Accuracy:0.8165784832451499
Precision:0.81199538638985
Recall:0.8253223915592028
F1:0.8186046511627907


# Pickling Model

In [31]:
# persist the fitted final model to disk
# NOTE(review): only the forest is saved here; the scaler fitted inside rob_scaler is
# not returned or stored, so whoever loads this model has to rebuild the same preprocessing
with open('numerical_rf_model.pkl', mode='wb') as file:
    pickle.Pickler(file).dump(rf)