# Importing Libraries

In [1]:
import pickle
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import numpy as np

# Importing Data

In [2]:
# Load the pickled object produced upstream into `df`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles from a trusted source.
with open('new_eda_data.pkl', mode='rb') as pkl_handle:
    df = pickle.load(pkl_handle)

In [3]:
# Keep only the text feature ('title') and the label ('target') for modelling.
# The explicit .copy() makes df_title an independent frame, so later changes to
# it cannot affect df and pandas has no reason to warn about modifying a slice.
df_title = df[['title', 'target']].copy()

In [4]:
# Renumber the rows 0..n-1 and discard the old index. The result is reassigned
# rather than using inplace=True, which avoids mutating in place a frame that
# was sliced out of df.
df_title = df_title.reset_index(drop=True)
df_title

Unnamed: 0,title,target
0,Using Code and GPT-3 to Learn Faster,1
1,Data Analyst MENTORSHIP - Q&A (while I drink ...,0
2,How Data Science ACTUALLY Works,1
3,Does Instagram think you live in an influentia...,0
4,Data Analyst MENTORSHIP - Q&A (while I drink ...,0
...,...,...
8499,Data Analyst Resume | Reviewing My Resume! | F...,1
8500,Working at a Big Company Vs Small Company | To...,1
8501,Data Analyst Salary | 100k with No Experience,1
8502,Truth About Big Companies | Told by a Fortune ...,1


# Train Test Split

In [5]:
# Separate the feature column (title text) from the target column
X, y = df_title['title'], df_title['target']

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

# Feature Engineering

In [6]:
# TF-IDF vectorizer: lowercases the text and drops English stop words
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

# Learn the vocabulary and IDF weights from the training titles only, and encode them
X_train_vec = vectorizer.fit_transform(X_train)

# Encode the test titles with the already-fitted vectorizer (no refitting)
X_test_vec = vectorizer.transform(X_test)

# Modelling

In [7]:
# Base estimator handed to the grid searches. A random forest is stochastic
# (bootstrap samples, random feature subsets), so it is seeded here to make the
# selected parameters and reported scores repeatable on a fresh run.
# 21 is the same seed used for the train/test split.
rf = RandomForestClassifier(random_state=21)

In [144]:
# rf = RandomForestClassifier()
# rf.fit(X_train_vec, y_train)

In [145]:
# y_train_pred = rf.predict(X_train_vec)
# y_test_pred = rf.predict(X_test_vec)

In [146]:
# apr(y_train_pred, y_train)
# apr(y_test_pred, y_test)

In [147]:
# rf.get_params()

In [148]:
# max_depth = [estimator.tree_.max_depth for estimator in rf.estimators_]
# np.mean(max_depth)

In [149]:
# Hyper-parameter grid: 3 depths x 3 max_features values = 9 candidates;
# the remaining settings are held at a single value.
rf_params = {
    'max_depth': [25, 27, 30],
    'n_estimators': [150],
    'min_samples_split': [5],
    'min_samples_leaf': [4],
    'max_features': [47, 50, 53],
}

# Exhaustive 5-fold search for the best combination, scored on accuracy
# (author's rationale: the training data is class balanced).
# n_jobs=-1 spreads the 45 fits over all CPU cores; it changes only how the
# work is scheduled, not what is computed.
gs = GridSearchCV(
    rf,
    param_grid=rf_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit every candidate on the vectorised training titles
gs.fit(X_train_vec, y_train)
gs.best_params_

{'max_depth': 25,
 'max_features': 47,
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 150}

In [150]:
# Predict labels for both splits with the fitted grid search
y_pred_train, y_pred_test = gs.predict(X_train_vec), gs.predict(X_test_vec)

In [10]:
# Helper that reports accuracy, precision, recall and F1 for a set of predictions
def apr(y_pred, y_real):
    """Print accuracy, precision, recall and F1 for the given predictions.

    Note the argument order: the predicted values come first and the real
    values second.

    Args:
        y_pred: Predicted labels.
        y_real: True labels.

    Returns:
        None. The four scores are written to stdout, one per line.
    """
    scorers = (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
    # Every score is computed (true labels first, as the scorers are called
    # here) before anything is printed.
    accuracy, precision, recall, f1 = (scorer(y_real, y_pred) for scorer in scorers)

    print(
        f"Accuracy:{accuracy}",
        f"Precision:{precision}",
        f"Recall:{recall}",
        f"F1:{f1}",
        sep="\n",
    )

In [152]:
# Training-set scores (apr takes the predictions first, then the true labels)
apr(y_pred=y_pred_train, y_real=y_train)

Accuracy:0.7455534323092753
Precision:0.7268266085059978
Recall:0.7852724594992636
F1:0.7549200056633159


In [153]:
# Held-out test-set scores (apr takes the predictions first, then the true labels)
apr(y_pred=y_pred_test, y_real=y_test)

Accuracy:0.7095825984714874
Precision:0.6907545164718385
Recall:0.7620164126611958
F1:0.7246376811594204


# Log Loss instead of Gini

In [14]:
# Hyper-parameter grid for the log-loss split criterion:
# 4 depths x 2 forest sizes x 2 min_samples_split x 2 min_samples_leaf
# x 3 max_features = 96 candidates.
# NOTE(review): criterion='log_loss' needs a recent scikit-learn release
# (1.1 or later, to my knowledge) — confirm against the pinned version.
rf_params = {
    'max_depth': [25, 27, 30, 33],
    'n_estimators': [120, 150],
    'min_samples_split': [4, 5],
    'min_samples_leaf': [3, 4],
    'max_features': [45, 47, 50],
    'criterion': ['log_loss'],
}

# Exhaustive 5-fold search for the best combination, scored on accuracy
# (author's rationale: the training data is class balanced).
# n_jobs=-1 spreads the 480 fits over all CPU cores; it changes only how the
# work is scheduled, not what is computed.
gs = GridSearchCV(
    rf,
    param_grid=rf_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit every candidate on the vectorised training titles
gs.fit(X_train_vec, y_train)
gs.best_params_

{'criterion': 'log_loss',
 'max_depth': 30,
 'max_features': 45,
 'min_samples_leaf': 3,
 'min_samples_split': 4,
 'n_estimators': 150}

In [15]:
# Predict labels for both splits with the log-loss grid search fitted above
y_pred_train, y_pred_test = gs.predict(X_train_vec), gs.predict(X_test_vec)

In [16]:
# Training-set scores for the log-loss model (predictions first, then true labels)
apr(y_pred=y_pred_train, y_real=y_train)

Accuracy:0.7626047332059386
Precision:0.7464008859357697
Recall:0.7941089837997054
F1:0.7695161980876267


In [17]:
# Held-out test-set scores for the log-loss model (predictions first, then true labels)
apr(y_pred=y_pred_test, y_real=y_test)

Accuracy:0.713697824808936
Precision:0.6997816593886463
Recall:0.7514654161781946
F1:0.7247032221594122
