# Importing Libraries

In [7]:
import pickle
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import numpy as np

# Importing Data

In [8]:
# read the pickled dataframe from disk (presumably written by the EDA step — confirm)
# NOTE(review): unpickling can execute arbitrary code; only load files you trust
with open('new_eda_data.pkl', 'rb') as eda_file:
    df = pickle.load(eda_file)

In [9]:
# keep only the text feature and the label for modelling
model_columns = ['description', 'target']
df_description = df[model_columns]

In [10]:
# renumber rows 0..n-1 and discard the old index; rebinding to the returned
# frame (instead of inplace=True) avoids mutating a frame derived from `df`
df_description = df_description.reset_index(drop=True)
df_description

Unnamed: 0,description,target
0,Thanks to ProjectPro.io for their support: htt...,1
1,⬇️⬇️⬇️Check here prior to asking your question...,0
2,Check out Deepnote for the easiest way to prac...,1
3,Request this and many other datasets @: https:...,0
4,⬇️⬇️⬇️Check here prior to asking your question...,0
...,...,...
8499,Data Analyst Resume | Reviewing My Resume! | F...,1
8500,Working at a Big Company Vs Small Company | To...,1
8501,Data Analyst Salary | 100k with No Experience ...,1
8502,Truth About Big Companies // There are a ton o...,1


# Train Test Split

In [11]:
# separate the text feature from the target label
X, y = df_description['description'], df_description['target']

# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

# Feature Engineering

In [12]:
# TF-IDF vectorizer: lower-cases the text and drops English stop words
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

# fit on the training split only, then encode it
X_train_vec = vectorizer.fit_transform(X_train)

# encode the test split with the vectorizer fitted above (no re-fitting)
X_test_vec = vectorizer.transform(X_test)

# Modelling

In [7]:
# baseline random forest with default hyper-parameters; random_state pins the
# forest's internal randomness so re-running the notebook gives the same model
# (same seed value as the train/test split)
rf = RandomForestClassifier(random_state=21)
rf.fit(X_train_vec, y_train)

In [8]:
# baseline predictions for the training split and the held-out test split
y_train_pred, y_test_pred = (
    rf.predict(features) for features in (X_train_vec, X_test_vec)
)

In [14]:
# helper that reports accuracy, precision, recall and F1 for a set of predictions
def apr(y_pred, y_real):
    """Print accuracy, precision, recall and F1 for the given predictions.

    Argument order matters: pass the predicted values first and the real
    (ground-truth) values second. The scores are written to stdout, one per
    line; nothing is returned.
    """
    scorers = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("F1", metrics.f1_score),
    )

    # compute every score before printing, so nothing is printed if a scorer raises
    results = [(label, scorer(y_real, y_pred)) for label, scorer in scorers]

    for label, value in results:
        print(f"{label}:{value}")

In [11]:
# report the baseline scores: training split first, then the test split
for preds, truth in ((y_train_pred, y_train), (y_test_pred, y_test)):
    apr(preds, truth)

Accuracy:0.9850066147287961
Precision:0.9779390420899855
Recall:0.9923416789396171
F1:0.9850877192982457
Accuracy:0.8201058201058201
Precision:0.8161849710982659
Recall:0.8276670574443142
F1:0.8218859138533178


In [18]:
# fresh estimator used as the base model for the grid search; random_state pins
# the forest's internal randomness so the search results are repeatable
rf = RandomForestClassifier(random_state=21)

In [23]:
# candidate hyper-parameter values explored by the grid search
rf_params = dict(
    max_depth=[30, 35],
    n_estimators=[120, 150],
    min_samples_split=[4, 5],
    min_samples_leaf=[3, 4],
    max_features=[50, 55, 60],
)

# 5-fold cross-validated search over every combination in the grid; accuracy is
# the selection metric because the training data is class balanced
gs = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='accuracy', cv=5)

# run the search on the vectorised training split
gs.fit(X_train_vec, y_train)

# best parameter combination found (shown as the cell output)
gs.best_params_

{'max_depth': 30,
 'max_features': 60,
 'min_samples_leaf': 3,
 'min_samples_split': 4,
 'n_estimators': 120}

In [24]:
# predictions from the fitted grid search for the training and test splits
y_pred_train, y_pred_test = (
    gs.predict(features) for features in (X_train_vec, X_test_vec)
)

In [25]:
# scores of the grid-search model on the training split (predictions first, truth second)
apr(y_pred_train, y_train)

Accuracy:0.8253711597824489
Precision:0.8287161155793864
Recall:0.819440353460972
F1:0.8240521327014219


In [26]:
# scores of the grid-search model on the held-out test split (predictions first, truth second)
apr(y_pred_test, y_test)

Accuracy:0.8042328042328042
Precision:0.8058823529411765
Recall:0.8030480656506448
F1:0.8044627128596595


# Final Model Pickle

In [13]:
# final model: hyper-parameters are the values reported by gs.best_params_,
# written out literally (presumably so this cell runs without repeating the search)
rf_final = RandomForestClassifier(
    n_estimators=120,
    max_depth=30,
    max_features=60,
    min_samples_leaf=3,
    min_samples_split=4,
    random_state=21,  # pin the forest's randomness so the saved model is repeatable
)
rf_final.fit(X_train_vec, y_train)

# predictions on both splits, scored in the following cell
y_train_pred = rf_final.predict(X_train_vec)
y_test_pred = rf_final.predict(X_test_vec)

In [15]:
# report the final model's scores: training split first, then the test split
for preds, truth in ((y_train_pred, y_train), (y_test_pred, y_test)):
    apr(preds, truth)

Accuracy:0.8252241658092018
Precision:0.8234604105571848
Recall:0.8270986745213549
F1:0.8252755326965466
Accuracy:0.8077601410934744
Precision:0.8022988505747126
Recall:0.8182883939038686
F1:0.8102147417295414


In [16]:
# persist the fitted final model to disk
# NOTE(review): the fitted TfidfVectorizer is not saved in this cell; scoring new
# text with this model needs it — confirm it is persisted elsewhere
with open('nlp_des_rf.pkl', 'wb') as model_file:
    pickle.dump(rf_final, model_file)