# Import Libraries

In [1]:
import pickle
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier

# Importing Data

In [2]:
# Load the pickled object from disk into `df` (used as a DataFrame in the cells below).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file — only load pickles you trust.
with open('new_eda_data.pkl', 'rb') as file:
    df = pickle.load(file)

In [3]:
# keep only the text feature and the label needed for modelling
df_description = df.loc[:, ['description', 'target']]

In [4]:
# preview the modelling frame (pandas truncates the display of long frames)
df_description

Unnamed: 0,description,target
0,Thanks to ProjectPro.io for their support: htt...,1
1,⬇️⬇️⬇️Check here prior to asking your question...,0
2,Check out Deepnote for the easiest way to prac...,1
3,Request this and many other datasets @: https:...,0
4,⬇️⬇️⬇️Check here prior to asking your question...,0
...,...,...
9317,Data Analyst Resume | Reviewing My Resume! | F...,1
9318,Working at a Big Company Vs Small Company | To...,1
9319,Data Analyst Salary | 100k with No Experience ...,1
9320,Truth About Big Companies // There are a ton o...,1


In [5]:
# Rebuild a contiguous 0..n-1 index: the preview above ends at label 9320 while the
# frame has fewer rows, so the inherited index has gaps.
# The result is assigned to the name instead of using inplace=True, so the cell does not
# mutate an object derived from `df` and reads as an explicit step when re-run.
df_description = df_description.reset_index(drop=True)
df_description

Unnamed: 0,description,target
0,Thanks to ProjectPro.io for their support: htt...,1
1,⬇️⬇️⬇️Check here prior to asking your question...,0
2,Check out Deepnote for the easiest way to prac...,1
3,Request this and many other datasets @: https:...,0
4,⬇️⬇️⬇️Check here prior to asking your question...,0
...,...,...
8499,Data Analyst Resume | Reviewing My Resume! | F...,1
8500,Working at a Big Company Vs Small Company | To...,1
8501,Data Analyst Salary | 100k with No Experience ...,1
8502,Truth About Big Companies // There are a ton o...,1


# Train, Test, Split

In [6]:
# separate the feature column from the target column
X, y = df_description['description'], df_description['target']

# hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [17]:
# size of the training portion produced by the split
X_train.shape

(6803,)

In [18]:
# count the positions where X_train and y_train carry the same index label;
# a count equal to the number of training rows means features and labels are aligned
(X_train.index == y_train.index).sum()

6803

# Feature Engineering

In [19]:
# TF-IDF vectoriser: lower-cases the text and drops English stop words
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)

# fit on the training text only, then encode it
X_train_vec = vectorizer.fit_transform(X_train)

# encode the test text with the vectoriser fitted on the training text
X_test_vec = vectorizer.transform(X_test)

# Modelling

In [None]:
# baseline classifier: logistic regression with default settings,
# trained on the TF-IDF encoded training descriptions
logreg = LogisticRegression()
logreg.fit(X_train_vec, y_train)

In [10]:
# helper that reports accuracy, precision, recall and F1 for a set of predictions
def apr(y_pred, y_real):
    """Print accuracy, precision, recall and F1 for the given predictions.

    Argument order is predictions first, ground truth second; internally the
    scikit-learn scorers are called as scorer(y_real, y_pred).
    Nothing is returned: the four scores are written to stdout.
    """
    scorers = (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
    # evaluate every scorer before printing anything
    accuracy, precision, recall, f1 = [scorer(y_real, y_pred) for scorer in scorers]

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")

In [None]:
# predict labels for the rows the model was trained on
y_train_pred = logreg.predict(X_train_vec)

# report training metrics (apr takes predictions first, then the true labels)
apr(y_train_pred, y_train)

Accuracy:0.8471262678230193
Precision:0.8452066842568162
Recall:0.8491899852724595
F1:0.847193652659418


In [None]:
# predict labels for the held-out test rows
y_test_pred = logreg.predict(X_test_vec)

# report test metrics (apr takes predictions first, then the true labels)
apr(y_test_pred, y_test)

Accuracy:0.805408583186361
Precision:0.8063380281690141
Recall:0.8053927315357562
F1:0.8058651026392962


# Feature Importance

In [None]:
# sanity check: the model holds exactly one coefficient per term reported by the vectoriser
len(logreg.coef_[0]) == len(vectorizer.get_feature_names_out())

True

In [None]:
# terms learned by the vectoriser
feature_names = vectorizer.get_feature_names_out()
# first row of the fitted model's coefficient matrix
coef = logreg.coef_[0]

# pair each term with the coefficient at the same position
feature_coef = [(name, weight) for name, weight in zip(feature_names, coef)]
feature_coef[:10]

[('00', -0.4541414207885582),
 ('000', -0.7188691659538613),
 ('000751', -0.014502166032418159),
 ('0008', 0.018263635385013653),
 ('000hour', 0.016810855466845347),
 ('000k', -0.010371156639616446),
 ('001', 0.2071337502533315),
 ('00179', -0.009822257169768252),
 ('0018', -0.060721004407795875),
 ('002', 0.052621773857123515)]

In [None]:
# create dictionary from mapped list of features and coefficients
feature_dict = dict(feature_coef)

# rank every feature once, from the largest coefficient down to the smallest
# (previously the identical full sort was executed twice)
ranked_features = sorted(feature_dict.items(), key=lambda item: item[1], reverse=True)

# the 10 largest coefficients
top_10 = dict(ranked_features[:10])
# the 10 smallest coefficients, still in descending order
bottom_10 = dict(ranked_features[-10:])

In [None]:
# print the top 10 features with their coefficients, formatted to 5 decimal places
for key, value in top_10.items():
    print(f'Feature: {key}, Score: {value:.5f}')

Feature: glyt_des_top_sep22, Score: 5.05971
Feature: github, Score: 3.24786
Feature: intro, Score: 2.62331
Feature: statquest, Score: 2.46494
Feature: ambassador_code, Score: 2.23484
Feature: https, Score: 2.20883
Feature: glyt, Score: 2.12270
Feature: mygreatlearning, Score: 1.97753
Feature: arjancodes, Score: 1.96309
Feature: keithgalli, Score: 1.93816


# Hyperparameter Tuning

In [None]:
# sweep the vectoriser's max_features setting and tabulate train/test accuracy
def max_features(low, high, step):
    """Compare train and test accuracy across a range of vocabulary sizes.

    For every value in range(low, high + 1, step) a new TfidfVectorizer capped at
    that many features and a new default LogisticRegression are fitted on the
    training split, then scored on both splits.

    The data is read from the notebook globals X_train, X_test, y_train and
    y_test; the vectoriser and model built here are local to the function.

    Returns a DataFrame with columns max_features, train_accuracy,
    test_accuracy and diff (train_accuracy minus test_accuracy).
    """
    candidates = list(range(low, high + 1, step))
    scores = []

    for n_features in candidates:
        tfidf = TfidfVectorizer(stop_words='english', lowercase=True, max_features=n_features)

        train_matrix = tfidf.fit_transform(X_train)
        test_matrix = tfidf.transform(X_test)

        model = LogisticRegression()
        model.fit(train_matrix, y_train)

        # score the training split first, then the test split
        train_score = accuracy_score(y_train, model.predict(train_matrix))
        test_score = accuracy_score(y_test, model.predict(test_matrix))
        scores.append((train_score, test_score))

    max_df = pd.DataFrame({
        'max_features': candidates,
        'train_accuracy': [train_score for train_score, _ in scores],
        'test_accuracy': [test_score for _, test_score in scores],
    })

    # gap between train and test accuracy for each vocabulary size
    max_df['diff'] = max_df['train_accuracy'] - max_df['test_accuracy']

    return max_df

In [None]:
# coarse sweep: max_features from 200 to 1000 in steps of 50
max_features(200,1000,50)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,200,0.801411,0.78836,0.013051
1,250,0.809349,0.790711,0.018637
2,300,0.810525,0.794239,0.016286
3,350,0.812583,0.795414,0.017168
4,400,0.815376,0.798942,0.016434
5,450,0.815376,0.797178,0.018197
6,500,0.81567,0.800118,0.015552
7,550,0.816699,0.801881,0.014817
8,600,0.818756,0.803057,0.015699
9,650,0.819197,0.79953,0.019668


In [None]:
# finer sweep: max_features from 850 to 950 in steps of 10
max_features(850,950,10)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,850,0.820814,0.804821,0.015994
1,860,0.821402,0.804233,0.01717
2,870,0.821549,0.805409,0.016141
3,880,0.821402,0.805996,0.015406
4,890,0.821549,0.806584,0.014965
5,900,0.820961,0.805996,0.014965
6,910,0.821108,0.805409,0.0157
7,920,0.82052,0.805409,0.015112
8,930,0.82052,0.804821,0.0157
9,940,0.821402,0.806584,0.014818


In [None]:
# finest sweep: max_features from 930 to 950 in steps of 1
max_features(930,950,1)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,930,0.82052,0.804821,0.0157
1,931,0.821255,0.807172,0.014083
2,932,0.821255,0.806584,0.014671
3,933,0.821402,0.806584,0.014818
4,934,0.821255,0.806584,0.014671
5,935,0.821255,0.806584,0.014671
6,936,0.821255,0.806584,0.014671
7,937,0.821402,0.806584,0.014818
8,938,0.821402,0.806584,0.014818
9,939,0.821402,0.806584,0.014818


In [None]:
# Find the smallest train/test accuracy gap in the most granular sweep, expressed as a percentage.
# A small gap is used as the guard against overfitting (target: below 3%).
# NOTE: this repeats the whole 930-950 sweep (one vectoriser fit and one model fit per value), so it is slow.
maxfe_df = max_features(930,950,1)
# scale to percent first and round afterwards, so the displayed value carries no float noise from the multiplication
round(maxfe_df['diff'].min() * 100, 4)

KeyboardInterrupt: 

# Tuned Max Features Model

In [7]:
# create vectorizer object, capping the vocabulary at 931 features
# (931 has the smallest train/test accuracy gap among the rows displayed for the 930-950 sweep)
# NOTE: this rebinds vectorizer, X_train_vec and X_test_vec, replacing the untuned versions created earlier
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=931)

# fit vectorizer with training data and transform
X_train_vec = vectorizer.fit_transform(X_train)

# transform test data with the vectorizer fitted on the training data
X_test_vec = vectorizer.transform(X_test)

In [8]:
# refit a default logistic regression on the 931-feature training matrix;
# this rebinds logreg, replacing the baseline model fitted earlier
logreg = LogisticRegression()
logreg.fit(X_train_vec, y_train)

In [11]:
# predict labels for the training rows with the tuned model
y_train_pred = logreg.predict(X_train_vec)

# report training metrics (apr takes predictions first, then the true labels)
apr(y_train_pred, y_train)

Accuracy:0.8212553285315302
Precision:0.8126255380200861
Recall:0.8341678939617084
F1:0.8232558139534883


In [12]:
# predict labels for the held-out test rows with the tuned model
y_test_pred = logreg.predict(X_test_vec)

# report test metrics (apr takes predictions first, then the true labels)
apr(y_test_pred, y_test)

Accuracy:0.807172251616696
Precision:0.7959413754227734
Recall:0.8276670574443142
F1:0.8114942528735634


# Pickle Model

In [13]:
# Persist the fitted logistic regression model to disk.
# NOTE(review): only the classifier is written here; the fitted `vectorizer` that produces its
# input features is not saved in this cell — confirm it is persisted elsewhere, otherwise the
# pickled model cannot be applied to new text.
with open ('nlp_description_model.pkl', 'wb') as file:
    pickle.dump(logreg, file)

# Ensemble Model

In [14]:
# Load a previously pickled model into rf_model (presumably a random forest, going by the file name — confirm).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file — only load pickles you trust.
with open('nlp_des_rf.pkl', 'rb') as file:
    rf_model = pickle.load(file)

In [15]:
# create a second vectorizer object without a max_features cap; its output is what rf_model is fed below
# NOTE(review): this assumes rf_model was trained on features from an identically configured
# vectorizer fitted on the same training split — confirm, otherwise the columns will not line up
vectorizer1 = TfidfVectorizer(stop_words='english', lowercase=True)

# fit vectorizer with training data and transform
X_train_vec1 = vectorizer1.fit_transform(X_train)

# transform test data with the vectorizer fitted on the training data
X_test_vec1 = vectorizer1.transform(X_test)

In [25]:
# build one frame of predicted class probabilities per base model, on the training rows;
# each model is fed the matrix from its own vectorizer (rf_model <- X_train_vec1, logreg <- X_train_vec)
# NOTE(review): logreg was fitted on X_train_vec, so its probabilities here are in-sample;
# presumably the same holds for rf_model — confirm
rf_train_pred = pd.DataFrame(rf_model.predict_proba(X_train_vec1))
logreg_train_pred = pd.DataFrame(logreg.predict_proba(X_train_vec))

# place the probability frames side by side (column-wise concat)
X_train_ensemble = pd.concat([rf_train_pred, logreg_train_pred], axis=1)

# create ensemble model object
ensemble_clf = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('logreg', logreg),
], voting='soft')  # 'soft' averages the predicted class probabilities; 'hard' would take a majority vote of predicted labels

# fit ensemble model on the stacked probability columns
# NOTE(review): VotingClassifier.fit is documented to fit clones of the listed estimators on the
# X passed in, so the ensemble would be fresh copies trained on these probability columns rather
# than the already fitted rf_model / logreg — confirm this stacking-like setup is what is intended
ensemble_clf.fit(X_train_ensemble, y_train)

In [26]:
# build one frame of predicted class probabilities per base model, on the test rows;
# each model is fed the matrix from its own vectorizer (rf_model <- X_test_vec1, logreg <- X_test_vec)
rf_test_pred = pd.DataFrame(rf_model.predict_proba(X_test_vec1))
logreg_test_pred = pd.DataFrame(logreg.predict_proba(X_test_vec))

# place the probability frames side by side, in the same order as the training frame
X_test_ensemble = pd.concat([rf_test_pred, logreg_test_pred], axis=1)

In [27]:
# predict on train data
y_train_pred_ensemble = ensemble_clf.predict(X_train_ensemble)

# check train data metric scores
apr(y_train_pred_ensemble, y_train)

Accuracy:0.8863736586799941
Precision:0.8811046511627907
Recall:0.8927835051546392
F1:0.8869056327724946


In [28]:
# predict on test data
y_test_pred_ensemble = ensemble_clf.predict(X_test_ensemble)

# check test data metric scores
apr(y_test_pred_ensemble, y_test)

Accuracy:0.8030570252792475
Precision:0.8004640371229699
Recall:0.8089097303634232
F1:0.80466472303207
