In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from sklearn.metrics import fbeta_score, make_scorer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import StandardScaler,PowerTransformer,Normalizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, roc_auc_score, RocCurveDisplay, roc_curve
from sklearn.ensemble import RandomForestClassifier


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('../content/drive/MyDrive/project_3/Data/cleaned_combined_df.csv')

In [4]:
df.head(3)

Unnamed: 0,name,subreddit,title,selftext,upvote_ratio,num_comments,created_utc,year,month,title_len,selftext_len,title_word_count,selftext_word_count,title_selftext,subreddit_title_selftext,title_selftext_len,title_selftext_word_count
0,t3_1brgdb7,1,My friends just went to MK for the first time ...,Quote: “I was disappointed in Gaston’s Tavern....,0.1,3,2024-03-30,2024,3,93,455,19,84,friend magic kingdom first time disappointed r...,__label__1 friend magic kingdom first time dis...,455,43
1,t3_1br7ntk,1,"D23 Parks Bingo sheet, do you think the option...",,0.83,7,2024-03-30,2024,3,67,1,12,0,park bingo sheet think option varying enough,__label__1 park bingo sheet think option varyi...,1,7
2,t3_1bqyshu,1,Olu Mel! 💚,,0.68,0,2024-03-29,2024,3,10,1,3,0,olu mel,__label__1 olu mel,1,2


### train_test_split

In [5]:
X = df['title_selftext']
y = df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [6]:
def analyse_model_performance(model,gs_params):

    pipe_gs = GridSearchCV(model, gs_params, cv=5, n_jobs=-1)

    pipe_gs.fit(X_train,y_train)


    # Get predictions
    preds = pipe_gs.predict(X_test)

    # Save confusion matrix values
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

    best_params = pipe_gs.best_params_

    dict_scores = {
        'train_accuracy_score': accuracy_score(y_train, pipe_gs.predict(X_train)),
        'test_accuracy_score': accuracy_score(y_test, preds),
        'sensitivity': tp / (tp + fn),
        'specificity': tn / (tn + fp),
        'f1': f1_score(y_test, preds),
        'best Params':best_params

    }
    return dict_scores

In [7]:
# All Model training Combination

In [8]:
scores = []
scores_dict = {}

### Logistic Regression

In [9]:
gs_params = {
    'tvec__ngram_range': [(1, 2)],
    'tvec__max_features': [50000, 53500],
    'Logr__penalty': ['l1', 'l2'],
    'Logr__C': np.logspace(-2, 2, 50),
    'Logr__solver': ['liblinear', 'lbfgs'],
    'Logr__random_state': [123]
}

pipe_logr = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('Logr', LogisticRegression())
])

scores_dict = analyse_model_performance(pipe_logr,gs_params)
scores_dict['model'] ="TF-IDF + Logistic Regression"
scores.append(scores_dict)

500 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
500 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _che

### Naive Bayes

In [10]:
gs_params = {
    'tvec__ngram_range': [(1, 2)],
    'tvec__max_features': [50000, 53500],
    'NB__alpha': [0.1, 1.0, 10.0]
}

pipe_NB = Pipeline([
    ('tvec', TfidfVectorizer()),
     ('NB', MultinomialNB())
     ])

scores_dict = analyse_model_performance(pipe_NB,gs_params)
scores_dict['model'] ="TF-IDF + Multinomial NB"
scores.append(scores_dict)

### Support Vector Classifier

In [11]:
gs_params = {
    'tvec__ngram_range': [(1,2)],
     'tvec__max_features': [50000, 53500],
    'svc__kernel': ['linear', 'rbf'],
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 'auto']

}


pipe_SVC = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('svc', SVC())
])

scores_dict = analyse_model_performance(pipe_SVC,gs_params)
scores_dict['model'] ="TF-IDF + SVC"
scores.append(scores_dict)

### Ensemble Methods

### Random Forest

In [12]:
gs_params = {
    'tvec__ngram_range': [(1, 2)],
    'tvec__max_features': [50000, 53500],
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}


pipe_SVC = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

scores_dict = analyse_model_performance(pipe_SVC,gs_params)
scores_dict['model'] ="TF-IDF + Random Forest"
scores.append(scores_dict)

In [16]:
gs_params = {
    'tvec__ngram_range': [(1, 2)],
    'tvec__max_features': [50000, 53500],
    'ensemble__logr__penalty': ['l2'],
    'ensemble__logr__C': [18.420699693267146],
    'ensemble__logr__solver': ['liblinear'],
    'ensemble__logr__random_state': [123],
    'ensemble__nb__alpha': [0.1],
    'ensemble__svc__gamma': ['scale'],
    'ensemble__svc__kernel': ['rbf'],
    'ensemble__svc__C': [1]
}

pipe_ensemble = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ensemble', VotingClassifier([
        ('logr', LogisticRegression()),
        ('svc', SVC()),
        ('nb', MultinomialNB())
    ]))
])

scores_dict = analyse_model_performance(pipe_ensemble,gs_params)
scores_dict['model'] ="TF-IDF + Logr + SVC + NB"
scores.append(scores_dict)

In [17]:
df_scores = pd.DataFrame(scores)
df_scores = df_scores.sort_values(by='f1', ascending=False)
df_scores = df_scores.reset_index(drop=True)

model_column = df_scores.pop('model') # Move model from last column to the first
df_scores.insert(0,'model',model_column)
df_scores

Unnamed: 0,model,train_accuracy_score,test_accuracy_score,sensitivity,specificity,f1,best Params
0,TF-IDF + SVC,1.0,0.929648,0.943144,0.916107,0.930693,"{'svc__C': 1, 'svc__gamma': 'scale', 'svc__ker..."
1,TF-IDF + Logr + SVC + NB,1.0,0.929648,0.93311,0.926174,0.93,"{'ensemble__logr__C': 18.420699693267146, 'ens..."
2,TF-IDF + Multinomial NB,0.999282,0.929648,0.899666,0.959732,0.927586,"{'NB__alpha': 0.1, 'tvec__max_features': 50000..."
3,TF-IDF + Logistic Regression,1.0,0.926298,0.923077,0.92953,0.926174,"{'Logr__C': 18.420699693267146, 'Logr__penalty..."
4,TF-IDF + Random Forest,1.0,0.904523,0.946488,0.862416,0.908507,"{'rf__max_depth': None, 'rf__min_samples_leaf'..."


Based on the above tables, ensemble methods consisting of the top 3 models performed the best based on F1 score. However, it is still suffering from overfitting issue where the train accuracy score remain 1.

https://github.com/facebookresearch/fastText/tree/main/python
