TFID is performing OK but only at 85 percent accuracy while BOW is at 89 percent with better precision,recall, and accuracy

In [1]:
#with 1000 max_feaatures and tfidf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('reddit_preprocessing.csv').dropna(subset=['clean_comment'])
df.shape

(36662, 2)

In [3]:
# Separate features and target
X_cleaned = df['clean_comment']
y_cleaned = df['category']

In [4]:
# Split the cleaned data into train and test sets (80-20 split)
X_train_cleaned,X_test_cleaned,y_train_cleaned,y_test_cleaned = train_test_split(
    X_cleaned,y_cleaned,test_size=0.2,random_state=42
)

In [5]:
# Apply TFIDF with trigram setting and max_features=5000
tfidf_cleaned = TfidfVectorizer(ngram_range=(1,3),max_features=1000)

In [6]:
# Fit the vectorizer on the training data and transform both train and test sets
X_train_tfidf_cleaned = tfidf_cleaned.fit_transform(X_train_cleaned).astype('float32')
X_test_tfidf_cleaned = tfidf_cleaned.transform(X_test_cleaned).astype('float32')

In [7]:
! pip install optuna

In [8]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import classification_report,accuracy_score


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# function to optimize LightGbm hyperparameter
def objective(trial):
    # Define hyperparameters to be tuned
    param = {
        "objective": "multiclass",
        "num_class":3,
        "learning_rate": trial.suggest_float("learning_rate",1e-3, 1e-1),
        "n_estimators" : trial.suggest_int("n_estimators",50,500),
        "min_child_samples": trial.suggest_int('min_child_samples', 10, 200),
        "reg_lambda" : trial.suggest_float('reg_lambda', 1e-4, 50.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "metric": "multi_logloss",
        "is_unbalance": True,
        "class_weight": "balanced",
    }

    #Define the Lightgbm parameter with the trial parameter
    model = lgb.LGBMClassifier(**param,verbose =-1)

    #perform a cross validation
    scores =cross_val_score(model,X_train_tfidf_cleaned,y_train_cleaned,cv=3,scoring='accuracy')

    # Return the average score across folds
    return scores.mean()



In [13]:
# Create an Optuna study to optimize the hyperparameters
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2025-11-19 07:38:24,160] A new study created in memory with name: no-name-b835afe3-d130-40fb-ad67-500a3631be40
[I 2025-11-19 07:38:41,931] Trial 0 finished with value: 0.7873435215786674 and parameters: {'learning_rate': 0.06782702595324105, 'n_estimators': 489, 'min_child_samples': 16, 'reg_lambda': 0.00036403153644268217, 'max_depth': 5}. Best is trial 0 with value: 0.7873435215786674.
[I 2025-11-19 07:38:43,708] Trial 1 finished with value: 0.6722697175379763 and parameters: {'learning_rate': 0.07001337287112132, 'n_estimators': 160, 'min_child_samples': 194, 'reg_lambda': 0.012771888694350567, 'max_depth': 3}. Best is trial 0 with value: 0.7873435215786674.
[I 2025-11-19 07:38:54,101] Trial 2 finished with value: 0.7304373829357019 and parameters: {'learning_rate': 0.05130849879987645, 'n_estimators': 278, 'min_child_samples': 119, 'reg_lambda': 3.265597784938213, 'max_depth': 12}. Best is trial 0 with value: 0.7873435215786674.
[I 2025-11-19 07:39:01,021] Trial 3 finished with 

In [15]:
#extract the best hyperparameter
best_params = study.best_params
best_params

{'learning_rate': 0.08627215249940226,
 'n_estimators': 325,
 'min_child_samples': 18,
 'reg_lambda': 1.1448085011120879,
 'max_depth': 11}

In [16]:
best_model = lgb.LGBMClassifier(

    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    is_unbalance= True,
    class_weight= "balanced",
    reg_alpha= 0.1,  # L1 regularization
    reg_lambda= 1.1,  # L2 regularization
    learning_rate= 0.08,
    max_depth= 11,
    n_estimators=325,
    min_child_samples=18
)

In [18]:
# Fit the model on the resampled training data
best_model.fit(X_train_tfidf_cleaned, y_train_cleaned)

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,11
,learning_rate,0.08
,n_estimators,325
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [19]:
# Predict on the train set
y_train_pred = best_model.predict(X_train_tfidf_cleaned)



In [20]:
# Calculate accuracy on the test set
accuracy_train = accuracy_score(y_train_cleaned, y_train_pred)
accuracy_train

0.8501142214190732

In [21]:
report_train = classification_report(y_train_cleaned,y_train_pred)
print(report_train)

              precision    recall  f1-score   support

          -1       0.82      0.76      0.79      6601
           0       0.78      0.97      0.86     10134
           1       0.95      0.80      0.87     12594

    accuracy                           0.85     29329
   macro avg       0.85      0.84      0.84     29329
weighted avg       0.86      0.85      0.85     29329



In [22]:
# Predict on the test set
y_pred = best_model.predict(X_test_tfidf_cleaned)



In [23]:
# Calculate accuracy on the test set
accuracy = accuracy_score(y_test_cleaned, y_pred)
accuracy

0.7927178508114006

In [24]:
# Generate classification report
report = classification_report(y_test_cleaned, y_pred)
print(report)

              precision    recall  f1-score   support

          -1       0.70      0.65      0.67      1647
           0       0.75      0.96      0.84      2510
           1       0.89      0.74      0.81      3176

    accuracy                           0.79      7333
   macro avg       0.78      0.78      0.77      7333
weighted avg       0.80      0.79      0.79      7333

