In [12]:
# Importing necessary libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
# from textpreprocessor_rev2 import TextPreprocessor
import pandas as pd
import os
import joblib

# Directories for this experiment: INPUT_DIR holds the artifacts read below;
# OUTPUT_DIR is only defined in this cell (not used here).
INPUT_DIR = "../Output/proto_models_rev2"
OUTPUT_DIR = "../Output/proto_models_rev2/NB"


def _input_path(filename):
    """Return the path of `filename` inside INPUT_DIR."""
    return os.path.join(INPUT_DIR, filename)


# NOTE: the TextPreprocessor-based steps that used to run in this cell
# (load_data -> preprocess -> split_data -> vectorize_text) are disabled; the
# cell now reads persisted files instead. Presumably those files were written
# by that same pipeline in an upstream notebook -- TODO confirm.

# Cleaned train/test tables; only the 'polarity' column is used here, as labels
df_train = pd.read_csv(_input_path('train_cleaned.csv'))
df_test = pd.read_csv(_input_path('test_cleaned.csv'))
y_train, y_test = df_train['polarity'], df_test['polarity']

# Pre-computed feature matrices.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only point this at files produced by a trusted pipeline.
X_train_tfidf = joblib.load(_input_path('X_train_tfidf.pkl'))
X_test_tfidf = joblib.load(_input_path('X_test_tfidf.pkl'))

In [16]:
# Baseline: MultinomialNB with its default hyperparameters.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predicted labels for the test feature matrix
y_pred = model.predict(X_test_tfidf)

# Overall accuracy, then the per-class precision/recall/F1 table
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Model Accuracy: 81.95%
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.75      0.79       893
           1       0.81      0.87      0.84      1107

    accuracy                           0.82      2000
   macro avg       0.82      0.81      0.82      2000
weighted avg       0.82      0.82      0.82      2000



In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Two-stage hyperparameter search for MultinomialNB:
#   1. RandomizedSearchCV samples a coarse alpha grid to locate a good region.
#   2. GridSearchCV exhaustively scans a finer grid centred on that result.
# Both stages use 5-fold cross-validation on the training data only; the test
# set is touched once, at the end, to score the selected model.

# Coarse search space: 50 evenly spaced alphas in [0.01, 1] x both fit_prior settings
param_dist = {
    'alpha': np.linspace(0.01, 1, 50),  # Smoothing parameter; commonly tuned in Naive Bayes
    'fit_prior': [True, False]  # Whether to learn class prior probabilities or not
}

# Step 1: RandomizedSearchCV for rough tuning
random_search = RandomizedSearchCV(
    MultinomialNB(),
    param_distributions=param_dist,
    n_iter=20,  # Number of random combinations to try
    scoring='accuracy',
    cv=5,  # 5-fold cross-validation
    random_state=42,  # Fixed seed so the sampled candidates are reproducible
    verbose=1,
    n_jobs=-1  # Use all available CPU cores
)

# Run RandomizedSearchCV
random_search.fit(X_train_tfidf, y_train)
print(f"Best parameters from RandomizedSearchCV: {random_search.best_params_}")

# Step 2: build the refined grid from the coarse optimum.
# The window is best_alpha +/- 0.5 sampled at 100 points. The coarse alphas
# start at 0.01, so an unclamped lower bound (best_alpha - 0.5) becomes zero or
# negative whenever the coarse optimum is <= 0.5. MultinomialNB does not accept
# negative alpha, so those candidates would fail to fit. Clamp the lower bound
# to the smallest alpha of the coarse grid to keep every candidate valid.
coarse_best = random_search.best_params_
alpha_floor = param_dist['alpha'].min()
alpha_low = max(coarse_best['alpha'] - 0.5, alpha_floor)
alpha_high = coarse_best['alpha'] + 0.5

param_grid = {
    'alpha': np.linspace(alpha_low, alpha_high, 100),  # Finer spacing around the coarse optimum
    'fit_prior': [coarse_best['fit_prior']]  # Keep the fit_prior value chosen in step 1
}

# GridSearchCV for fine-tuning
grid_search = GridSearchCV(
    MultinomialNB(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,  # 5-fold cross-validation
    verbose=1,
    n_jobs=-1  # Use all available CPU cores
)

# Run GridSearchCV
grid_search.fit(X_train_tfidf, y_train)
print(f"Best parameters from GridSearchCV: {grid_search.best_params_}")

# Final model using the best parameters (best_estimator_ is the refitted winner)
best_model = grid_search.best_estimator_

# Making predictions on the test features.
# NOTE: y_pred and accuracy reuse the names assigned by the baseline cell, so
# after this cell runs they hold the tuned model's results.
y_pred = best_model.predict(X_test_tfidf)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Tuned Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters from RandomizedSearchCV: {'fit_prior': False, 'alpha': 0.8383673469387755}
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters from GridSearchCV: {'alpha': 1.1666501752216036, 'fit_prior': False}
Tuned Model Accuracy: 82.55%
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.81      0.80       893
           1       0.84      0.84      0.84      1107

    accuracy                           0.83      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.83      0.83      0.83      2000

