In [3]:
!pip install scikit-learn pandas numpy nltk imblearn

import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB  # Changed from ComplementNB to MultinomialNB
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc, log_loss)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"
                               u"\U0001F300-\U0001F5FF"
                               u"\U0001F680-\U0001F6FF"
                               u"\U0001F1E0-\U0001F1FF"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

stop_words = set(stopwords.words('english') + ['...', 'bug', 'issue', 'error'])
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word.lower() not in stop_words])

lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(text)])

def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),.!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

pd_all = pd.read_csv('/content/pytorch.csv')
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'], axis=1
)
data = pd_all.rename(columns={"Unnamed: 0": "id", "class": "sentiment", "Title+Body": "text"})
text_col = 'text'
data[text_col] = data[text_col].apply(remove_html).apply(remove_emoji).apply(remove_stopwords).apply(clean_str).apply(lemmatize_text)
data = data.fillna('')

project = 'pytorch'
REPEAT = 25
out_csv_name = f'/content/{project}_OptimizedNB_Count.csv'

accuracies, precisions, recalls, f1_scores, auc_values = [], [], [], [], []

for repeated_time in range(REPEAT):
    print(f"Running experiment {repeated_time + 1}/{REPEAT}...")
    indices = np.arange(data.shape[0])
    train_index, test_index = train_test_split(indices, test_size=0.2, random_state=repeated_time)

    train_texts = data[text_col].iloc[train_index]
    test_texts = data[text_col].iloc[test_index]
    y_train = data['sentiment'].iloc[train_index]
    y_test = data['sentiment'].iloc[test_index]

    vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=10000, min_df=3, max_df=0.85)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    smote = SMOTE(random_state=repeated_time)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    clf = MultinomialNB()  # Changed from ComplementNB() to MultinomialNB()
    param_grid = {'alpha': np.logspace(-3, 1, 20)}
    grid = GridSearchCV(clf, param_grid, cv=5, scoring='f1_macro')
    grid.fit(X_train_balanced, y_train_balanced)  # Removed .toarray()
    best_clf = grid.best_estimator_

    best_clf.fit(X_train_balanced, y_train_balanced)  # Removed .toarray()
    y_pred = best_clf.predict(X_test)  # Removed .toarray()
    y_pred_proba = best_clf.predict_proba(X_test)  # Removed .toarray()

    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred, average='macro'))
    recalls.append(recall_score(y_test, y_pred, average='macro'))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1], pos_label=1)
    auc_values.append(auc(fpr, tpr))

final_accuracy = np.mean(accuracies)
final_precision = np.mean(precisions)
final_recall = np.mean(recalls)
final_f1 = np.mean(f1_scores)
final_auc = np.mean(auc_values)

print("=== Multinomial Naive Bayes + CountVectorizer Results ===")
print(f"Average Accuracy:      {final_accuracy:.4f}")
print(f"Average Precision:     {final_precision:.4f}")
print(f"Average Recall:        {final_recall:.4f}")
print(f"Average F1 score:      {final_f1:.4f}")
print(f"Average AUC:           {final_auc:.4f}")


df_log = pd.DataFrame({
    'repeated_times': [REPEAT], 'Accuracy': [final_accuracy],
    'Precision': [final_precision], 'Recall': [final_recall], 'F1': [final_f1], 'AUC': [final_auc],
    'CV_list(AUC)': [str(auc_values)]
})
df_log.to_csv(out_csv_name, mode='a', header=not pd.io.common.file_exists(out_csv_name), index=False)
print(f"Results saved to: {out_csv_name}")

Running experiment 1/25...
Running experiment 2/25...
Running experiment 3/25...
Running experiment 4/25...
Running experiment 5/25...
Running experiment 6/25...
Running experiment 7/25...
Running experiment 8/25...
Running experiment 9/25...
Running experiment 10/25...
Running experiment 11/25...
Running experiment 12/25...
Running experiment 13/25...
Running experiment 14/25...
Running experiment 15/25...
Running experiment 16/25...
Running experiment 17/25...
Running experiment 18/25...
Running experiment 19/25...
Running experiment 20/25...
Running experiment 21/25...
Running experiment 22/25...
Running experiment 23/25...
Running experiment 24/25...
Running experiment 25/25...
=== Multinomial Naive Bayes + CountVectorizer Results ===
Average Accuracy:      0.8482
Average Precision:     0.6634
Average Recall:        0.6831
Average F1 score:      0.6635
Average AUC:           0.7216
Results saved to: /content/pytorch_OptimizedNB_Count.csv
