In [2]:
!rm -rf ISE-solution
# 克隆整个仓库到Colab环境
!git clone https://github.com/ideas-labo/ISE-solution

# 查看克隆下来的lab2文件夹结构，以确认克隆成功
import os
lab1_path = '/content/ISE-solution/lab1'
if os.path.exists(lab1_path):
    print("lab1文件夹克隆成功，文件夹内容如下：")
    for root, dirs, files in os.walk(lab1_path):
        level = root.replace(lab1_path, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        sub_indent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(sub_indent, f))
else:
    print("lab1文件夹克隆失败，请检查网络或仓库地址。")

Cloning into 'ISE-solution'...
remote: Enumerating objects: 210, done.[K
remote: Counting objects: 100% (210/210), done.[K
remote: Compressing objects: 100% (209/209), done.[K
remote: Total 210 (delta 88), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (210/210), 14.72 MiB | 4.76 MiB/s, done.
Resolving deltas: 100% (88/88), done.
lab1文件夹克隆成功，文件夹内容如下：
lab1/
    br_classification.py
    README.md
    datasets/
        caffe.csv
        tensorflow.csv
        keras.csv
        incubator-mxnet.csv
        pytorch.csv
Cloning into 'ISE-solution'...
remote: Enumerating objects: 210, done.[K
remote: Counting objects: 100% (210/210), done.[K
remote: Compressing objects: 100% (209/209), done.[K
remote: Total 210 (delta 88), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (210/210), 14.72 MiB | 4.73 MiB/s, done.
Resolving deltas: 100% (88/88), done.
lab1文件夹克隆成功，文件夹内容如下：
lab1/
    br_classification.py
    README.md
    datasets/
        caffe.csv

In [None]:
########## 1. Import required libraries ##########

import pandas as pd
import numpy as np
import re
import math

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation and tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc)

# Classifier
from sklearn.naive_bayes import GaussianNB

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

########## 2. Define text preprocessing methods ##########

def remove_html(text):
    """Remove HTML tags using a regex."""
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    """Remove emojis using a regex pattern."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Stopwords
NLTK_stop_words_list = stopwords.words('english')
custom_stop_words_list = ['...']  # You can customize this list as needed
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list

def remove_stopwords(text):
    """Remove stopwords from the text."""
    return " ".join([word for word in str(text).split() if word not in final_stop_words_list])

def clean_str(string):
    """
    Clean text by removing non-alphanumeric characters,
    and convert it to lowercase.
    """
    string = re.sub(r"[^A-Za-z0-9(),.!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

########## 3. Download & read data ##########
import os
import subprocess
# Choose the project (options: 'pytorch', 'tensorflow', 'keras', 'incubator-mxnet', 'caffe')
project = 'caffe'
path = '/content/ISE-solution/lab1/datasets/caffe.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body into a single column; if Body is NaN, use Title only
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

# Keep only necessary columns: id, Number, sentiment, text (merged Title+Body)
pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})
pd_tplusb.to_csv('Title+Body.csv', index=False, columns=["id", "Number", "sentiment", "text"])

########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========

# 1) Data file to read
datafile = 'Title+Body.csv'

# 2) Number of repeated experiments
REPEAT = 30

# 3) Output CSV file name
out_csv_name = f'../{project}_NB.csv'

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)

# ========== Hyperparameter grid ==========
# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Lists to store metrics across repeated runs
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []

for repeated_time in range(REPEAT):
    # --- 4.1 Split into train/test ---
    indices = np.arange(data.shape[0])
    train_index, test_index = train_test_split(
        indices, test_size=0.2, random_state=repeated_time
    )

    train_text = data[text_col].iloc[train_index]
    test_text = data[text_col].iloc[test_index]

    y_train = data['sentiment'].iloc[train_index]
    y_test  = data['sentiment'].iloc[test_index]

    # --- 4.2 TF-IDF vectorization ---
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
    X_train = tfidf.fit_transform(train_text)
    X_test = tfidf.transform(test_text)

    # Convert sparse matrices to dense arrays
    X_train_dense = X_train.toarray()
    X_test_dense = X_test.toarray()

    # --- 4.3 Naive Bayes model & GridSearch ---
    clf = GaussianNB()
    grid = GridSearchCV(
        clf,
        params,
        cv=5,              # 5-fold CV (can be changed)
        scoring='roc_auc'  # Using roc_auc as the metric for selection
    )
    grid.fit(X_train_dense, y_train)  # Use dense array here

    # Retrieve the best model
    best_clf = grid.best_estimator_
    best_clf.fit(X_train_dense, y_train)  # Also use dense array here

    # --- 4.4 Make predictions & evaluate ---
    y_pred = best_clf.predict(X_test_dense)  # Use dense array for prediction

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    # Precision (macro)
    prec = precision_score(y_test, y_pred, average='macro')
    precisions.append(prec)

    # Recall (macro)
    rec = recall_score(y_test, y_pred, average='macro')
    recalls.append(rec)

    # F1 Score (macro)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(f1)

    # AUC
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
    auc_val = auc(fpr, tpr)
    auc_values.append(auc_val)

# --- 4.5 Aggregate results ---
final_accuracy  = np.mean(accuracies)
final_precision = np.mean(precisions)
final_recall    = np.mean(recalls)
final_f1        = np.mean(f1_scores)
final_auc       = np.mean(auc_values)

print("=== Naive Bayes + TF-IDF Results ===")
print(f"Number of repeats:     {REPEAT}")
print(f"Average Accuracy:      {final_accuracy:.4f}")
print(f"Average Precision:     {final_precision:.4f}")
print(f"Average Recall:        {final_recall:.4f}")
print(f"Average F1 score:      {final_f1:.4f}")
print(f"Average AUC:           {final_auc:.4f}")

# Save final results to CSV (append mode)
try:
    existing_data = pd.read_csv(out_csv_name, nrows=1)
    header_needed = False
except:
    header_needed = True

df_log = pd.DataFrame(
    {
        'repeated_times': [REPEAT],
        'Accuracy': [final_accuracy],
        'Precision': [final_precision],
        'Recall': [final_recall],
        'F1': [final_f1],
        'AUC': [final_auc],
        'CV_list(AUC)': [str(auc_values)]
    }
)

df_log.to_csv(out_csv_name, mode='a', header=header_needed, index=False)

print(f"\nResults have been saved to: {out_csv_name}")

In [None]:
pip install lightgbm

In [None]:
pip install imbalanced-learn

In [None]:
pip install numpy

In [None]:
pip install xgboost

In [None]:
pip install numpy gensim

In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import os
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from collections import defaultdict

# Ensure nltk resources are available
nltk.download('stopwords')
nltk.download('wordnet')

# Text cleaning function definitions
def remove_html(text):
    """Remove HTML tags using a regex."""
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)
def remove_emoji(text):
    """Remove emojis using a regex pattern."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
# Configuration parameters
input_path = '/content/ISE-solution/lab1/datasets/caffe.csv'
output_dir = '/content/ISE-solution/lab1/results'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'caffe_improved.csv')
# Advanced text preprocessing
def advanced_cleaning(text):
    lemmatizer = WordNetLemmatizer()
    text = remove_html(text)
    text = remove_emoji(text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in final_stop_words_list]
    return ' '.join(words)
# Load data
df = pd.read_csv(input_path)
df = df.sample(frac=1, random_state=42)
df['combined_text'] = df['Title'] + '. ' + df['Body'].fillna('')
# Data preprocessing
final_stop_words_list = stopwords.words('english') + ['...', 'caffe', 'model', 'layer']
df['clean_text'] = df['combined_text'].apply(advanced_cleaning)
# Split dataset
X = df['clean_text']
y = df['class']
# Define models and parameter grids
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': [
            {
                'penalty': ['l2'],
                'C': [0.1, 1, 10],
                'class_weight': ['balanced', None]
            },
            {
                'penalty': [None],
                'class_weight': ['balanced', None]
            }
        ]
    },
    'SVM': {
        'model': SVC(probability=True),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'learning_rate': [0.1, 0.01],
            'n_estimators': [100, 200],
            'max_depth': [3, 5]
        }
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.1, 1, 10],
            'fit_prior': [True, False]
        }
    }
}
# Results container
results = defaultdict(list)
# Multiple experiment configuration
REPEAT = 30
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for model_name, config in models.items():
    # Initialize temporary metric lists for each model
    acc_scores = []
    prec_scores = []
    rec_scores = []
    f1_scores = []
    auc_scores = []
    cv_scores = []
    for _ in range(REPEAT):
        # Data splitting
        train_idx, test_idx = next(kf.split(X))
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        # Feature engineering
        tfidf = TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=5000,
            stop_words=final_stop_words_list
        )
        X_train_tfidf = tfidf.fit_transform(X_train)
        X_test_tfidf = tfidf.transform(X_test)
        # Handle class imbalance
        sm = SMOTE(random_state=42)
        X_train_res, y_train_res = sm.fit_resample(X_train_tfidf, y_train)
        # Grid search
        grid = GridSearchCV(
            config['model'],
            config['params'],
            cv=3,
            scoring='roc_auc',
            n_jobs=-1
        )
        grid.fit(X_train_res, y_train_res)
        # Best model prediction
        best_clf = grid.best_estimator_
        y_pred = best_clf.predict(X_test_tfidf)
        y_proba = best_clf.predict_proba(X_test_tfidf)[:, 1]
        # Calculate metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='macro')
        rec = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        auc = roc_auc_score(y_test, y_proba)
        # Append metrics of each experiment to temporary lists
        acc_scores.append(acc)
        prec_scores.append(prec)
        rec_scores.append(rec)
        f1_scores.append(f1)
        auc_scores.append(auc)
        cv_scores.extend(grid.cv_results_['mean_test_score'])
    # Calculate the mean of each metric for each model
    results['Model'].append(model_name)
    results['repeated_times'].append(REPEAT)
    results['Accuracy'].append(np.mean(acc_scores))
    results['Precision'].append(np.mean(prec_scores))
    results['Recall'].append(np.mean(rec_scores))
    results['F1'].append(np.mean(f1_scores))
    results['AUC'].append(np.mean(auc_scores))
    results['CV_list(AUC)'].append(str(np.mean(cv_scores)))
# Save results
final_df = pd.DataFrame(results)
final_df.to_csv(output_file, index=False)
print(f"Optimized results saved to: {output_file}")
print(final_df.describe())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


Optimized results saved to: /content/ISE-solution/lab1/results/caffe_improved.csv
       repeated_times  Accuracy  Precision    Recall        F1       AUC
count             5.0  5.000000   5.000000  5.000000  5.000000  5.000000
mean             30.0  0.894598   0.777645  0.646838  0.669200  0.862350
std               0.0  0.051091   0.136582  0.072807  0.092827  0.057900
min              30.0  0.810345   0.604449  0.573718  0.597222  0.778846
25%              30.0  0.896552   0.705357  0.580342  0.609690  0.855769
50%              30.0  0.904023   0.796970  0.657051  0.620915  0.855983
75%              30.0  0.913793   0.808724  0.673077  0.698858  0.881410
max              30.0  0.948276   0.972727  0.750000  0.819315  0.939744


In [11]:
########## 1. Import required libraries ##########

import pandas as pd
import numpy as np
import re
import math

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation and tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc)

# Classifier
from sklearn.naive_bayes import GaussianNB

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

########## 2. Define text preprocessing methods ##########

def remove_html(text):
    """Remove HTML tags using a regex."""
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    """Remove emojis using a regex pattern."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Stopwords
NLTK_stop_words_list = stopwords.words('english')
custom_stop_words_list = ['...']  # You can customize this list as needed
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list

def remove_stopwords(text):
    """Remove stopwords from the text."""
    return " ".join([word for word in str(text).split() if word not in final_stop_words_list])

def clean_str(string):
    """
    Clean text by removing non-alphanumeric characters,
    and convert it to lowercase.
    """
    string = re.sub(r"[^A-Za-z0-9(),.!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

########## 3. Download & read data ##########
import os
import subprocess
# Choose the project (options: 'pytorch', 'tensorflow', 'keras', 'incubator-mxnet', 'caffe')
project = 'incubator-mxnet'
path = '/content/ISE-solution/lab1/datasets/incubator-mxnet.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body into a single column; if Body is NaN, use Title only
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

# Keep only necessary columns: id, Number, sentiment, text (merged Title+Body)
pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})
pd_tplusb.to_csv('Title+Body.csv', index=False, columns=["id", "Number", "sentiment", "text"])

########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========

# 1) Data file to read
datafile = 'Title+Body.csv'

# 2) Number of repeated experiments
REPEAT = 30

# 3) Output CSV file name
out_csv_name = f'../{project}_NB.csv'

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)

# ========== Hyperparameter grid ==========
# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Lists to store metrics across repeated runs
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []

for repeated_time in range(REPEAT):
    # --- 4.1 Split into train/test ---
    indices = np.arange(data.shape[0])
    train_index, test_index = train_test_split(
        indices, test_size=0.2, random_state=repeated_time
    )

    train_text = data[text_col].iloc[train_index]
    test_text = data[text_col].iloc[test_index]

    y_train = data['sentiment'].iloc[train_index]
    y_test  = data['sentiment'].iloc[test_index]

    # --- 4.2 TF-IDF vectorization ---
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
    X_train = tfidf.fit_transform(train_text)
    X_test = tfidf.transform(test_text)

    # Convert sparse matrices to dense arrays
    X_train_dense = X_train.toarray()
    X_test_dense = X_test.toarray()

    # --- 4.3 Naive Bayes model & GridSearch ---
    clf = GaussianNB()
    grid = GridSearchCV(
        clf,
        params,
        cv=5,              # 5-fold CV (can be changed)
        scoring='roc_auc'  # Using roc_auc as the metric for selection
    )
    grid.fit(X_train_dense, y_train)  # Use dense array here

    # Retrieve the best model
    best_clf = grid.best_estimator_
    best_clf.fit(X_train_dense, y_train)  # Also use dense array here

    # --- 4.4 Make predictions & evaluate ---
    y_pred = best_clf.predict(X_test_dense)  # Use dense array for prediction

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    # Precision (macro)
    prec = precision_score(y_test, y_pred, average='macro')
    precisions.append(prec)

    # Recall (macro)
    rec = recall_score(y_test, y_pred, average='macro')
    recalls.append(rec)

    # F1 Score (macro)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(f1)

    # AUC
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
    auc_val = auc(fpr, tpr)
    auc_values.append(auc_val)

# --- 4.5 Aggregate results ---
final_accuracy  = np.mean(accuracies)
final_precision = np.mean(precisions)
final_recall    = np.mean(recalls)
final_f1        = np.mean(f1_scores)
final_auc       = np.mean(auc_values)

print("=== Naive Bayes + TF-IDF Results ===")
print(f"Number of repeats:     {REPEAT}")
print(f"Average Accuracy:      {final_accuracy:.4f}")
print(f"Average Precision:     {final_precision:.4f}")
print(f"Average Recall:        {final_recall:.4f}")
print(f"Average F1 score:      {final_f1:.4f}")
print(f"Average AUC:           {final_auc:.4f}")

# Save final results to CSV (append mode)
try:
    existing_data = pd.read_csv(out_csv_name, nrows=1)
    header_needed = False
except:
    header_needed = True

df_log = pd.DataFrame(
    {
        'repeated_times': [REPEAT],
        'Accuracy': [final_accuracy],
        'Precision': [final_precision],
        'Recall': [final_recall],
        'F1': [final_f1],
        'AUC': [final_auc],
        'CV_list(AUC)': [str(auc_values)]
    }
)

df_log.to_csv(out_csv_name, mode='a', header=header_needed, index=False)

print(f"\nResults have been saved to: {out_csv_name}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== Naive Bayes + TF-IDF Results ===
Number of repeats:     30
Average Accuracy:      0.5971
Average Precision:     0.6065
Average Recall:        0.7444
Average F1 score:      0.5345
Average AUC:           0.7444

Results have been saved to: ../incubator-mxnet_NB.csv


In [35]:
import pandas as pd
import numpy as np
import re
import os
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from collections import defaultdict
from sklearn.preprocessing import StandardScaler

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Text cleaning function definitions
def remove_html(text):
    """Remove HTML tags using a regex."""
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)


def remove_emoji(text):
    """Remove emojis using a regex pattern."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


# Configuration parameters
input_path = '/content/ISE-solution/lab1/datasets/incubator-mxnet.csv'
output_dir = '/content/ISE-solution/lab1/results'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'incubator-mxnet_improved.csv')


# Advanced text preprocessing
def advanced_cleaning(text):
    lemmatizer = WordNetLemmatizer()
    text = remove_html(text)
    text = remove_emoji(text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in final_stop_words_list]
    return ' '.join(words)


# Load data
df = pd.read_csv(input_path)
df = df.sample(frac=1, random_state=42)
df['combined_text'] = df['Title'] + '. ' + df['Body'].fillna('')

# Data preprocessing
final_stop_words_list = stopwords.words('english') + ['...', 'incubator-mxnet', 'model', 'layer', 'incubator', 'mxnet']
df['clean_text'] = df['combined_text'].apply(advanced_cleaning)

# Split dataset
X = df['clean_text']
y = df['class']

# Define models and parameter grids
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=5000, solver='liblinear'),
        'params': [
            {
                'penalty': ['l2'],
                'C': [0.1, 1, 10],
                'class_weight': ['balanced', None]
            }
        ]
    },
    'SVM': {
        'model': SVC(probability=True),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'learning_rate': [0.1, 0.01],
            'n_estimators': [100, 200],
            'max_depth': [3, 5]
        }
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.1, 1, 10],
            'fit_prior': [True, False]
        }
    }
}

# Results container
results = defaultdict(list)

# Multiple experiment configuration
REPEAT = 30
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model_name, config in models.items():
    # Initialize temporary metric lists for each model
    acc_scores = []
    prec_scores = []
    rec_scores = []
    f1_scores = []
    auc_scores = []
    cv_scores = []
    for _ in range(REPEAT):
        # Data splitting
        train_idx, test_idx = next(kf.split(X))
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Feature engineering
        tfidf = TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=5000,
            stop_words=final_stop_words_list
        )
        X_train_tfidf = tfidf.fit_transform(X_train)
        X_test_tfidf = tfidf.transform(X_test)

        scaler = StandardScaler(with_mean=False)
        X_train_tfidf = scaler.fit_transform(X_train_tfidf)
        X_test_tfidf = scaler.transform(X_test_tfidf)

        # Handle class imbalance
        sm = SMOTE(random_state=42)
        X_train_res, y_train_res = sm.fit_resample(X_train_tfidf, y_train)

        # Randomized search
        n_iter = min(10, len(config['params']))
        grid = RandomizedSearchCV(
            config['model'],
            config['params'],
            cv=3,
            scoring='roc_auc',
            n_jobs=-1,
            n_iter=n_iter
        )
        grid.fit(X_train_res, y_train_res)

        # Best model prediction
        best_clf = grid.best_estimator_
        y_pred = best_clf.predict(X_test_tfidf)
        y_proba = best_clf.predict_proba(X_test_tfidf)[:, 1]

        # Calculate metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='macro', zero_division=1)
        rec = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        auc = roc_auc_score(y_test, y_proba)

        # Append metrics of each experiment to temporary lists
        acc_scores.append(acc)
        prec_scores.append(prec)
        rec_scores.append(rec)
        f1_scores.append(f1)
        auc_scores.append(auc)
        cv_scores.extend(grid.cv_results_['mean_test_score'])

    # Calculate the mean of each metric for each model
    results['Model'].append(model_name)
    results['repeated_times'].append(REPEAT)
    results['Accuracy'].append(np.mean(acc_scores))
    results['Precision'].append(np.mean(prec_scores))
    results['Recall'].append(np.mean(rec_scores))
    results['F1'].append(np.mean(f1_scores))
    results['AUC'].append(np.mean(auc_scores))
    results['CV_list(AUC)'].append(str(np.mean(cv_scores)))

# Save results
final_df = pd.DataFrame(results)
final_df.to_csv(output_file, index=False)
print(f"Optimized results saved to: {output_file}")
print(final_df.describe())

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Optimized results saved to: /content/ISE-solution/lab1/results/incubator-mxnet_improved.csv
       repeated_times  Accuracy  Precision    Recall        F1       AUC
count             5.0  5.000000   5.000000  5.000000  5.000000  5.000000
mean             30.0  0.871282   0.709366  0.591722  0.604785  0.821468
std               0.0  0.028403   0.115512  0.068505  0.083451  0.113870
min              30.0  0.834615   0.591682  0.551648  0.559944  0.661426
25%              30.0  0.855769   0.610544  0.554945  0.562105  0.751198
50%              30.0  0.875000   0.684479  0.559158  0.565581  0.854720
75%              30.0  0.880449   0.829162  0.580220  0.583128  0.900099
max              30.0  0.910577   0.830964  0.712637  0.753169  0.939899


In [13]:
########## 1. Import required libraries ##########

import pandas as pd
import numpy as np
import re
import math

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation and tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc)

# Classifier
from sklearn.naive_bayes import GaussianNB

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

########## 2. Define text preprocessing methods ##########

def remove_html(text):
    """Remove HTML tags using a regex."""
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    """Remove emojis using a regex pattern."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Stopwords
NLTK_stop_words_list = stopwords.words('english')
custom_stop_words_list = ['...']  # You can customize this list as needed
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list

def remove_stopwords(text):
    """Remove stopwords from the text."""
    return " ".join([word for word in str(text).split() if word not in final_stop_words_list])

def clean_str(string):
    """
    Clean text by removing non-alphanumeric characters,
    and convert it to lowercase.
    """
    string = re.sub(r"[^A-Za-z0-9(),.!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

########## 3. Download & read data ##########
import os
import subprocess
# Choose the project (options: 'pytorch', 'tensorflow', 'keras', 'incubator-mxnet', 'caffe')
project = 'keras'
path = '/content/ISE-solution/lab1/datasets/keras.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body into a single column; if Body is NaN, use Title only
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

# Keep only necessary columns: id, Number, sentiment, text (merged Title+Body)
pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})
pd_tplusb.to_csv('Title+Body.csv', index=False, columns=["id", "Number", "sentiment", "text"])

########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========

# 1) Data file to read
datafile = 'Title+Body.csv'

# 2) Number of repeated experiments
REPEAT = 30

# 3) Output CSV file name
out_csv_name = f'../{project}_NB.csv'

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)

# ========== Hyperparameter grid ==========
# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Lists to store metrics across repeated runs
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []

for repeated_time in range(REPEAT):
    # --- 4.1 Split into train/test ---
    indices = np.arange(data.shape[0])
    train_index, test_index = train_test_split(
        indices, test_size=0.2, random_state=repeated_time
    )

    train_text = data[text_col].iloc[train_index]
    test_text = data[text_col].iloc[test_index]

    y_train = data['sentiment'].iloc[train_index]
    y_test  = data['sentiment'].iloc[test_index]

    # --- 4.2 TF-IDF vectorization ---
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
    X_train = tfidf.fit_transform(train_text)
    X_test = tfidf.transform(test_text)

    # Convert sparse matrices to dense arrays
    X_train_dense = X_train.toarray()
    X_test_dense = X_test.toarray()

    # --- 4.3 Naive Bayes model & GridSearch ---
    clf = GaussianNB()
    grid = GridSearchCV(
        clf,
        params,
        cv=5,              # 5-fold CV (can be changed)
        scoring='roc_auc'  # Using roc_auc as the metric for selection
    )
    grid.fit(X_train_dense, y_train)  # Use dense array here

    # Retrieve the best model
    best_clf = grid.best_estimator_
    best_clf.fit(X_train_dense, y_train)  # Also use dense array here

    # --- 4.4 Make predictions & evaluate ---
    y_pred = best_clf.predict(X_test_dense)  # Use dense array for prediction

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    # Precision (macro)
    prec = precision_score(y_test, y_pred, average='macro')
    precisions.append(prec)

    # Recall (macro)
    rec = recall_score(y_test, y_pred, average='macro')
    recalls.append(rec)

    # F1 Score (macro)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(f1)

    # AUC
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
    auc_val = auc(fpr, tpr)
    auc_values.append(auc_val)

# --- 4.5 Aggregate results ---
final_accuracy  = np.mean(accuracies)
final_precision = np.mean(precisions)
final_recall    = np.mean(recalls)
final_f1        = np.mean(f1_scores)
final_auc       = np.mean(auc_values)

print("=== Naive Bayes + TF-IDF Results ===")
print(f"Number of repeats:     {REPEAT}")
print(f"Average Accuracy:      {final_accuracy:.4f}")
print(f"Average Precision:     {final_precision:.4f}")
print(f"Average Recall:        {final_recall:.4f}")
print(f"Average F1 score:      {final_f1:.4f}")
print(f"Average AUC:           {final_auc:.4f}")

# Save final results to CSV (append mode)
try:
    existing_data = pd.read_csv(out_csv_name, nrows=1)
    header_needed = False
except:
    header_needed = True

df_log = pd.DataFrame(
    {
        'repeated_times': [REPEAT],
        'Accuracy': [final_accuracy],
        'Precision': [final_precision],
        'Recall': [final_recall],
        'F1': [final_f1],
        'AUC': [final_auc],
        'CV_list(AUC)': [str(auc_values)]
    }
)

df_log.to_csv(out_csv_name, mode='a', header=header_needed, index=False)

print(f"\nResults have been saved to: {out_csv_name}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== Naive Bayes + TF-IDF Results ===
Number of repeats:     30
Average Accuracy:      0.5637
Average Precision:     0.6307
Average Recall:        0.6937
Average F1 score:      0.5450
Average AUC:           0.6937

Results have been saved to: ../keras_NB.csv


In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import os
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from collections import defaultdict

# Ensure nltk resources are available
nltk.download('stopwords')
nltk.download('wordnet')

# Text cleaning function definitions
def remove_html(text):
    """Remove HTML tags using a regex."""
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)
def remove_emoji(text):
    """Remove emojis using a regex pattern."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
# Configuration parameters
input_path = '/content/ISE-solution/lab1/datasets/keras.csv'
output_dir = '/content/ISE-solution/lab1/results'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'keras_improved.csv')
# Advanced text preprocessing
def advanced_cleaning(text):
    lemmatizer = WordNetLemmatizer()
    text = remove_html(text)
    text = remove_emoji(text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in final_stop_words_list]
    return ' '.join(words)
# Load data
df = pd.read_csv(input_path)
df = df.sample(frac=1, random_state=42)
df['combined_text'] = df['Title'] + '. ' + df['Body'].fillna('')
# Data preprocessing
final_stop_words_list = stopwords.words('english') + ['...', 'keras', 'model', 'layer']
df['clean_text'] = df['combined_text'].apply(advanced_cleaning)
# Split dataset
X = df['clean_text']
y = df['class']
# Define models and parameter grids
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': [
            {
                'penalty': ['l2'],
                'C': [0.1, 1, 10],
                'class_weight': ['balanced', None]
            },
            {
                'penalty': [None],
                'class_weight': ['balanced', None]
            }
        ]
    },
    'SVM': {
        'model': SVC(probability=True),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'learning_rate': [0.1, 0.01],
            'n_estimators': [100, 200],
            'max_depth': [3, 5]
        }
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.1, 1, 10],
            'fit_prior': [True, False]
        }
    }
}
# Results container
results = defaultdict(list)
# Multiple experiment configuration
REPEAT = 30
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for model_name, config in models.items():
    # Initialize temporary metric lists for each model
    acc_scores = []
    prec_scores = []
    rec_scores = []
    f1_scores = []
    auc_scores = []
    cv_scores = []
    for _ in range(REPEAT):
        # Data splitting
        train_idx, test_idx = next(kf.split(X))
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        # Feature engineering
        tfidf = TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=5000,
            stop_words=final_stop_words_list
        )
        X_train_tfidf = tfidf.fit_transform(X_train)
        X_test_tfidf = tfidf.transform(X_test)
        # Handle class imbalance
        sm = SMOTE(random_state=42)
        X_train_res, y_train_res = sm.fit_resample(X_train_tfidf, y_train)
        # Grid search
        grid = GridSearchCV(
            config['model'],
            config['params'],
            cv=3,
            scoring='roc_auc',
            n_jobs=-1
        )
        grid.fit(X_train_res, y_train_res)
        # Best model prediction
        best_clf = grid.best_estimator_
        y_pred = best_clf.predict(X_test_tfidf)
        y_proba = best_clf.predict_proba(X_test_tfidf)[:, 1]
        # Calculate metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='macro')
        rec = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        auc = roc_auc_score(y_test, y_proba)
        # Append metrics of each experiment to temporary lists
        acc_scores.append(acc)
        prec_scores.append(prec)
        rec_scores.append(rec)
        f1_scores.append(f1)
        auc_scores.append(auc)
        cv_scores.extend(grid.cv_results_['mean_test_score'])
    # Calculate the mean of each metric for each model
    results['Model'].append(model_name)
    results['repeated_times'].append(REPEAT)
    results['Accuracy'].append(np.mean(acc_scores))
    results['Precision'].append(np.mean(prec_scores))
    results['Recall'].append(np.mean(rec_scores))
    results['F1'].append(np.mean(f1_scores))
    results['AUC'].append(np.mean(auc_scores))
    results['CV_list(AUC)'].append(str(np.mean(cv_scores)))
# Save results
final_df = pd.DataFrame(results)
final_df.to_csv(output_file, index=False)
print(f"Optimized results saved to: {output_file}")
print(final_df.describe())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Optimized results saved to: /content/ISE-solution/lab1/results/keras_improved.csv
       repeated_times  Accuracy  Precision    Recall        F1       AUC
count             5.0  5.000000   5.000000  5.000000  5.000000  5.000000
mean             30.0  0.864677   0.762286  0.740301  0.737490  0.880725
std               0.0  0.050138   0.065727  0.057573  0.050655  0.061202
min              30.0  0.776119   0.652961  0.685365  0.669951  0.773684
25%              30.0  0.877363   0.770127  0.711842  0.714752  0.892865
50%              30.0  0.880597   0.771473  0.723684  0.743295  0.903070
75%              30.0  0.893781   0.786949  0.744737  0.751589  0.905906
max              30.0  0.895522   0.829918  0.835877  0.807865  0.928099


In [15]:
########## 1. Import required libraries ##########

import pandas as pd
import numpy as np
import re
import math

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation and tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc)

# Classifier
from sklearn.naive_bayes import GaussianNB

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

########## 2. Define text preprocessing methods ##########

def remove_html(text):
    """Remove HTML tags using a regex."""
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    """Remove emojis using a regex pattern."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Stopwords
NLTK_stop_words_list = stopwords.words('english')
custom_stop_words_list = ['...']  # You can customize this list as needed
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list

def remove_stopwords(text):
    """Remove stopwords from the text."""
    return " ".join([word for word in str(text).split() if word not in final_stop_words_list])

def clean_str(string):
    """
    Clean text by removing non-alphanumeric characters,
    and convert it to lowercase.
    """
    string = re.sub(r"[^A-Za-z0-9(),.!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

########## 3. Download & read data ##########
import os
import subprocess
# Choose the project (options: 'pytorch', 'tensorflow', 'keras', 'incubator-mxnet', 'caffe')
project = 'pytorch'
path = '/content/ISE-solution/lab1/datasets/pytorch.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body into a single column; if Body is NaN, use Title only
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

# Keep only necessary columns: id, Number, sentiment, text (merged Title+Body)
pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})
pd_tplusb.to_csv('Title+Body.csv', index=False, columns=["id", "Number", "sentiment", "text"])

########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========

# 1) Data file to read
datafile = 'Title+Body.csv'

# 2) Number of repeated experiments
REPEAT = 30

# 3) Output CSV file name
out_csv_name = f'../{project}_NB.csv'

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)

# ========== Hyperparameter grid ==========
# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Lists to store metrics across repeated runs
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []

for repeated_time in range(REPEAT):
    # --- 4.1 Split into train/test ---
    indices = np.arange(data.shape[0])
    train_index, test_index = train_test_split(
        indices, test_size=0.2, random_state=repeated_time
    )

    train_text = data[text_col].iloc[train_index]
    test_text = data[text_col].iloc[test_index]

    y_train = data['sentiment'].iloc[train_index]
    y_test  = data['sentiment'].iloc[test_index]

    # --- 4.2 TF-IDF vectorization ---
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
    X_train = tfidf.fit_transform(train_text)
    X_test = tfidf.transform(test_text)

    # Convert sparse matrices to dense arrays
    X_train_dense = X_train.toarray()
    X_test_dense = X_test.toarray()

    # --- 4.3 Naive Bayes model & GridSearch ---
    clf = GaussianNB()
    grid = GridSearchCV(
        clf,
        params,
        cv=5,              # 5-fold CV (can be changed)
        scoring='roc_auc'  # Using roc_auc as the metric for selection
    )
    grid.fit(X_train_dense, y_train)  # Use dense array here

    # Retrieve the best model
    best_clf = grid.best_estimator_
    best_clf.fit(X_train_dense, y_train)  # Also use dense array here

    # --- 4.4 Make predictions & evaluate ---
    y_pred = best_clf.predict(X_test_dense)  # Use dense array for prediction

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    # Precision (macro)
    prec = precision_score(y_test, y_pred, average='macro')
    precisions.append(prec)

    # Recall (macro)
    rec = recall_score(y_test, y_pred, average='macro')
    recalls.append(rec)

    # F1 Score (macro)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(f1)

    # AUC
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
    auc_val = auc(fpr, tpr)
    auc_values.append(auc_val)

# --- 4.5 Aggregate results ---
final_accuracy  = np.mean(accuracies)
final_precision = np.mean(precisions)
final_recall    = np.mean(recalls)
final_f1        = np.mean(f1_scores)
final_auc       = np.mean(auc_values)

print("=== Naive Bayes + TF-IDF Results ===")
print(f"Number of repeats:     {REPEAT}")
print(f"Average Accuracy:      {final_accuracy:.4f}")
print(f"Average Precision:     {final_precision:.4f}")
print(f"Average Recall:        {final_recall:.4f}")
print(f"Average F1 score:      {final_f1:.4f}")
print(f"Average AUC:           {final_auc:.4f}")

# Save final results to CSV (append mode)
try:
    existing_data = pd.read_csv(out_csv_name, nrows=1)
    header_needed = False
except:
    header_needed = True

df_log = pd.DataFrame(
    {
        'repeated_times': [REPEAT],
        'Accuracy': [final_accuracy],
        'Precision': [final_precision],
        'Recall': [final_recall],
        'F1': [final_f1],
        'AUC': [final_auc],
        'CV_list(AUC)': [str(auc_values)]
    }
)

df_log.to_csv(out_csv_name, mode='a', header=header_needed, index=False)

print(f"\nResults have been saved to: {out_csv_name}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== Naive Bayes + TF-IDF Results ===
Number of repeats:     30
Average Accuracy:      0.6402
Average Precision:     0.6120
Average Recall:        0.7542
Average F1 score:      0.5651
Average AUC:           0.7542

Results have been saved to: ../pytorch_NB.csv


In [16]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import os
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from collections import defaultdict

# Ensure nltk resources are available
nltk.download('stopwords')
nltk.download('wordnet')

# Text cleaning function definitions
def remove_html(text):
    """Remove HTML tags using a regex."""
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)
def remove_emoji(text):
    """Remove emojis using a regex pattern."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
# Configuration parameters
input_path = '/content/ISE-solution/lab1/datasets/pytorch.csv'
output_dir = '/content/ISE-solution/lab1/results'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'pytorch_improved.csv')
# Advanced text preprocessing
def advanced_cleaning(text):
    lemmatizer = WordNetLemmatizer()
    text = remove_html(text)
    text = remove_emoji(text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in final_stop_words_list]
    return ' '.join(words)
# Load data
df = pd.read_csv(input_path)
df = df.sample(frac=1, random_state=42)
df['combined_text'] = df['Title'] + '. ' + df['Body'].fillna('')
# Data preprocessing
final_stop_words_list = stopwords.words('english') + ['...', 'pytorch', 'model', 'layer']
df['clean_text'] = df['combined_text'].apply(advanced_cleaning)
# Split dataset
X = df['clean_text']
y = df['class']
# Define models and parameter grids
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': [
            {
                'penalty': ['l2'],
                'C': [0.1, 1, 10],
                'class_weight': ['balanced', None]
            },
            {
                'penalty': [None],
                'class_weight': ['balanced', None]
            }
        ]
    },
    'SVM': {
        'model': SVC(probability=True),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'learning_rate': [0.1, 0.01],
            'n_estimators': [100, 200],
            'max_depth': [3, 5]
        }
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.1, 1, 10],
            'fit_prior': [True, False]
        }
    }
}
# Results container
results = defaultdict(list)
# Multiple experiment configuration
REPEAT = 30
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for model_name, config in models.items():
    # Initialize temporary metric lists for each model
    acc_scores = []
    prec_scores = []
    rec_scores = []
    f1_scores = []
    auc_scores = []
    cv_scores = []
    for _ in range(REPEAT):
        # Data splitting
        train_idx, test_idx = next(kf.split(X))
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        # Feature engineering
        tfidf = TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=5000,
            stop_words=final_stop_words_list
        )
        X_train_tfidf = tfidf.fit_transform(X_train)
        X_test_tfidf = tfidf.transform(X_test)
        # Handle class imbalance
        sm = SMOTE(random_state=42)
        X_train_res, y_train_res = sm.fit_resample(X_train_tfidf, y_train)
        # Grid search
        grid = GridSearchCV(
            config['model'],
            config['params'],
            cv=3,
            scoring='roc_auc',
            n_jobs=-1
        )
        grid.fit(X_train_res, y_train_res)
        # Best model prediction
        best_clf = grid.best_estimator_
        y_pred = best_clf.predict(X_test_tfidf)
        y_proba = best_clf.predict_proba(X_test_tfidf)[:, 1]
        # Calculate metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='macro')
        rec = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        auc = roc_auc_score(y_test, y_proba)
        # Append metrics of each experiment to temporary lists
        acc_scores.append(acc)
        prec_scores.append(prec)
        rec_scores.append(rec)
        f1_scores.append(f1)
        auc_scores.append(auc)
        cv_scores.extend(grid.cv_results_['mean_test_score'])
    # Calculate the mean of each metric for each model
    results['Model'].append(model_name)
    results['repeated_times'].append(REPEAT)
    results['Accuracy'].append(np.mean(acc_scores))
    results['Precision'].append(np.mean(prec_scores))
    results['Recall'].append(np.mean(rec_scores))
    results['F1'].append(np.mean(f1_scores))
    results['AUC'].append(np.mean(auc_scores))
    results['CV_list(AUC)'].append(str(np.mean(cv_scores)))
# Save results
final_df = pd.DataFrame(results)
final_df.to_csv(output_file, index=False)
print(f"Optimized results saved to: {output_file}")
print(final_df.describe())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Optimized results saved to: /content/ISE-solution/lab1/results/pytorch_improved.csv
       repeated_times  Accuracy  Precision    Recall        F1       AUC
count             5.0  5.000000   5.000000  5.000000  5.000000  5.000000
mean             30.0  0.889316   0.821026  0.740117  0.759274  0.890857
std               0.0  0.018818   0.039055  0.086815  0.070314  0.027407
min              30.0  0.862914   0.775867  0.610991  0.640859  0.860367
25%              30.0  0.880795   0.787139  0.710001  0.748117  0.869488
50%              30.0  0.888300   0.825319  0.759022  0.798282  0.887489
75%              30.0  0.907285   0.851295  0.775919  0.802700  0.912073
max              30.0  0.907285   0.865509  0.844652  0.806410  0.924869


In [None]:
########## 1. Import required libraries ##########

import pandas as pd
import numpy as np
import re
import math

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation and tuning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_curve, auc)

# Classifier
from sklearn.naive_bayes import GaussianNB

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

########## 2. Define text preprocessing methods ##########

def remove_html(text):
    """Remove HTML tags using a regex."""
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    """Remove emojis using a regex pattern."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Stopwords
NLTK_stop_words_list = stopwords.words('english')
custom_stop_words_list = ['...']  # You can customize this list as needed
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list

def remove_stopwords(text):
    """Remove stopwords from the text."""
    return " ".join([word for word in str(text).split() if word not in final_stop_words_list])

def clean_str(string):
    """
    Clean text by removing non-alphanumeric characters,
    and convert it to lowercase.
    """
    string = re.sub(r"[^A-Za-z0-9(),.!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

########## 3. Download & read data ##########
import os
import subprocess
# Choose the project (options: 'pytorch', 'tensorflow', 'keras', 'incubator-mxnet', 'caffe')
project = 'tensorflow'
path = '/content/ISE-solution/lab1/datasets/tensorflow.csv'

pd_all = pd.read_csv(path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body into a single column; if Body is NaN, use Title only
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

# Keep only necessary columns: id, Number, sentiment, text (merged Title+Body)
pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})
pd_tplusb.to_csv('Title+Body.csv', index=False, columns=["id", "Number", "sentiment", "text"])

########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========

# 1) Data file to read
datafile = 'Title+Body.csv'

# 2) Number of repeated experiments
REPEAT = 30

# 3) Output CSV file name
out_csv_name = f'../{project}_NB.csv'

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Keep a copy for referencing original data if needed
original_data = data.copy()

# Text cleaning
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)

# ========== Hyperparameter grid ==========
# We use logspace for var_smoothing: [1e-12, 1e-11, ..., 1]
params = {
    'var_smoothing': np.logspace(-12, 0, 13)
}

# Lists to store metrics across repeated runs
accuracies  = []
precisions  = []
recalls     = []
f1_scores   = []
auc_values  = []

for repeated_time in range(REPEAT):
    # --- 4.1 Split into train/test ---
    indices = np.arange(data.shape[0])
    train_index, test_index = train_test_split(
        indices, test_size=0.2, random_state=repeated_time
    )

    train_text = data[text_col].iloc[train_index]
    test_text = data[text_col].iloc[test_index]

    y_train = data['sentiment'].iloc[train_index]
    y_test  = data['sentiment'].iloc[test_index]

    # --- 4.2 TF-IDF vectorization ---
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=1000  # Adjust as needed
    )
    X_train = tfidf.fit_transform(train_text)
    X_test = tfidf.transform(test_text)

    # Convert sparse matrices to dense arrays
    X_train_dense = X_train.toarray()
    X_test_dense = X_test.toarray()

    # --- 4.3 Naive Bayes model & GridSearch ---
    clf = GaussianNB()
    grid = GridSearchCV(
        clf,
        params,
        cv=5,              # 5-fold CV (can be changed)
        scoring='roc_auc'  # Using roc_auc as the metric for selection
    )
    grid.fit(X_train_dense, y_train)  # Use dense array here

    # Retrieve the best model
    best_clf = grid.best_estimator_
    best_clf.fit(X_train_dense, y_train)  # Also use dense array here

    # --- 4.4 Make predictions & evaluate ---
    y_pred = best_clf.predict(X_test_dense)  # Use dense array for prediction

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    # Precision (macro)
    prec = precision_score(y_test, y_pred, average='macro')
    precisions.append(prec)

    # Recall (macro)
    rec = recall_score(y_test, y_pred, average='macro')
    recalls.append(rec)

    # F1 Score (macro)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1_scores.append(f1)

    # AUC
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
    auc_val = auc(fpr, tpr)
    auc_values.append(auc_val)

# --- 4.5 Aggregate results ---
final_accuracy  = np.mean(accuracies)
final_precision = np.mean(precisions)
final_recall    = np.mean(recalls)
final_f1        = np.mean(f1_scores)
final_auc       = np.mean(auc_values)

print("=== Naive Bayes + TF-IDF Results ===")
print(f"Number of repeats:     {REPEAT}")
print(f"Average Accuracy:      {final_accuracy:.4f}")
print(f"Average Precision:     {final_precision:.4f}")
print(f"Average Recall:        {final_recall:.4f}")
print(f"Average F1 score:      {final_f1:.4f}")
print(f"Average AUC:           {final_auc:.4f}")

# Save final results to CSV (append mode)
try:
    existing_data = pd.read_csv(out_csv_name, nrows=1)
    header_needed = False
except:
    header_needed = True

df_log = pd.DataFrame(
    {
        'repeated_times': [REPEAT],
        'Accuracy': [final_accuracy],
        'Precision': [final_precision],
        'Recall': [final_recall],
        'F1': [final_f1],
        'AUC': [final_auc],
        'CV_list(AUC)': [str(auc_values)]
    }
)

df_log.to_csv(out_csv_name, mode='a', header=header_needed, index=False)

print(f"\nResults have been saved to: {out_csv_name}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== Naive Bayes + TF-IDF Results ===
Number of repeats:     30
Average Accuracy:      0.5551
Average Precision:     0.6349
Average Recall:        0.7106
Average F1 score:      0.5362
Average AUC:           0.7106

Results have been saved to: ../tensorflow_NB.csv


In [36]:

# Text and feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

# Text cleaning & stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Data balancing
from imblearn.over_sampling import SMOTE

########## 2. Define text preprocessing methods ##########
stemmer = PorterStemmer()

def remove_html(text):
    """Remove HTML tags using a regex."""
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    """Remove emojis using a regex pattern."""
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # enclosed characters
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

NLTK_stop_words_list = stopwords.words('english')

custom_stop_words_list = ['...', 'tensorflow', 'tensor', 'model', 'layer', 'function']
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list

def remove_stopwords(text):
    """Remove stopwords and technical terms from the text."""
    return " ".join([word for word in str(text).split() if word not in final_stop_words_list])

def clean_str(string):
    """
    Clean text by removing non-alphanumeric characters,
    handling special cases, and converting to lowercase.
    """
    string = re.sub(r"[^A-Za-z0-9(),.!?\'\`]", " ", string)
    string = re.sub(r"\r\n|\n|\r", " ", string)  # Handle newlines
    string = re.sub(r"\s{2,}", " ", string)       # Collapse multiple spaces
    string = re.sub(r"\\", "", string)            # Remove backslashes
    string = re.sub(r"\'", "", string)            # Remove apostrophes
    string = re.sub(r"\"", "", string)            # Remove quotes
    return string.strip().lower()

########## 3. Download & read data ##########
project = 'tensorflow'
input_path = '/content/ISE-solution/lab1/datasets/tensorflow.csv'
output_dir = '/content/ISE-solution/lab1/results'

# Create output directory if not exists
os.makedirs(output_dir, exist_ok=True)

pd_all = pd.read_csv(input_path)
pd_all = pd_all.sample(frac=1, random_state=999)  # Shuffle

# Merge Title and Body
pd_all['Title+Body'] = pd_all.apply(
    lambda row: row['Title'] + '. ' + row['Body'] if pd.notna(row['Body']) else row['Title'],
    axis=1
)

pd_tplusb = pd_all.rename(columns={
    "Unnamed: 0": "id",
    "class": "sentiment",
    "Title+Body": "text"
})

########## 4. Configure parameters & Start training ##########

# ========== Key Configurations ==========
datafile = 'Title+Body.csv'
REPEAT = 30
out_csv_name = os.path.join(output_dir, f'{project}_improved.csv')

# ========== Read and clean data ==========
data = pd.read_csv(datafile).fillna('')
text_col = 'text'

# Text cleaning pipeline
data[text_col] = data[text_col].apply(remove_html)
data[text_col] = data[text_col].apply(remove_emoji)
data[text_col] = data[text_col].apply(remove_stopwords)
data[text_col] = data[text_col].apply(clean_str)
data[text_col] = data[text_col].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))  # Stemming

# ========== Hyperparameter grid ==========
params = {
    'alpha': [0.01, 0.1, 1, 10],
    'fit_prior': [True, False]
}

# Lists to store metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []
auc_values = []

for repeated_time in range(REPEAT):
    # --- 4.1 Split into train/test ---
    indices = np.arange(data.shape[0])
    train_index, test_index = train_test_split(
        indices, test_size=0.2, random_state=repeated_time
    )

    train_text = data[text_col].iloc[train_index]
    test_text = data[text_col].iloc[test_index]

    y_train = data['sentiment'].iloc[train_index]
    y_test = data['sentiment'].iloc[test_index]

    # --- 4.2 TF-IDF vectorization ---
    tfidf = TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=2000,
        min_df=2,
        stop_words=final_stop_words_list
    )
    X_train = tfidf.fit_transform(train_text)
    X_test = tfidf.transform(test_text)

    # --- 4.3 Data balancing with SMOTE ---
    sm = SMOTE(random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train.toarray(), y_train)

    # --- 4.4 Naive Bayes model & GridSearch ---
    clf = MultinomialNB()
    grid = GridSearchCV(
        clf,
        params,
        cv=5,
        scoring='f1_macro'
    )
    grid.fit(X_train_res, y_train_res)

    best_clf = grid.best_estimator_
    best_clf.fit(X_train_res, y_train_res)

    # --- 4.5 Make predictions & evaluate ---
    y_pred = best_clf.predict(X_test.toarray())

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro')
    rec = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
    auc_val = auc(fpr, tpr)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1_scores.append(f1)
    auc_values.append(auc_val)

# --- 4.6 Aggregate results ---
final_accuracy = np.mean(accuracies)
final_precision = np.mean(precisions)
final_recall = np.mean(recalls)
final_f1 = np.mean(f1_scores)
final_auc = np.mean(auc_values)

print("=== Improved Naive Bayes + TF-IDF Results ===")
print(f"Number of repeats:     {REPEAT}")
print(f"Average Accuracy:      {final_accuracy:.4f}")
print(f"Average Precision:     {final_precision:.4f}")
print(f"Average Recall:        {final_recall:.4f}")
print(f"Average F1 score:      {final_f1:.4f}")
print(f"Average AUC:           {final_auc:.4f}")

# --- 4.7 Save results to CSV ---
df_log = pd.DataFrame({
    'repeated_times': [REPEAT],
    'Accuracy': [final_accuracy],
    'Precision': [final_precision],
    'Recall': [final_recall],
    'F1': [final_f1],
    'AUC': [final_auc],
    'CV_list(AUC)': [str(auc_values)]
})

df_log.to_csv(out_csv_name, mode='a', header=not os.path.exists(out_csv_name), index=False)
print(f"\nResults saved to: {out_csv_name}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== Improved Naive Bayes + TF-IDF Results ===
Number of repeats:     30
Average Accuracy:      0.8364
Average Precision:     0.6548
Average Recall:        0.6924
Average F1 score:      0.6656
Average AUC:           0.6924

Results saved to: /content/ISE-solution/lab1/results/tensorflow_improved.csv
