# Traditional machine learning-based approaches

## Naive Bayes

In [None]:
# Install imbalanced-learn (imported below as `imblearn`), which provides the SMOTE oversampler used to handle class imbalance
!pip install imbalanced-learn



In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE

# Function to set all seeds for reproducibility
def set_seed(seed):
    """Seed the global random number generators used in this notebook.

    Seeds Python's built-in ``random`` module and NumPy's legacy global RNG
    with the same value so repeated runs draw the same random numbers.

    Args:
        seed: Value forwarded unchanged to ``random.seed`` and ``np.random.seed``.
    """
    # Same order as before: standard library first, then NumPy.
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

# Seed Python's and NumPy's global RNGs once, before any splitting or resampling runs.
# (The splitters, SMOTE and the CV folds below additionally pass an explicit random_state=42.)
set_seed(42)

# Function to load a single dataset
def load_dataset(filename):
    """Read one CSV file and return its text and label columns.

    Args:
        filename: Path of the CSV file, passed straight to ``pd.read_csv``.
            The file must contain columns named ``text`` and ``labels``.

    Returns:
        A ``(texts, labels)`` tuple of pandas Series taken from the ``text``
        and ``labels`` columns respectively.

    Raises:
        KeyError: If either expected column is missing from the file.
    """
    frame = pd.read_csv(filename)
    texts, labels = (frame[column] for column in ("text", "labels"))
    return texts, labels

# Replace the numerical labels with the sentiment categories
def map_labels(label):
    """Translate a numeric sentiment code into its category name.

    Args:
        label: Sentiment code; 0, 1 and 2 are the recognised values.

    Returns:
        ``"negative"`` for 0, ``"neutral"`` for 1, ``"positive"`` for 2, and
        ``"unknown"`` for anything else.
    """
    # Compared with == in order, so equal-valued floats such as 1.0 still match.
    for code, sentiment in ((0, "negative"), (1, "neutral"), (2, "positive")):
        if label == code:
            return sentiment
    return "unknown"

# Function to perform sentiment analysis and generate a classification report and confusion matrix
def cross_val_analysis(X_train_val, y_train_val, X_test, y_test):
    """Cross-validate and test a TF-IDF + SMOTE + Multinomial Naive Bayes model.

    The vectoriser, the SMOTE oversampler and the classifier are chained in a
    single imbalanced-learn pipeline. During cross-validation the whole chain
    is refit on the training portion of every fold, so the held-out fold

    * never influences the TF-IDF vocabulary / IDF weights,
    * never contains synthetic SMOTE samples, and
    * is scored against its real (non-oversampled) labels.

    Oversampling the full training set *before* splitting it into folds leaks
    information between folds (synthetic points are interpolated from rows
    that end up in the held-out fold) and yields an over-optimistic
    cross-validation report.

    Args:
        X_train_val: Raw text documents used for cross-validation and for
            fitting the final model.
        y_train_val: Labels for ``X_train_val``. The confusion matrices are
            built for the classes "negative", "neutral" and "positive".
        X_test: Raw text documents of the held-out test set.
        y_test: Labels for ``X_test``.

    Returns:
        A tuple ``(report_cv, cm_cv, report_test, cm_test)``: the
        classification report (str) and confusion matrix (ndarray) of the
        out-of-fold predictions, followed by the same two items for the test
        set.
    """
    # Function-scope import keeps this cell self-contained. Unlike
    # sklearn.pipeline, the imblearn pipeline accepts samplers as steps and
    # runs them during fit only, never during predict.
    from imblearn.pipeline import make_pipeline as make_imb_pipeline

    class_labels = ["negative", "neutral", "positive"]

    model = make_imb_pipeline(
        TfidfVectorizer(),
        SMOTE(random_state=42),
        MultinomialNB(),
    )

    # Out-of-fold predictions on the original, non-resampled documents.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    y_pred_cv = cross_val_predict(model, X_train_val, y_train_val, cv=skf)

    # Cross-validation classification report and confusion matrix
    report_cv = classification_report(y_train_val, y_pred_cv, zero_division=0)
    cm_cv = confusion_matrix(y_train_val, y_pred_cv, labels=class_labels)

    # Train the final model on the entire training+validation set (TF-IDF is
    # fitted and SMOTE applied inside the pipeline) and predict the test set.
    model.fit(X_train_val, y_train_val)
    y_pred_test = model.predict(X_test)

    # Test set classification report and confusion matrix
    report_test = classification_report(y_test, y_pred_test, zero_division=0)
    cm_test = confusion_matrix(y_test, y_pred_test, labels=class_labels)

    return report_cv, cm_cv, report_test, cm_test

# List of dataset paths (one CSV file per decade, resolved relative to the working directory)
dataset_paths = ["1960s_gas.csv", "1970s_gas.csv", "1980s_gas.csv", "1990s_gas.csv"]

# Run the full load -> split -> cross-validate -> test workflow once per dataset
for dataset_path in dataset_paths:
    # Everything before the first "." in the path is used as the display name
    dataset_name = dataset_path.split(".")[0]
    print(f"Processing {dataset_name}...")

    # Load dataset (the 'text' and 'labels' columns)
    X, y = load_dataset(dataset_path)

    # Map numerical labels to sentiment categories for ground truth
    # (values other than 0/1/2 become "unknown", which is not among the confusion-matrix labels)
    y = y.apply(map_labels)

    # Split the dataset into 85% training+validation and 15% test sets, stratified by label
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    # Further split the training+validation set so that in the end there is a 70% training and 15% validation set
    # (0.1765 is approximately 0.15 / 0.85, i.e. 15% of the full dataset)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1765, random_state=42, stratify=y_train_val)

    # Combine training and validation sets for cross-validation
    # NOTE(review): this re-joins the two partitions created just above, so the result holds the
    # same rows as X_train_val / y_train_val in a different order; the validation set is not
    # used on its own anywhere in this cell.
    X_train_val_combined = pd.concat([X_train, X_val])
    y_train_val_combined = pd.concat([y_train, y_val])

    # Perform cross-validation analysis
    report_cv, cm_cv, report_test, cm_test = cross_val_analysis(X_train_val_combined, y_train_val_combined, X_test, y_test)

    # Print cross-validation classification report and confusion matrix
    print(f"Cross-Validation Classification Report for {dataset_name}:\n", report_cv)
    print(f"Cross-Validation Confusion Matrix for {dataset_name}:\n", cm_cv)
    print("-" * 50)

    # Print test set classification report and confusion matrix
    print(f"Test Set Classification Report for {dataset_name}:\n", report_test)
    print(f"Test Set Confusion Matrix for {dataset_name}:\n", cm_test)
    print("=" * 50)


Processing 1960s_gas...
Cross-Validation Classification Report for 1960s_gas:
               precision    recall  f1-score   support

    negative       0.67      0.97      0.79       187
     neutral       0.76      0.67      0.71       187
    positive       0.79      0.53      0.64       187

    accuracy                           0.72       561
   macro avg       0.74      0.72      0.71       561
weighted avg       0.74      0.72      0.71       561

Cross-Validation Confusion Matrix for 1960s_gas:
 [[181   3   3]
 [ 38 125  24]
 [ 51  36 100]]
--------------------------------------------------
Test Set Classification Report for 1960s_gas:
               precision    recall  f1-score   support

    negative       0.28      0.67      0.39        12
     neutral       0.36      0.20      0.26        20
    positive       0.52      0.39      0.45        33

    accuracy                           0.38        65
   macro avg       0.39      0.42      0.37        65
weighted avg       0

## Support Vector Machine

In [None]:
# Install scikit-learn and imbalanced-learn (the latter, imported below as `imblearn`, provides SMOTE for handling class imbalance)
!pip install scikit-learn imbalanced-learn



In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from imblearn.over_sampling import SMOTE

# Function to set all seeds for reproducibility
def set_seed(seed):
    """Seed the global random number generators used in this notebook.

    Seeds Python's built-in ``random`` module and NumPy's legacy global RNG
    with the same value so repeated runs draw the same random numbers.

    Args:
        seed: Value forwarded unchanged to ``random.seed`` and ``np.random.seed``.
    """
    # Same order as before: standard library first, then NumPy.
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

# Seed Python's and NumPy's global RNGs once, before any splitting or resampling runs.
# (The splitters, SMOTE, the SVC and the CV folds below additionally pass an explicit random_state=42.)
set_seed(42)

# Function to load a single dataset
def load_dataset(filename):
    """Read one CSV file and return its text and label columns.

    Args:
        filename: Path of the CSV file, passed straight to ``pd.read_csv``.
            The file must contain columns named ``text`` and ``labels``.

    Returns:
        A ``(texts, labels)`` tuple of pandas Series taken from the ``text``
        and ``labels`` columns respectively.

    Raises:
        KeyError: If either expected column is missing from the file.
    """
    frame = pd.read_csv(filename)
    texts, labels = (frame[column] for column in ("text", "labels"))
    return texts, labels

# Replace the numerical labels with the sentiment categories
def map_labels(label):
    """Translate a numeric sentiment code into its category name.

    Args:
        label: Sentiment code; 0, 1 and 2 are the recognised values.

    Returns:
        ``"negative"`` for 0, ``"neutral"`` for 1, ``"positive"`` for 2, and
        ``"unknown"`` for anything else.
    """
    # Compared with == in order, so equal-valued floats such as 1.0 still match.
    for code, sentiment in ((0, "negative"), (1, "neutral"), (2, "positive")):
        if label == code:
            return sentiment
    return "unknown"

# Function to perform sentiment analysis and generate classification report and confusion matrix
def cross_val_analysis(X_train_val, y_train_val, X_test, y_test):
    """Cross-validate and test a TF-IDF + SMOTE + linear-kernel SVM model.

    The vectoriser, the SMOTE oversampler and the classifier are chained in a
    single imbalanced-learn pipeline. During cross-validation the whole chain
    is refit on the training portion of every fold, so the held-out fold

    * never influences the TF-IDF vocabulary / IDF weights,
    * never contains synthetic SMOTE samples, and
    * is scored against its real (non-oversampled) labels.

    Oversampling the full training set *before* splitting it into folds leaks
    information between folds (synthetic points are interpolated from rows
    that end up in the held-out fold) and yields an over-optimistic
    cross-validation report.

    Args:
        X_train_val: Raw text documents used for cross-validation and for
            fitting the final model.
        y_train_val: Labels for ``X_train_val``. The confusion matrices are
            built for the classes "negative", "neutral" and "positive".
        X_test: Raw text documents of the held-out test set.
        y_test: Labels for ``X_test``.

    Returns:
        A tuple ``(report_cv, cm_cv, report_test, cm_test)``: the
        classification report (str) and confusion matrix (ndarray) of the
        out-of-fold predictions, followed by the same two items for the test
        set.
    """
    # Function-scope import keeps this cell self-contained. Unlike
    # sklearn.pipeline, the imblearn pipeline accepts samplers as steps and
    # runs them during fit only, never during predict.
    from imblearn.pipeline import make_pipeline as make_imb_pipeline

    class_labels = ["negative", "neutral", "positive"]

    model = make_imb_pipeline(
        TfidfVectorizer(),
        SMOTE(random_state=42),
        SVC(kernel='linear', random_state=42),  # Linear kernel as an example; other kernels can be substituted
    )

    # Out-of-fold predictions on the original, non-resampled documents.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    y_pred_cv = cross_val_predict(model, X_train_val, y_train_val, cv=skf)

    # Cross-validation classification report and confusion matrix
    report_cv = classification_report(y_train_val, y_pred_cv, zero_division=0)
    cm_cv = confusion_matrix(y_train_val, y_pred_cv, labels=class_labels)

    # Train the final model on the entire training+validation set (TF-IDF is
    # fitted and SMOTE applied inside the pipeline) and predict the test set.
    model.fit(X_train_val, y_train_val)
    y_pred_test = model.predict(X_test)

    # Test set classification report and confusion matrix
    report_test = classification_report(y_test, y_pred_test, zero_division=0)
    cm_test = confusion_matrix(y_test, y_pred_test, labels=class_labels)

    return report_cv, cm_cv, report_test, cm_test

# List of dataset paths (one CSV file per decade, resolved relative to the working directory)
dataset_paths = ["1960s_gas.csv", "1970s_gas.csv", "1980s_gas.csv", "1990s_gas.csv"]

# Run the full load -> split -> cross-validate -> test workflow once per dataset
for dataset_path in dataset_paths:
    # Everything before the first "." in the path is used as the display name
    dataset_name = dataset_path.split(".")[0]
    print(f"Processing {dataset_name}...")

    # Load dataset (the 'text' and 'labels' columns)
    X, y = load_dataset(dataset_path)

    # Map numerical labels to sentiment categories for ground truth
    # (values other than 0/1/2 become "unknown", which is not among the confusion-matrix labels)
    y = y.apply(map_labels)

    # Split the dataset into 85% training+validation and 15% test sets, stratified by label
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    # Further split the training+validation set so that in the end there are 70% training and 15% validation sets
    # (0.1765 is approximately 0.15 / 0.85, i.e. 15% of the full dataset)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1765, random_state=42, stratify=y_train_val)

    # Combine training and validation sets for cross-validation
    # NOTE(review): this re-joins the two partitions created just above, so the result holds the
    # same rows as X_train_val / y_train_val in a different order; the validation set is not
    # used on its own anywhere in this cell.
    X_train_val_combined = pd.concat([X_train, X_val])
    y_train_val_combined = pd.concat([y_train, y_val])

    # Perform cross-validation analysis
    report_cv, cm_cv, report_test, cm_test = cross_val_analysis(X_train_val_combined, y_train_val_combined, X_test, y_test)

    # Print cross-validation classification report and confusion matrix
    print(f"Cross-Validation Classification Report for {dataset_name}:\n", report_cv)
    print(f"Cross-Validation Confusion Matrix for {dataset_name}:\n", cm_cv)
    print("-" * 50)

    # Print test set classification report and confusion matrix
    print(f"Test Set Classification Report for {dataset_name}:\n", report_test)
    print(f"Test Set Confusion Matrix for {dataset_name}:\n", cm_test)
    print("=" * 50)


Processing 1960s_gas...
Cross-Validation Classification Report for 1960s_gas:
               precision    recall  f1-score   support

    negative       0.89      0.91      0.90       187
     neutral       0.78      0.71      0.74       187
    positive       0.70      0.74      0.72       187

    accuracy                           0.79       561
   macro avg       0.79      0.79      0.79       561
weighted avg       0.79      0.79      0.79       561

Cross-Validation Confusion Matrix for 1960s_gas:
 [[171   2  14]
 [  9 132  46]
 [ 12  36 139]]
--------------------------------------------------
Test Set Classification Report for 1960s_gas:
               precision    recall  f1-score   support

    negative       0.33      0.33      0.33        12
     neutral       0.18      0.10      0.13        20
    positive       0.52      0.67      0.59        33

    accuracy                           0.43        65
   macro avg       0.35      0.37      0.35        65
weighted avg       0