# Imports

In [3]:
import re

import glob
import numpy as np
import csv
import sklearn.feature_extraction
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from scipy import sparse



# Bernoulli NB

In [3]:
class BernoulliNaiveBayes(BaseEstimator, ClassifierMixin):
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_log_prior_ = None
        self.feature_log_prob_ = None
        self.classes_ = None

    def fit(self, X, y):
        unique_y = np.unique(y)
        self.classes_ = unique_y
        count_sample = X.shape[0]
        self.class_log_prior_ = np.log([np.sum(y == yi) / count_sample for yi in unique_y])

        self.feature_log_prob_ = np.array([
         np.log((X[y == yi].sum(axis=0) + self.alpha)
                (np.sum(y == yi) + 2 * self.alpha))
            for yi in unique_y
        ])
        return self


    def predict(self, X):
        # Convert to dense array if X is a sparse matrix
        if sparse.issparse(X):
            X = X.toarray()

        # Calculate log probabilities for each class
        # Ensure that the operations are performed correctly on matrices/arrays
        log_probs = X.dot(self.feature_log_prob_.T) + self.class_log_prior_[np.newaxis, :]

        # Select the class with the highest log probability
        return self.classes_[np.argmax(log_probs, axis=1)]


# Multinomial NB

In [4]:
class MultinomialNaiveBayes(BaseEstimator, ClassifierMixin):
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_log_prior_ = None
        self.feature_log_prob_ = None
        self.classes_ = None

    def fit(self, X, y):
        unique_y = np.unique(y)
        self.classes_ = unique_y
        count_sample = X.shape[0]
        self.class_log_prior_ = np.log([np.sum(y == yi) / count_sample for yi in unique_y])

        self.feature_log_prob_ = np.array([
            np.log((X[y == yi].sum(axis=0) + self.alpha) / (X[y == yi].sum() + self.alpha * X.shape[1]))
            for yi in unique_y
        ])
        return self

    def predict(self, X):
        return np.array([self.classes_[np.argmax(
            self.class_log_prior_ + X_test.dot(self.feature_log_prob_.T))
                         ] for X_test in X])

    def predict_proba(self, X):
        return np.array([(self.class_log_prior_ + X_test.dot(self.feature_log_prob_.T)).exp()
                         for X_test in X])


# Gaussian NB

In [5]:
class GaussianNaiveBayes(BaseEstimator, ClassifierMixin):
    def __init__(self, epsilon=1e-9):
        self.epsilon = epsilon
        self.means_ = None
        self.variances_ = None
        self.class_prior_ = None
        self.classes_ = None

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        n_features = X.shape[1]
        n_classes = len(self.classes_)

        self.means_ = np.zeros((n_classes, n_features))
        self.variances_ = np.zeros((n_classes, n_features))
        self.class_prior_ = np.zeros(n_classes)

        for idx, c in enumerate(self.classes_):
            X_c = X[y == c]
            self.means_[idx, :] = X_c.mean(axis=0)
            # Add epsilon for variance smoothing
            self.variances_[idx, :] = X_c.var(axis=0) + self.epsilon
            self.class_prior_[idx] = X_c.shape[0] / float(X.shape[0])
        return self

    def _calculate_log_likelihood(self, X, class_idx):
        mean = self.means_[class_idx]
        variance = self.variances_[class_idx]
        exponent = np.exp(-(X - mean) ** 2 / (2 * variance))
        return np.sum(-0.5 * np.log(2 * np.pi * variance) - exponent / (2 * variance), axis=1)

    def predict(self, X):
        log_likelihoods = np.array([self._calculate_log_likelihood(X, i) for i in range(len(self.classes_))]).T
        log_prior = np.log(self.class_prior_)
        log_posterior = log_likelihoods + log_prior
        return self.classes_[np.argmax(log_posterior, axis=1)]



# English Dataset

In [6]:
'''
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}

@data{yfwt-wr77-20,
doi = {10.21227/yfwt-wr77},
url = {https://dx.doi.org/10.21227/yfwt-wr77},
author = {Tan, Songbo},
publisher = {IEEE Dataport},
title = {ChnSentiCorp},
year = {2020} 
}

@online{allocine_dataset,
  title        = {Allocine Dataset},
  author       = {{Théophile Blard}},
  year         = {Year the dataset was accessed},
  url          = {https://github.com/TheophileBlard/french-sentiment-analysis-with-bert/blob/master/allocine_dataset/data.tar.bz2},
  note         = {Accessed: Date you accessed the dataset},
}
@misc{stopwords_zh_2023,
  author = {{stopwords-iso}},
  title = {Chinese Stopwords List},
  year = {2023},
  url = {https://github.com/stopwords-iso/stopwords-zh/blob/master/stopwords-zh.txt},
  note = {GitHub repository}
}

'''

'\n@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},\n  title     = {Learning Word Vectors for Sentiment Analysis},\n  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n  month     = {June},\n  year      = {2011},\n  address   = {Portland, Oregon, USA},\n  publisher = {Association for Computational Linguistics},\n  pages     = {142--150},\n  url       = {http://www.aclweb.org/anthology/P11-1015}\n}\n\n@data{yfwt-wr77-20,\ndoi = {10.21227/yfwt-wr77},\nurl = {https://dx.doi.org/10.21227/yfwt-wr77},\nauthor = {Tan, Songbo},\npublisher = {IEEE Dataport},\ntitle = {ChnSentiCorp},\nyear = {2020} \n}\n\n@online{allocine_dataset,\n  title        = {Allocine Dataset},\n  author       = {{Théophile Blard}},\n  year         = {Year the dataset was accessed},\n  url          = {https:/

In [7]:
train_df = pd.read_csv("data/english/aclImdb/train.tsv", delimiter='\t', encoding='utf-8')
test_df = pd.read_csv("data/english/aclImdb/train.tsv", delimiter='\t', encoding='utf-8')

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

train_df['text_a'] = train_df['text_a'].apply(clean_text)
test_df['text_a'] = test_df['text_a'].apply(clean_text)

## English Bernoulli NB

In [8]:

from sklearn.naive_bayes import BernoulliNB

vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(train_df['text_a'])
y = train_df['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {'alpha': [0.1, 0.5, 1, 2, 5]}
grid_search = GridSearchCV(BernoulliNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Score:", grid_search.best_score_)

best_bernoulli_model = grid_search.best_estimator_

y_val_pred = best_bernoulli_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")


X_test = vectorizer.transform(test_df['text_a'])
y_test = test_df['label']
y_test_pred = best_bernoulli_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Set Accuracy: {test_accuracy}")
print(f"Test Set F1 Score: {test_f1}")

Best Parameters: {'alpha': 0.5}
Best Cross-Validated Score: 0.8452
Validation Set Accuracy: 0.8472
Validation Set F1 Score: 0.8394957983193277
Test Set Accuracy: 0.90324
Test Set F1 Score: 0.8992796768955323


## English Multinomial NB

In [9]:

from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer(binary=False)  
X = vectorizer.fit_transform(train_df['text_a'])
y = train_df['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {'alpha': [0.1, 0.5, 1, 2, 5]}
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Score:", grid_search.best_score_)

best_multinomial_model = grid_search.best_estimator_
y_val_pred = best_multinomial_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")


X_test = vectorizer.transform(test_df['text_a'])
y_test = test_df['label']
y_test_pred = best_multinomial_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Set Accuracy: {test_accuracy}")
print(f"Test Set F1 Score: {test_f1}")


Best Parameters: {'alpha': 5}
Best Cross-Validated Score: 0.8572
Validation Set Accuracy: 0.8566
Validation Set F1 Score: 0.8532842234499692
Test Set Accuracy: 0.88872
Test Set F1 Score: 0.8866987048953328


## English Gaussian NB

In [10]:

from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_df['text_a']).toarray()
y = train_df['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = make_pipeline(StandardScaler(), GaussianNB())
model.fit(X_train, y_train)


y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")



X_test = vectorizer.transform(test_df['text_a']).toarray() 
y_test = test_df['label']
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Set Accuracy: {test_accuracy}")
print(f"Test Set F1 Score: {test_f1}")


Validation Set Accuracy: 0.6374
Validation Set F1 Score: 0.5751113194281696
Test Set Accuracy: 0.83456
Test Set F1 Score: 0.8118289353958145


# French Dataset

In [11]:
train_df = pd.read_csv("data/french/data/train.tsv", delimiter='\t', encoding='utf-8')
test_df = pd.read_csv("data/french/data/train.tsv", delimiter='\t', encoding='utf-8')
val_df = pd.read_csv("data/french/data/val.tsv", delimiter='\t', encoding='utf-8')

stop_words = set(stopwords.words('french'))

def clean_text(text):
    text = text.lower()
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

train_df['text_a'] = train_df['text_a'].apply(clean_text)
test_df['text_a'] = test_df['text_a'].apply(clean_text)
val_df['text_a'] = val_df['text_a'].apply(clean_text)

## French Bernoulli NB

In [12]:
from sklearn.naive_bayes import BernoulliNB

vectorizer = CountVectorizer(binary=True)
X_train = vectorizer.fit_transform(train_df['text_a'])
y_train = train_df['label']

X_val = vectorizer.transform(val_df['text_a'])
y_val = val_df['label']

param_grid = {'alpha': [0.1, 0.5, 1, 2, 5]}
grid_search = GridSearchCV(BernoulliNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Score:", grid_search.best_score_)

best_bernoulli_model = grid_search.best_estimator_
y_val_pred = best_bernoulli_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")


X_test = vectorizer.transform(test_df['text_a'])
y_test = test_df['label']
y_test_pred = best_bernoulli_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Set Accuracy: {test_accuracy}")
print(f"Test Set F1 Score: {test_f1}")


Best Parameters: {'alpha': 5}
Best Cross-Validated Score: 0.858967410405097
Validation Set Accuracy: 0.866870122073244
Validation Set F1 Score: 0.8550100800958971
Test Set Accuracy: 0.8717640954470721
Test Set F1 Score: 0.864569372462285


## French Multinomial NB

In [13]:
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer(binary=False)  
X_train = vectorizer.fit_transform(train_df['text_a'])
y_train = train_df['label']

X_val = vectorizer.transform(val_df['text_a'])
y_val = val_df['label']

param_grid = {'alpha': [0.1, 0.5, 1, 2, 5]}
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Score:", grid_search.best_score_)

best_multinomial_model = grid_search.best_estimator_
y_val_pred = best_multinomial_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")


X_test = vectorizer.transform(test_df['text_a'])
y_test = test_df['label']
y_test_pred = best_multinomial_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Set Accuracy: {test_accuracy}")
print(f"Test Set F1 Score: {test_f1}")


Best Parameters: {'alpha': 5}
Best Cross-Validated Score: 0.8978201404194077
Validation Set Accuracy: 0.9021412847708625
Validation Set F1 Score: 0.8997539975399754
Test Set Accuracy: 0.911110694348068
Test Set F1 Score: 0.911869890480411


## French Gaussian NB

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['text_a']).toarray()
y_train = train_df['label']

X_val = vectorizer.transform(val_df['text_a']).toarray()
y_val = val_df['label']

model = make_pipeline(StandardScaler(), GaussianNB())
model.fit(X_train, y_train)


y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")



X_test = vectorizer.transform(test_df['text_a']).toarray() 
y_test = test_df['label']
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Set Accuracy: {test_accuracy}")
print(f"Test Set F1 Score: {test_f1}")


Validation Set Accuracy: 0.73
Validation Set F1 Score: 0.696060037523452
Test Set Accuracy: 0.7291666666666666
Test Set F1 Score: 0.7037374658158614


# Chinese Dataset

In [1]:
import jieba
import pandas as pd


train_df = pd.read_csv("data/chinese/chnsenticorp-main/train.tsv", delimiter='\t', encoding='utf-8')
test_df = pd.read_csv("data/chinese/chnsenticorp-main/test.tsv", delimiter='\t', encoding='utf-8')
val_df = pd.read_csv("data/chinese/chnsenticorp-main/dev.tsv", delimiter='\t', encoding='utf-8')

def load_stop_words(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        stop_words = set(file.read().splitlines())
    return stop_words

stop_words = load_stop_words('data/chinese/stopwords-zh.txt')  # Replace with your stop words list

def clean_text(text):
    words = jieba.cut(text)
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

train_df['text_a'] = train_df['text_a'].apply(clean_text)
test_df['text_a'] = test_df['text_a'].apply(clean_text)
val_df['text_a'] = val_df['text_a'].apply(clean_text)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/8y/pwclkfq17dgfr91dw_mc9flr0000gn/T/jieba.cache
Loading model cost 0.637 seconds.
Prefix dict has been built successfully.


## Chinese Bernoulli NB

In [4]:
from sklearn.naive_bayes import BernoulliNB

vectorizer = CountVectorizer(binary=True)
X_train = vectorizer.fit_transform(train_df['text_a'])
y_train = train_df['label']

X_val = vectorizer.transform(val_df['text_a'])
y_val = val_df['label']

param_grid = {'alpha': [0.1, 0.5, 1, 2, 5]}
grid_search = GridSearchCV(BernoulliNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Score:", grid_search.best_score_)

best_bernoulli_model = grid_search.best_estimator_
y_val_pred = best_bernoulli_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")


X_test = vectorizer.transform(test_df['text_a'])
y_test = test_df['label']
y_test_pred = best_bernoulli_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Set Accuracy: {test_accuracy}")
print(f"Test Set F1 Score: {test_f1}")


Best Parameters: {'alpha': 0.5}
Best Cross-Validated Score: 0.849769081614666
Validation Set Accuracy: 0.8466666666666667
Validation Set F1 Score: 0.8327272727272728
Test Set Accuracy: 0.865
Test Set F1 Score: 0.8617747440273037


## Chinese Multinomial NB

In [5]:
from sklearn.naive_bayes import MultinomialNB

vectorizer = CountVectorizer(binary=False)  
X_train = vectorizer.fit_transform(train_df['text_a'])
y_train = train_df['label']

X_val = vectorizer.transform(val_df['text_a'])
y_val = val_df['label']

param_grid = {'alpha': [0.1, 0.5, 1, 2, 5]}
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Score:", grid_search.best_score_)

best_multinomial_model = grid_search.best_estimator_
y_val_pred = best_multinomial_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")

X_test = vectorizer.transform(test_df['text_a'])
y_test = test_df['label']
y_test_pred = best_multinomial_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Set Accuracy: {test_accuracy}")
print(f"Test Set F1 Score: {test_f1}")


Best Parameters: {'alpha': 0.1}
Best Cross-Validated Score: 0.837742084868258
Validation Set Accuracy: 0.8475
Validation Set F1 Score: 0.8437233134073442
Test Set Accuracy: 0.8441666666666666
Test Set F1 Score: 0.8495575221238939


## Chinese Gaussian NB

In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['text_a']).toarray()
y_train = train_df['label']

X_val = vectorizer.transform(val_df['text_a']).toarray()
y_val = val_df['label']

model = make_pipeline(StandardScaler(), GaussianNB())
model.fit(X_train, y_train)


y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Set Accuracy: {val_accuracy}")
print(f"Validation Set F1 Score: {val_f1}")



X_test = vectorizer.transform(test_df['text_a']).toarray() 
y_test = test_df['label']
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Set Accuracy: {test_accuracy}")
print(f"Test Set F1 Score: {test_f1}")


Validation Set Accuracy: 0.73
Validation Set F1 Score: 0.696060037523452
Test Set Accuracy: 0.7291666666666666
Test Set F1 Score: 0.7037374658158614
