In [49]:
from utils import *
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import ssl
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score
# import xgboost as xgb

In [50]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /homes/ahf119/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /homes/ahf119/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Data

In [51]:
train_set, val_set, test_set = load_train_and_val()
test_set = test_set.fillna('')

## Preprocess Data

In [52]:
def process_text_df(data_set: pd.DataFrame) -> List[str]:
    """
    Process textual data from a given dataset by removing unwanted characters, converting to lowercase, 
    tokenizing, and removing stopwords.

    This function takes a dataset containing text data, cleans and preprocesses the text by removing 
    non-alphabetic characters, converting all text to lowercase, tokenizing the text into words, and 
    removing common English stopwords (e.g., 'the', 'a', 'we', etc.). The cleaned and processed text 
    is then rejoined into strings and returned as a list of preprocessed text entries.

    Parameters:
    - data_set (pd.DataFrame): A DataFrame where the column 'text' contains the textual data to be processed.

    Returns:
    - List[str]: A list of strings where each string is the preprocessed text corresponding to each entry in the input dataset.

    Note:
    - This function requires the 're' (Regular Expression), 'nltk.tokenize' (for word_tokenize), 
      and 'nltk.corpus' (for stopwords) modules. Ensure these are imported before calling the function.
    - The function assumes that the input DataFrame has a column named 'text' containing the text to process.
    - Stopwords are removed based on NLTK's English stopwords list. This list can be customized if different 
      stopwords are preferred.
    """
    preprocessed: List[str] = []
    for i in range(len(data_set['text'])):
        # Remove unwanted characters, and make lower case
        processed_string: str = re.sub('[^A-Za-z]', ' ', data_set['text'].iloc[i]).lower()

        # Split string into tokens
        word_tokenized: List[str] = word_tokenize(processed_string)

        # Remove stop words (the, we, a etc)
        filtered_words: List[str] = [word for word in word_tokenized if word not in stopwords.words('english')]

        # Joining the words together into a string
        processed_string = " ".join(filtered_words)
        preprocessed.append(processed_string)

    return preprocessed

In [53]:
preprocessed_train = process_text_df(train_set)
preprocessed_val = process_text_df(val_set)
preprocessed_test = process_text_df(test_set)

## Count words and do Naive Bayes

In [58]:
matrix = CountVectorizer(max_features=1000)

X_train = matrix.fit_transform(preprocessed_train).toarray()
y_train = train_set['bin_label']

X_val = matrix.transform(preprocessed_val).toarray()
y_val = val_set['bin_label']

X_test = matrix.transform(preprocessed_test).toarray()
y_test = test_set['bin_label']

In [59]:
classifier_NaiveBayes = GaussianNB()
classifier_NaiveBayes.fit(X_train, y_train)

In [60]:
y_pred = classifier_NaiveBayes.predict(X_val)
y_pred_test = classifier_NaiveBayes.predict(X_test)
y_pred_train = classifier_NaiveBayes.predict(X_train)

In [61]:
print("Train Accuracy is: {}".format(accuracy_score(y_true=y_train, y_pred=y_pred_train)))
print("Train F1 score is: {}".format(f1_score(y_true=y_train, y_pred=y_pred_train)))

print("Val Accuracy is: {}".format(accuracy_score(y_true=y_val, y_pred=y_pred)))
print("Val F1 score is: {}".format(f1_score(y_true=y_val, y_pred=y_pred)))

print("Test Accuracy is: {}".format(accuracy_score(y_true=y_test, y_pred=y_pred_test)))
print("Test F1 score is: {}".format(f1_score(y_true=y_test, y_pred=y_pred_test)))

Train Accuracy is: 0.4338109860116001
Train F1 score is: 0.24344654661499887
Val Accuracy is: 0.3963390370075607
Val F1 score is: 0.21683014971605577
Test Accuracy is: 0.37679083094555876
Test F1 score is: 0.19494139420111042


## Do TF-IDF preprocessing and NB

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [63]:
tfidf_vectorizer = TfidfVectorizer()
X_train_Tfidf = tfidf_vectorizer.fit_transform(preprocessed_train).toarray()
X_val_Tfidf = tfidf_vectorizer.transform(preprocessed_val).toarray()
X_test_Tfidf = tfidf_vectorizer.transform(preprocessed_test).toarray()

In [64]:
classifier_NaiveBayes = GaussianNB()
classifier_NaiveBayes.fit(X_train_Tfidf, y_train)

In [65]:
y_pred = classifier_NaiveBayes.predict(X_val_Tfidf)
y_pred_test = classifier_NaiveBayes.predict(X_test_Tfidf)
y_pred_train = classifier_NaiveBayes.predict(X_train_Tfidf)

In [66]:
print("Train Accuracy is: {}".format(accuracy_score(y_true=y_train, y_pred=y_pred_train)))
print("Train F1 score is: {}".format(f1_score(y_true=y_train, y_pred=y_pred_train)))

print("Val Accuracy is: {}".format(accuracy_score(y_true=y_val, y_pred=y_pred)))
print("Val F1 score is: {}".format(f1_score(y_true=y_val, y_pred=y_pred)))

print("Test Accuracy is: {}".format(accuracy_score(y_true=y_test, y_pred=y_pred_test)))
print("Test F1 score is: {}".format(f1_score(y_true=y_test, y_pred=y_pred_test)))

Train Accuracy is: 0.9820880245649949
Train F1 score is: 0.910941475826972
Val Accuracy is: 0.8643056108237167
Val F1 score is: 0.11886304909560723
Test Accuracy is: 0.8643744030563515
Test F1 score is: 0.08387096774193549


## Logistic Regression Counter

In [67]:
from sklearn.linear_model import LogisticRegression

In [68]:
classifier_logistic_reg = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = classifier_logistic_reg.predict(X_val)
y_pred_train = classifier_logistic_reg.predict(X_train)
y_pred_test = classifier_logistic_reg.predict(X_test)

In [69]:
print("Train Accuracy is: {}".format(accuracy_score(y_true=y_train, y_pred=y_pred_train)))
print("Train F1 score is: {}".format(f1_score(y_true=y_train, y_pred=y_pred_train)))

print("Val Accuracy is: {}".format(accuracy_score(y_true=y_val, y_pred=y_pred)))
print("Val F1 score is: {}".format(f1_score(y_true=y_val, y_pred=y_pred)))

print("Test Accuracy is: {}".format(accuracy_score(y_true=y_test, y_pred=y_pred_test)))
print("Test F1 score is: {}".format(f1_score(y_true=y_test, y_pred=y_pred_test)))

Train Accuracy is: 0.9379051518253156
Train F1 score is: 0.5260416666666666
Val Accuracy is: 0.8937524870672503
Val F1 score is: 0.2562674094707521
Test Accuracy is: 0.8901623686723973
Test F1 score is: 0.21768707482993196


## Logistic Regression TF IDF

In [70]:
classifier_logistic_reg = LogisticRegression(random_state=0).fit(X_train_Tfidf, y_train)
y_pred = classifier_logistic_reg.predict(X_val_Tfidf)
y_pred_train = classifier_logistic_reg.predict(X_train_Tfidf)
y_pred_test = classifier_NaiveBayes.predict(X_test_Tfidf)

In [71]:
print("Train Accuracy is: {}".format(accuracy_score(y_true=y_train, y_pred=y_pred_train)))
print("Train F1 score is: {}".format(f1_score(y_true=y_train, y_pred=y_pred_train)))

print("Val Accuracy is: {}".format(accuracy_score(y_true=y_val, y_pred=y_pred)))
print("Val F1 score is: {}".format(f1_score(y_true=y_val, y_pred=y_pred)))

print("Test Accuracy is: {}".format(accuracy_score(y_true=y_test, y_pred=y_pred_test)))
print("Test F1 score is: {}".format(f1_score(y_true=y_test, y_pred=y_pred_test)))

Train Accuracy is: 0.9097577618560219
Train F1 score is: 0.029357798165137616
Val Accuracy is: 0.8985276561878233
Val F1 score is: 0.015444015444015444
Test Accuracy is: 0.8643744030563515
Test F1 score is: 0.08387096774193549


# Utilising Augmented data

## Counter Naive Bayes

In [23]:
augmented_data = load_augmented()
preprocessed_train_aug = process_text_df(augmented_data)

inserted True
subbed True
back_translated True
deleted True
swapped True


In [24]:
# Contaneate original data and augmented data 
preprocessed_train_aug_and_non_aug =  preprocessed_train + preprocessed_train_aug
y_train_aug_and_non_aug = np.concatenate([train_set['bin_label'].to_numpy(), augmented_data['bin_label'].to_numpy()])

In [79]:
matrix = CountVectorizer(max_features=1000)

X_train = matrix.fit_transform(preprocessed_train_aug_and_non_aug).toarray()
y_train_aug_and_non_aug = np.concatenate([train_set['bin_label'].to_numpy(), augmented_data['bin_label'].to_numpy()])

X_val = matrix.transform(preprocessed_val).toarray()
y_val = val_set['bin_label']

X_test = matrix.transform(preprocessed_test).toarray()
y_test = test_set['bin_label']

In [80]:
classifier_NaiveBayes = GaussianNB()
classifier_NaiveBayes.fit(X_train, y_train_aug_and_non_aug)

In [81]:
y_pred = classifier_NaiveBayes.predict(X_val)
y_pred_train = classifier_NaiveBayes.predict(X_train)
y_pred_test = classifier_NaiveBayes.predict(X_test)

In [82]:
print("Train Accuracy is: {}".format(accuracy_score(y_true=y_train_aug_and_non_aug, y_pred=y_pred_train)))
print("Train F1 score is: {}".format(f1_score(y_true=y_train_aug_and_non_aug, y_pred=y_pred_train)))

print("Val Accuracy is: {}".format(accuracy_score(y_true=y_val, y_pred=y_pred)))
print("Val F1 score is: {}".format(f1_score(y_true=y_val, y_pred=y_pred)))

print("Test Accuracy is: {}".format(accuracy_score(y_true=y_test, y_pred=y_pred_test)))
print("Test F1 score is: {}".format(f1_score(y_true=y_test, y_pred=y_pred_test)))

Train Accuracy is: 0.7633899052237161
Train F1 score is: 0.7242840631822268
Val Accuracy is: 0.7353760445682451
Val F1 score is: 0.3137254901960784
Test Accuracy is: 0.7320916905444126
Test F1 score is: 0.3013698630136986


## TF IDF Naive Bayes

In [83]:
tfidf_vectorizer = TfidfVectorizer()
X_train_Tfidf = tfidf_vectorizer.fit_transform(preprocessed_train_aug_and_non_aug).toarray()
X_val_Tfidf = tfidf_vectorizer.transform(preprocessed_val).toarray()
X_test_Tfidf = tfidf_vectorizer.transform(preprocessed_test).toarray()

In [84]:
classifier_NaiveBayes = GaussianNB()
classifier_NaiveBayes.fit(X_train_Tfidf, y_train_aug_and_non_aug)

In [85]:
y_pred = classifier_NaiveBayes.predict(X_val_Tfidf)
y_pred_train = classifier_NaiveBayes.predict(X_train_Tfidf)
y_pred_test = classifier_NaiveBayes.predict(X_test_Tfidf)

In [86]:
print("Train Accuracy is: {}".format(accuracy_score(y_true=y_train_aug_and_non_aug, y_pred=y_pred_train)))
print("Train F1 score is: {}".format(f1_score(y_true=y_train_aug_and_non_aug, y_pred=y_pred_train)))

print("Val Accuracy is: {}".format(accuracy_score(y_true=y_val, y_pred=y_pred)))
print("Val F1 score is: {}".format(f1_score(y_true=y_val, y_pred=y_pred)))

print("Test Accuracy is: {}".format(accuracy_score(y_true=y_test, y_pred=y_pred_test)))
print("Test F1 score is: {}".format(f1_score(y_true=y_test, y_pred=y_pred_test)))

Train Accuracy is: 0.9657262508265374
Train F1 score is: 0.9601741580227943
Val Accuracy is: 0.7863111818543573
Val F1 score is: 0.15165876777251186
Test Accuracy is: 0.7874880611270296
Test F1 score is: 0.16822429906542055


## Counter Logistic Regression

In [87]:
classifier_logistic_reg = LogisticRegression(random_state=0).fit(X_train, y_train_aug_and_non_aug)
y_pred = classifier_logistic_reg.predict(X_val)
y_pred_train = classifier_logistic_reg.predict(X_train)
y_pred_test = classifier_logistic_reg.predict(X_test)

In [88]:
print("Train Accuracy is: {}".format(accuracy_score(y_true=y_train_aug_and_non_aug, y_pred=y_pred_train)))
print("Train F1 score is: {}".format(f1_score(y_true=y_train_aug_and_non_aug, y_pred=y_pred_train)))

print("Val Accuracy is: {}".format(accuracy_score(y_true=y_val, y_pred=y_pred)))
print("Val F1 score is: {}".format(f1_score(y_true=y_val, y_pred=y_pred)))

print("Test Accuracy is: {}".format(accuracy_score(y_true=y_test, y_pred=y_pred_test)))
print("Test F1 score is: {}".format(f1_score(y_true=y_test, y_pred=y_pred_test)))

Train Accuracy is: 0.8603702887370509
Train F1 score is: 0.8273136159193131
Val Accuracy is: 0.8070035813768405
Val F1 score is: 0.3178621659634318
Test Accuracy is: 0.7994269340974212
Test F1 score is: 0.2953020134228188


## Logistic Regression TF IDF

In [89]:
classifier_logistic_reg = LogisticRegression(random_state=0).fit(X_train_Tfidf, y_train_aug_and_non_aug)
y_pred = classifier_logistic_reg.predict(X_val_Tfidf)

np.save('predictions/val_logistic_regression.npy', y_pred)

y_pred_train = classifier_logistic_reg.predict(X_train_Tfidf)
y_pred_test = classifier_logistic_reg.predict(X_test_Tfidf)

In [90]:
print("Train Accuracy is: {}".format(accuracy_score(y_true=y_train_aug_and_non_aug, y_pred=y_pred_train)))
print("Train F1 score is: {}".format(f1_score(y_true=y_train_aug_and_non_aug, y_pred=y_pred_train)))

print("Val Accuracy is: {}".format(accuracy_score(y_true=y_val, y_pred=y_pred)))
print("Val F1 score is: {}".format(f1_score(y_true=y_val, y_pred=y_pred)))

print("Test Accuracy is: {}".format(accuracy_score(y_true=y_test, y_pred=y_pred_test)))
print("Test F1 score is: {}".format(f1_score(y_true=y_test, y_pred=y_pred_test)))

Train Accuracy is: 0.9473220189552568
Train F1 score is: 0.9348773841961853
Val Accuracy is: 0.8742538798249104
Val F1 score is: 0.40823970037453183
Test Accuracy is: 0.8638968481375359
Test F1 score is: 0.35079726651480636
