In [1]:
import os
import glob
import spacy
import random
import warnings
from matplotlib import pyplot as plt
from collections import Counter
import pandas as pd
import seaborn as sns

# seed Python's built-in `random` module so repeated runs draw the same numbers
random.seed(123)
# silence deprecation warnings and future warnings
for _ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)
# load the spaCy `en_core_web_lg` pipeline used by the preprocessing helpers
nlp = spacy.load('en_core_web_lg')

# function to load data
def load_data(base_dir):
    """Read every ``*.txt`` file under ``base_dir/positive`` and ``base_dir/negative``.

    Parameters
    ----------
    base_dir : str
        Directory containing the ``positive`` and ``negative`` sub-folders.

    Returns
    -------
    tuple of (list of str, list of int, list of str)
        Three parallel lists: the file contents, the labels (1 for files in
        ``positive``, 0 for files in ``negative``) and the file paths.
        Positive files come first; within each label the paths are sorted.
        A missing or empty sub-folder simply contributes no entries.
    """
    data = []
    labels = []
    files = []
    for label in ['positive', 'negative']:
        label_id = 1 if label == 'positive' else 0
        pattern = os.path.join(base_dir, label, '*.txt')
        # glob.glob returns paths in arbitrary (filesystem-dependent) order;
        # sorting makes the sample order, and anything derived from it,
        # reproducible across machines and runs
        for filepath in sorted(glob.glob(pattern)):
            with open(filepath, 'r', encoding='utf-8') as file:
                data.append(file.read())
            labels.append(label_id)
            files.append(filepath)

    return data, labels, files

# keep only the part of each text that precedes "What I've decided and why"
def clean_data(data):
    """Truncate each text at the first "What I've decided and why" marker.

    Texts that do not contain the marker are returned unchanged.
    Returns a new list; the input list is not modified.
    """
    marker = "What I've decided and why"
    return [text.partition(marker)[0] for text in data]


## TF-IDF vectorizer and Logistic Regression

In [2]:
# preprocess the texts
def preprocess_texts(texts):
    """Run every text through the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    texts : iterable of str
        Raw texts to analyse.

    Returns
    -------
    list
        One spaCy ``Doc`` per input text, in input order.
    """
    # nlp.pipe processes the texts in batches, which is spaCy's recommended
    # way to handle many texts and avoids the per-call overhead of nlp(text)
    return list(nlp.pipe(texts))

# function to remove stopwords and punctuation
def remove_stopwords_punctuation(docs):
    """Filter uninformative tokens out of every document.

    A token is dropped when it is a stop word, is punctuation, has a text
    equal to a newline or one of the titles 'Mr', 'Mrs', 'Miss', 'Ms', or
    has a text of length 1 or less. Returns one list of the surviving
    token objects per input document, in the original order.
    """
    excluded_texts = ['\n', 'Mr', 'Mrs', 'Miss', 'Ms']

    def keep(token):
        # same three filters as before, applied in a single pass
        if token.is_stop or token.is_punct:
            return False
        if token.text in excluded_texts:
            return False
        return len(token.text) > 1

    return [[token for token in doc if keep(token)] for doc in docs]

# lowercase and lemmatise the tokens
def lowercase_and_lemmatise(docs):
    """Replace every token by the lower-cased value of its ``lemma_`` attribute.

    Returns one list of strings per input document, in the original order.
    """
    return [[token.lemma_.lower() for token in doc] for doc in docs]

# join the tokens back together
def join_tokens(docs):
    """Turn each list of token strings into one space-separated string.

    Returns one string per input document, in the original order.
    """
    joined = []
    for tokens in docs:
        joined.append(' '.join(tokens))
    return joined

# load the raw training and test splits (texts, labels, source file paths)
train_data, train_labels, train_files = load_data('data/train')
test_data, test_labels, test_files = load_data('data/test')

# both splits go through the same ordered chain of text-preparation steps:
# truncate at the decision heading -> parse with spaCy -> drop stopwords and
# punctuation -> lowercase + lemmatise -> join tokens back into one string
preparation_steps = (
    clean_data,
    preprocess_texts,
    remove_stopwords_punctuation,
    lowercase_and_lemmatise,
    join_tokens,
)
for step in preparation_steps:
    # training split first, then test split, for every step
    train_data = step(train_data)
    test_data = step(test_data)

In [3]:
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Evaluate model and return metrics
def evaluate_model(model, test_data, test_labels, data_type='test'):
    """Score a fitted model on one data split, print the metrics and return them.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted estimator; ``model.predict(test_data)`` supplies the predictions.
    test_data : texts passed to ``model.predict``.
    test_labels : true labels that the predictions are compared against.
    data_type : str, default 'test'
        Name of the split; only used in the printed header line.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall', 'f1_score' and
        'confusion_matrix', holding the corresponding scikit-learn results.
    """
    predicted = model.predict(test_data)

    results = {
        'accuracy': accuracy_score(test_labels, predicted),
        'precision': precision_score(test_labels, predicted),
        'recall': recall_score(test_labels, predicted),
        'f1_score': f1_score(test_labels, predicted),
        'confusion_matrix': confusion_matrix(test_labels, predicted),
    }

    # (printed label, key in `results`) for the scalar metrics, in report order
    report_rows = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1 Score', 'f1_score'),
    )

    print(f'{data_type} data metrics:')
    for title, key in report_rows:
        print(f'{title}: {results[key]:.4f}')
    matrix = results['confusion_matrix']
    print(f'Confusion Matrix: \n{matrix}')

    return results


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# TF-IDF features feeding a logistic-regression classifier
logreg_text_clf_pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))

# hyper-parameter candidates, keyed as "<pipeline step name>__<parameter>"
param_grid_logreg = dict(
    tfidfvectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidfvectorizer__max_df=[0.9, 0.95],
    tfidfvectorizer__min_df=[2, 5],
    logisticregression__C=[0.1, 1, 10],
)

# 5-fold cross-validated grid search over every combination in the grid
grid_search_logreg = GridSearchCV(
    estimator=logreg_text_clf_pipeline,
    param_grid=param_grid_logreg,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_logreg.fit(train_data, train_labels)

# keep the best pipeline found by the search and report its parameters
best_logreg_model = grid_search_logreg.best_estimator_
print(grid_search_logreg.best_params_)

# metrics on the training split, then on the held-out test split
train_metrics_logreg = evaluate_model(
    best_logreg_model, train_data, train_labels, data_type='train'
)
test_metrics_logreg = evaluate_model(
    best_logreg_model, test_data, test_labels, data_type='test'
)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'logisticregression__C': 10, 'tfidfvectorizer__max_df': 0.9, 'tfidfvectorizer__min_df': 5, 'tfidfvectorizer__ngram_range': (1, 3)}
train data metrics:
Accuracy: 0.9951
Precision: 0.9927
Recall: 0.9975
F1 Score: 0.9951
Confusion Matrix: 
[[409   3]
 [  1 406]]
test data metrics:
Accuracy: 0.7415
Precision: 0.7526
Recall: 0.7157
F1 Score: 0.7337
Confusion Matrix: 
[[79 24]
 [29 73]]


In [5]:
# import matplotlib.pyplot as plt
# import numpy as np

# # function to plot the most important unigrams and bigrams
# def plot_top_coefficients(model, train_data, top_n=20):
#     # fit the vectorizer to the training data to get feature names
#     vectorizer = model.named_steps['tfidfvectorizer']
#     X_train_transformed = vectorizer.fit_transform(train_data)

#     # get the logistic regression model coefficients
#     log_reg = model.named_steps['logisticregression']
#     coefficients = log_reg.coef_.flatten()

#     # get feature names (unigrams and bigrams)
#     feature_names = vectorizer.get_feature_names_out()

#     # get the top positive and negative features (highest and lowest coefficients)
#     top_positive_coefficients = np.argsort(coefficients)[-top_n:]
#     top_negative_coefficients = np.argsort(coefficients)[:top_n]

#     # plot the most important unigrams and bigrams
#     top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
#     plt.figure(figsize=(15, 5))
#     colors = ['lightcoral' if c < 0 else 'lightblue' for c in coefficients[top_coefficients]]
#     plt.bar(np.arange(2 * top_n), coefficients[top_coefficients], color=colors)
#     feature_names = np.array(feature_names)
#     plt.xticks(np.arange(2 * top_n), feature_names[top_coefficients], rotation=60, ha='right')
#     plt.title(f'top {top_n//2} positive and negative unigrams and bigrams')
#     plt.show()

# # plot the top coefficients using the best model and training data
# plot_top_coefficients(best_logreg_model, train_data, top_n=20)


In [6]:
## TF-IDF vectorizer and SVC

In [7]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# TF-IDF features feeding a support-vector classifier
svm_text_clf_pipeline = make_pipeline(TfidfVectorizer(), SVC())

# Hyper-parameter candidates, keyed as "<pipeline step name>__<parameter>"
param_grid_svc = dict(
    tfidfvectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidfvectorizer__max_df=[0.9, 0.95],
    tfidfvectorizer__min_df=[2, 5],
    svc__C=[0.1, 1, 10],
)

# 5-fold cross-validated grid search over every combination in the grid
grid_search_svc = GridSearchCV(
    estimator=svm_text_clf_pipeline,
    param_grid=param_grid_svc,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_svc.fit(train_data, train_labels)

# Keep the best pipeline found by the search and report its parameters
best_svc_model = grid_search_svc.best_estimator_
print(grid_search_svc.best_params_)

# Metrics on the training split, a separator line, then the test split
train_metrics_svc = evaluate_model(
    best_svc_model, train_data, train_labels, data_type='Train'
)
print('-' * 50)
test_metrics_svc = evaluate_model(
    best_svc_model, test_data, test_labels, data_type='Test'
)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'svc__C': 10, 'tfidfvectorizer__max_df': 0.9, 'tfidfvectorizer__min_df': 5, 'tfidfvectorizer__ngram_range': (1, 3)}
Train data metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Confusion Matrix: 
[[412   0]
 [  0 407]]
--------------------------------------------------
Test data metrics:
Accuracy: 0.7415
Precision: 0.7634
Recall: 0.6961
F1 Score: 0.7282
Confusion Matrix: 
[[81 22]
 [31 71]]


In [9]:
## TF-IDF vectorizer and RandomForest

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Create a pipeline with TfidfVectorizer and RandomForestClassifier.
# random_state is fixed because the forest is a randomised estimator and
# seeding Python's `random` module does not seed scikit-learn; without it
# the selected parameters and the reported metrics change from run to run.
rf_text_clf_pipeline = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(random_state=123)
)

# Define parameter grid for GridSearchCV
# (keys follow the "<pipeline step name>__<parameter>" convention)
param_grid_rf = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidfvectorizer__max_df': [0.9, 0.95],
    'tfidfvectorizer__min_df': [2, 5],
    'randomforestclassifier__n_estimators': [100, 200, 300]
}

# Perform GridSearchCV: 5-fold cross-validation over every grid combination
grid_search_rf = GridSearchCV(rf_text_clf_pipeline, param_grid_rf, cv=5, n_jobs=-1, verbose=1)
grid_search_rf.fit(train_data, train_labels)

# Best model
best_rf_model = grid_search_rf.best_estimator_

# Print the best parameters
print(grid_search_rf.best_params_)
# Get model metrics on the training split and on the held-out test split
train_metrics_rf = evaluate_model(best_rf_model, train_data, train_labels, data_type='Train')
print('-'*50)
test_metrics_rf = evaluate_model(best_rf_model, test_data, test_labels, data_type='Test')


Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'randomforestclassifier__n_estimators': 100, 'tfidfvectorizer__max_df': 0.95, 'tfidfvectorizer__min_df': 5, 'tfidfvectorizer__ngram_range': (1, 3)}
Train data metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Confusion Matrix: 
[[412   0]
 [  0 407]]
--------------------------------------------------
Test data metrics:
Accuracy: 0.7171
Precision: 0.7157
Recall: 0.7157
F1 Score: 0.7157
Confusion Matrix: 
[[74 29]
 [29 73]]
