In [None]:
import os
import glob
import spacy
import random
import warnings
from matplotlib import pyplot as plt
from collections import Counter
import pandas as pd
import seaborn as sns

# seed Python's built-in random module so that any use of it is repeatable
random.seed(123)
# turn off deprecation warnings and future warnings
# (filterwarnings takes a single category per call, hence the loop)
for _category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_category)
# load the spaCy 'en_core_web_lg' pipeline used for tokenisation and lemmatisation below
nlp = spacy.load('en_core_web_lg')

# function to load data
def load_data(base_dir):
    """Read every ``*.txt`` file under ``base_dir/positive`` and ``base_dir/negative``.

    Paths are visited in sorted order so that the returned lists are the same
    on every run and platform; ``glob`` itself gives no ordering guarantee, and
    sample order affects anything order-sensitive downstream (for example
    un-shuffled cross-validation folds).

    Args:
        base_dir: directory that contains the ``positive`` and ``negative``
            sub-directories.

    Returns:
        A tuple ``(data, labels, files)`` of three parallel lists: the file
        contents, the label (1 for ``positive``, 0 for ``negative``) and the
        path each text was read from. Positive samples come first. Missing
        sub-directories simply contribute no samples.
    """
    data = []
    labels = []
    files = []
    for label in ['positive', 'negative']:
        label_id = 1 if label == 'positive' else 0
        pattern = os.path.join(base_dir, label, '*.txt')
        for filepath in sorted(glob.glob(pattern)):
            with open(filepath, 'r', encoding='utf-8') as file:
                data.append(file.read())
            labels.append(label_id)
            files.append(filepath)

    return data, labels, files

# keep only the part of each text that precedes "What I've decided and why"
def clean_data(data):
    """Cut every text off at the first "What I've decided and why" heading.

    Args:
        data: iterable of strings.

    Returns:
        A new list with, for each input text, everything before the first
        occurrence of the heading. Texts without the heading are returned whole.
    """
    marker = "What I've decided and why"
    return [text.partition(marker)[0] for text in data]


## Tfidf vectorizer and Logistic Regression

In [None]:
# preprocess the texts
def preprocess_texts(texts):
    """Run every text through the module-level spaCy pipeline ``nlp``.

    Uses ``nlp.pipe``, spaCy's streaming interface for processing many texts,
    which batches the work internally instead of invoking the full pipeline
    once per text. Docs are returned in the same order as ``texts``.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one spaCy ``Doc`` per input text.
    """
    return list(nlp.pipe(texts))

# function to remove stopwords and punctuation
def remove_stopwords_punctuation(docs):
    """Filter uninformative tokens out of every doc in a single pass.

    A token is dropped when it is a stop word, punctuation, whitespace, one of
    the honorifics ``Mr`` / ``Mrs`` / ``Miss`` / ``Ms``, or a single character.
    Whitespace is detected with ``token.is_space`` so that runs such as
    ``'\\n\\n'`` are removed too, not only the exact string ``'\\n'``.

    Args:
        docs: iterable of token sequences (spaCy ``Doc`` objects or lists of
            tokens exposing ``is_stop``, ``is_punct``, ``is_space`` and ``text``).

    Returns:
        A list with one list of surviving tokens per input doc, in the
        original token order.
    """
    # NOTE(review): matching is on the exact token text; if the tokenizer keeps
    # the period attached (e.g. 'Mr.'), such tokens are not caught here — confirm.
    excluded_texts = {'\n', 'Mr', 'Mrs', 'Miss', 'Ms'}
    cleaned_docs = []
    for doc in docs:
        cleaned_docs.append([
            token for token in doc
            if not token.is_stop
            and not token.is_punct
            and not token.is_space
            and token.text not in excluded_texts
            and len(token.text) > 1
        ])
    return cleaned_docs

# lowercase and lemmatise the tokens
def lowercase_and_lemmatise(docs):
    """Replace every token by its lower-cased lemma.

    Args:
        docs: iterable of token sequences whose items expose ``lemma_``.

    Returns:
        A list of lists of strings, one inner list per doc, preserving order.
    """
    return [[token.lemma_.lower() for token in doc] for doc in docs]

# join the tokens back together
def join_tokens(docs):
    """Turn each list of token strings back into one space-separated string.

    Args:
        docs: iterable of lists of strings.

    Returns:
        A list with one string per doc; an empty doc becomes ``''``.
    """
    joined = []
    for tokens in docs:
        joined.append(' '.join(tokens))
    return joined

# read the raw training documents and report how many were found
train_data, train_labels, train_files = load_data('data/train')
print(f'Number of training samples: {len(train_data)}')
# read the raw test documents and report how many were found
test_data, test_labels, test_files = load_data('data/test')
print(f'Number of test samples: {len(test_data)}')

# Push both splits through the same preparation stages, one stage at a time:
# truncate at the decision heading -> parse with spaCy -> drop stop words and
# punctuation -> lowercase + lemmatise -> join back into plain strings.
# After the loop train_data / test_data are lists of cleaned strings.
for _stage in (
    clean_data,
    preprocess_texts,
    remove_stopwords_punctuation,
    lowercase_and_lemmatise,
    join_tokens,
):
    train_data = _stage(train_data)
    test_data = _stage(test_data)

In [None]:
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Evaluate model and return metrics
def evaluate_model(model, test_data, test_labels, data_type='test'):
    """Score ``model`` on a labelled data set, print the metrics and return them.

    Args:
        model: fitted estimator exposing ``predict``.
        test_data: samples to predict on.
        test_labels: true labels for ``test_data``.
        data_type: name of the split, used only in the printed heading.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``
        and ``confusion_matrix``.
    """
    predicted = model.predict(test_data)

    results = {
        'accuracy': accuracy_score(test_labels, predicted),
        'precision': precision_score(test_labels, predicted),
        'recall': recall_score(test_labels, predicted),
        'f1_score': f1_score(test_labels, predicted),
        'confusion_matrix': confusion_matrix(test_labels, predicted),
    }

    # scalar metrics share one print format; the matrix goes on its own lines
    print(f'{data_type} data metrics:')
    for heading, key in (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1 Score', 'f1_score'),
    ):
        print(f'{heading}: {results[key]:.4f}')
    matrix = results['confusion_matrix']
    print(f'Confusion Matrix: \n{matrix}')

    return results


## TfidfVectorizer and Logistic Regression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# TF-IDF features feeding a logistic-regression classifier
logreg_text_clf_pipeline = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000, random_state=123),
)

# hyper-parameter grid; each key is "<pipeline step name>__<parameter>"
param_grid_logreg = dict(
    tfidfvectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidfvectorizer__max_df=[0.9, 0.95],
    tfidfvectorizer__min_df=[2, 5],
    logisticregression__C=[0.1, 1, 10],
)

# 5-fold cross-validated search over the whole grid, run in parallel (n_jobs=-1);
# fit() returns the fitted search object itself
grid_search_logreg = GridSearchCV(
    estimator=logreg_text_clf_pipeline,
    param_grid=param_grid_logreg,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(train_data, train_labels)

# best pipeline found by the search, and the parameters that produced it
best_logreg_model = grid_search_logreg.best_estimator_
print(grid_search_logreg.best_params_)

# metrics on the data the model was tuned on ...
train_metrics_logreg = evaluate_model(
    model=best_logreg_model,
    test_data=train_data,
    test_labels=train_labels,
    data_type='train',
)

# ... and on the held-out test data
test_metrics_logreg = evaluate_model(
    model=best_logreg_model,
    test_data=test_data,
    test_labels=test_labels,
    data_type='test',
)

## Tfidf vectorizer and SVC

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# TF-IDF features feeding a support-vector classifier;
# probability=True makes predict_proba available on the fitted model
svm_text_clf_pipeline = make_pipeline(
    TfidfVectorizer(),
    SVC(probability=True, random_state=123),
)

# Hyper-parameter grid; each key is "<pipeline step name>__<parameter>"
param_grid_svc = dict(
    tfidfvectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidfvectorizer__max_df=[0.9, 0.95],
    tfidfvectorizer__min_df=[2, 5],
    svc__C=[0.1, 1, 10],
)

# 5-fold cross-validated search over the whole grid, run in parallel (n_jobs=-1);
# fit() returns the fitted search object itself
grid_search_svc = GridSearchCV(
    estimator=svm_text_clf_pipeline,
    param_grid=param_grid_svc,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(train_data, train_labels)

# Best pipeline found by the search, and the parameters that produced it
best_svc_model = grid_search_svc.best_estimator_
print(grid_search_svc.best_params_)

# Metrics on the training split, a separator line, then the test split
train_metrics_svc = evaluate_model(
    model=best_svc_model,
    test_data=train_data,
    test_labels=train_labels,
    data_type='Train',
)
print('-'*50)
test_metrics_svc = evaluate_model(
    model=best_svc_model,
    test_data=test_data,
    test_labels=test_labels,
    data_type='Test',
)


## Tfidf vectorizer and RandomForest

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# TF-IDF features feeding a random-forest classifier
rf_text_clf_pipeline = make_pipeline(
    TfidfVectorizer(decode_error='ignore'),
    RandomForestClassifier(n_jobs=-1, random_state=123),
)

# Hyper-parameter grid; only the vectorizer is tuned here, the forest keeps
# its constructor settings. Each key is "<pipeline step name>__<parameter>"
param_grid_rf = dict(
    tfidfvectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidfvectorizer__max_df=[0.9, 0.95],
    tfidfvectorizer__min_df=[2, 5],
)

# 5-fold cross-validated search over the whole grid, run in parallel (n_jobs=-1);
# fit() returns the fitted search object itself
grid_search_rf = GridSearchCV(
    estimator=rf_text_clf_pipeline,
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(train_data, train_labels)

# Best pipeline found by the search, and the parameters that produced it
best_rf_model = grid_search_rf.best_estimator_
print(grid_search_rf.best_params_)

# Metrics on the training split, a separator line, then the test split
train_metrics_rf = evaluate_model(
    model=best_rf_model,
    test_data=train_data,
    test_labels=train_labels,
    data_type='Train',
)
print('-'*50)
test_metrics_rf = evaluate_model(
    model=best_rf_model,
    test_data=test_data,
    test_labels=test_labels,
    data_type='Test',
)


## ROC curve

In [None]:
# create a function to plot the roc curve

from sklearn.metrics import roc_curve, auc

# function returns fpr, tpr and auc
def get_roc_curve(model, test_data, test_labels):
    """Compute the ROC curve of ``model`` on a labelled data set.

    Args:
        model: fitted estimator exposing ``predict_proba``.
        test_data: samples to score.
        test_labels: true labels for ``test_data``.

    Returns:
        Tuple ``(fpr, tpr, auc_score)``: false-positive rates, true-positive
        rates and the area under the resulting curve.
    """
    # column 1 of predict_proba is used as the score handed to roc_curve
    positive_scores = model.predict_proba(test_data)[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(test_labels, positive_scores)
    return false_pos_rate, true_pos_rate, auc(false_pos_rate, true_pos_rate)

# plot the roc curve
def plot_roc_curve(models, model_names, test_data, test_labels, data_type='test'):
    """Draw the ROC curves of several models on one figure, save it and show it.

    Args:
        models: fitted estimators, each usable with ``get_roc_curve``.
        model_names: legend labels, paired with ``models`` by position.
        test_data: samples to score.
        test_labels: true labels for ``test_data``.
        data_type: name of the split; used in the title and in the file name.

    Side effects:
        Creates the ``plots`` directory if needed and writes
        ``plots/roc_curve_<data_type>.png`` at 300 dpi.
    """
    # explicit figure/axes instead of the implicit pyplot "current figure"
    fig, ax = plt.subplots(figsize=(10, 6))
    for model, model_name in zip(models, model_names):
        fpr, tpr, auc_score = get_roc_curve(model, test_data, test_labels)
        ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_score:.2f})')
    # diagonal reference line, labelled 'Random' in the legend
    ax.plot([0, 1], [0, 1], 'k--', label='Random')
    # ratio set to 'equal' to ensure the aspect ratio is equal
    ax.set_aspect('equal', adjustable='box')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve ({data_type} data)')
    ax.legend()
    # save the plot; exist_ok avoids the check-then-create race of
    # "if not exists: makedirs"
    os.makedirs('plots', exist_ok=True)
    fig.savefig(f'plots/roc_curve_{data_type}.png', dpi=300)
    plt.show()

# compare the three tuned models, first on the training split, then on the test split
models = [best_logreg_model, best_svc_model, best_rf_model]
model_names = ['Logistic Regression', 'SVC', 'Random Forest']
for _split, _texts, _labels in (
    ('train', train_data, train_labels),
    ('test', test_data, test_labels),
):
    plot_roc_curve(models, model_names, _texts, _labels, data_type=_split)


In [None]:
## Feature Importance
# Fitted steps of the best random-forest pipeline, looked up by step name
_rf_steps = best_rf_model.named_steps

# Feature names from the TfidfVectorizer step
feature_names = _rf_steps['tfidfvectorizer'].get_feature_names_out()

# Matching importances from the RandomForestClassifier step
feature_importances = _rf_steps['randomforestclassifier'].feature_importances_

# One row per feature: its name and its importance
feature_importance_df = pd.DataFrame(
    dict(feature=feature_names, importance=feature_importances)
)

# The 20 rows with the highest importance, largest first
top_20_features = (
    feature_importance_df
    .sort_values('importance', ascending=False)
    .head(20)
)

# Plot the top 20 features
def plot_feature_importance(feature_df, title):
    """Draw a horizontal bar chart of feature importances, save it and show it.

    Args:
        feature_df: DataFrame with a ``feature`` column (bar labels) and an
            ``importance`` column (bar lengths); rows are drawn in the order given.
        title: figure title; also used verbatim as the PNG file name.

    Side effects:
        Creates the ``plots`` directory if needed and writes
        ``plots/<title>.png`` at 300 dpi.
    """
    # explicit figure/axes instead of the implicit pyplot "current figure"
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='importance', y='feature', data=feature_df, palette='viridis', ax=ax)
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    ax.set_title(title)
    fig.tight_layout()
    # save the plot; exist_ok avoids the check-then-create race of
    # "if not exists: makedirs"
    os.makedirs('plots', exist_ok=True)
    fig.savefig(f'plots/{title}.png', dpi=300)
    plt.show()

# draw and save the bar chart for the 20 most important random-forest features
plot_feature_importance(
    feature_df=top_20_features,
    title='Top 20 Features - Random Forest',
)