<a href="https://colab.research.google.com/github/GeraintWong/f20aa_cw2/blob/main/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
train_df = pd.read_csv('train_processed.csv')

In [10]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [3]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

def tokenize_text(text):
    return word_tokenize(str(text))

train_df['Text_Processed_Tokens'] = train_df['Text_Processed'].apply(tokenize_text)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [13]:
# Used to pass into N-gram and Vector Space
X = train_df['Text_Processed'].tolist()
y = [score - 1 for score in train_df['Score'].tolist()]

# Used to pass into word embeddings
X_token = train_df['Text_Processed_Tokens']

# Splitting the dataset to 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifiers1 = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Naïve Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier()
}

param_grids = {
    "Logistic Regression": {
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['liblinear']
    },
    "Decision Tree": {
        'classifier__max_depth': [5, 10, 20],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],  # Minimum samples required at each leaf node
        'classifier__criterion': ['gini', 'entropy']  # The function to measure quality of a split
    },
    "Naïve Bayes": {
        'classifier__priors': [None, [0.5, 0.5]],  # Example for prior probabilities
        'classifier__var_smoothing': [1e-9, 1e-8]  # Tuning the variance smoothing parameter
    }
}

results = []

In [8]:
import gensim.downloader as api
word2vec = api.load("word2vec-google-news-300")
glove_vectors = api.load("glove-twitter-25")


In [14]:
# Function to convert text to word embeddings
def get_embedding(tokens, model):
    """
    Generates an average word embedding for a list of tokens.
    """
    word_vectors = [model[word] for word in tokens if word in model]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# Convert text data to embeddings
X_word2vec = np.array([get_embedding(text, word2vec) for text in X_token])
X_glove = np.array([get_embedding(text, glove_vectors) for text in X_token])

subset_fraction = 0.1  # 10% of the data for experimentation

# Train-test split for word embeddings
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_word2vec, y, test_size=0.2, random_state=42)
X_train_glove, X_test_glove, y_train_glove, y_test_glove = train_test_split(X_glove, y, test_size=0.2, random_state=42)

# Selecting the subset of data (10% of the training data)
X_train_w2v_subset = X_train_w2v[:int(subset_fraction * len(X_train_w2v))]
y_train_w2v_subset = y_train_w2v[:int(subset_fraction * len(y_train_w2v))]

X_train_glove_subset = X_train_glove[:int(subset_fraction * len(X_train_glove))]
y_train_glove_subset = y_train_glove[:int(subset_fraction * len(y_train_glove))]

# Results container
results = []

# Test classifiers on word embeddings
for clf_name, clf in classifiers1.items():
    print(f"\n=== {clf_name} ===")

    # Word2Vec
    pipeline_w2v = Pipeline([
        ('classifier', clf)
    ])
    param_grid_w2v = param_grids.get(clf_name, {})

    if param_grid_w2v:
        search_w2v = RandomizedSearchCV(
            pipeline_w2v,
            param_distributions=param_grid_w2v,
            n_iter=5,
            cv=3,
            scoring='accuracy',
            n_jobs=-1,
            verbose=0
        )
        search_w2v.fit(X_train_w2v_subset, y_train_w2v_subset)  # Use the subset for fitting
        best_model_w2v = search_w2v.best_estimator_
        print(f"Best Parameters for Word2Vec: {search_w2v.best_params_}")  # Show best hyperparameters
    else:
        pipeline_w2v.fit(X_train_w2v_subset, y_train_w2v_subset)  # Use the subset for fitting
        best_model_w2v = pipeline_w2v

    y_pred_w2v = best_model_w2v.predict(X_test_w2v)
    accuracy_w2v = accuracy_score(y_test_w2v, y_pred_w2v)
    precision_w2v = precision_score(y_test_w2v, y_pred_w2v, average='weighted', zero_division=1)
    recall_w2v = recall_score(y_test_w2v, y_pred_w2v, average='weighted', zero_division=1)
    f1_w2v = f1_score(y_test_w2v, y_pred_w2v, average='weighted', zero_division=1)

    print("Word2Vec")
    print(classification_report(y_test_w2v, y_pred_w2v, zero_division=1))

    results.append(["Word Embedding", clf_name, "Word2Vec", accuracy_w2v, precision_w2v, recall_w2v, f1_w2v])

    # GloVe
    pipeline_glove = Pipeline([
        ('classifier', clf)
    ])
    param_grid_glove = param_grids.get(clf_name, {})

    if param_grid_glove:
        search_glove = RandomizedSearchCV(
            pipeline_glove,
            param_distributions=param_grid_glove,
            n_iter=5,
            cv=3,
            scoring='accuracy',
            n_jobs=-1,
            verbose=0
        )
        search_glove.fit(X_train_glove_subset, y_train_glove_subset)  # Use the subset for fitting
        best_model_glove = search_glove.best_estimator_
        print(f"Best Parameters for GloVe: {search_glove.best_params_}")  # Show best hyperparameters
    else:
        pipeline_glove.fit(X_train_glove_subset, y_train_glove_subset)  # Use the subset for fitting
        best_model_glove = pipeline_glove

    y_pred_glove = best_model_glove.predict(X_test_glove)
    accuracy_glove = accuracy_score(y_test_glove, y_pred_glove)
    precision_glove = precision_score(y_test_glove, y_pred_glove, average='weighted', zero_division=1)
    recall_glove = recall_score(y_test_glove, y_pred_glove, average='weighted', zero_division=1)
    f1_glove = f1_score(y_test_glove, y_pred_glove, average='weighted', zero_division=1)

    print("GloVe")
    print(classification_report(y_test_glove, y_pred_glove, zero_division=1))

    results.append(["Word Embedding", clf_name, "GloVe", accuracy_glove, precision_glove, recall_glove, f1_glove])



=== Logistic Regression ===




Best Parameters for Word2Vec: {'classifier__solver': 'liblinear', 'classifier__C': 10}
Word2Vec
              precision    recall  f1-score   support

           0       0.57      0.45      0.50      5644
           1       0.27      0.03      0.05      3214
           2       0.33      0.08      0.13      4679
           3       0.35      0.09      0.15      8688
           4       0.71      0.97      0.82     39602

    accuracy                           0.68     61827
   macro avg       0.45      0.32      0.33     61827
weighted avg       0.60      0.68      0.60     61827





Best Parameters for GloVe: {'classifier__solver': 'liblinear', 'classifier__C': 10}
GloVe
              precision    recall  f1-score   support

           0       0.42      0.15      0.22      5644
           1       0.43      0.00      0.00      3214
           2       0.19      0.01      0.01      4679
           3       0.22      0.01      0.01      8688
           4       0.66      0.98      0.79     39602

    accuracy                           0.65     61827
   macro avg       0.38      0.23      0.21     61827
weighted avg       0.53      0.65      0.53     61827


=== Naïve Bayes ===


6 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wr

Best Parameters for Word2Vec: {'classifier__var_smoothing': 1e-09, 'classifier__priors': None}
Word2Vec
              precision    recall  f1-score   support

           0       0.25      0.51      0.33      5644
           1       0.11      0.25      0.16      3214
           2       0.16      0.26      0.20      4679
           3       0.19      0.26      0.22      8688
           4       0.81      0.48      0.60     39602

    accuracy                           0.42     61827
   macro avg       0.30      0.35      0.30     61827
weighted avg       0.59      0.42      0.47     61827



6 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wr

Best Parameters for GloVe: {'classifier__var_smoothing': 1e-09, 'classifier__priors': None}
GloVe
              precision    recall  f1-score   support

           0       0.26      0.39      0.31      5644
           1       0.13      0.05      0.07      3214
           2       0.18      0.05      0.08      4679
           3       0.25      0.04      0.07      8688
           4       0.70      0.87      0.78     39602

    accuracy                           0.61     61827
   macro avg       0.30      0.28      0.26     61827
weighted avg       0.53      0.61      0.55     61827


=== Decision Tree ===
Best Parameters for Word2Vec: {'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 5, 'classifier__criterion': 'gini'}
Word2Vec
              precision    recall  f1-score   support

           0       0.32      0.05      0.08      5644
           1       1.00      0.00      0.00      3214
           2       1.00      0.00      0.00      4679
 

In [15]:
clf = LogisticRegression(solver='liblinear', C=10, max_iter=500)

In [16]:
pipeline_full = Pipeline([
    ('classifier', clf)
])

# Train the model on the entire training dataset
pipeline_full.fit(X_train_w2v, y_train_w2v)

# Predict on the test set
y_pred_full = pipeline_full.predict(X_test_w2v)

# Evaluate the model on the test set
accuracy_full = accuracy_score(y_test_w2v, y_pred_full)
print(f"Accuracy: {accuracy_full:.4f}")

# Print the classification report for more details
print("\nClassification Report:")
print(classification_report(y_test_w2v, y_pred_full, zero_division=1))


Accuracy: 0.6833

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.48      0.52      5644
           1       0.37      0.02      0.04      3214
           2       0.35      0.08      0.13      4679
           3       0.36      0.09      0.14      8688
           4       0.71      0.97      0.82     39602

    accuracy                           0.68     61827
   macro avg       0.47      0.33      0.33     61827
weighted avg       0.61      0.68      0.61     61827

