<a href="https://colab.research.google.com/github/KayvanShah1/usc-csci-544-assignments-hw/blob/main/hw3/CSCI544_HW3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

## Install

In [8]:
!pip install contractions
!pip install ipython-autotime

time: 33.1 s (started: 2023-10-14 05:54:27 +00:00)


## Imports

In [69]:
import os
import re
import unicodedata

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

import contractions

import gensim
import gensim.downloader as api
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC

import torch
from torch.utils.data import Dataset, DataLoader

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 457 ms (started: 2023-10-14 08:28:45 +00:00)


# Config

In [86]:
# Switch the working directory so the relative data/model paths in the config
# classes below resolve against the homework folder.
# NOTE(review): this absolute path looks like a mounted Google Drive in Colab —
# adjust it when running anywhere else.
os.chdir("/content/drive/MyDrive/Colab Notebooks/CSCI544/HW3")

class DatasetConfig:
    """Constants controlling how the review dataset is sampled, split and cached."""

    # Seed passed as `random_state` to the per-class sampling and the train/test split.
    RANDOM_STATE = 34
    # Fraction of the balanced dataset held out as the test set.
    TEST_SPLIT = 0.2
    # Number of reviews drawn for each sentiment class.
    N_SAMPLES_EACH_CLASS = 50000
    # Raw reviews file (read as tab-separated), relative to the working directory.
    DATA_PATH = "amazon_reviews_us_Office_Products_v1_00.tsv.gz"
    # CSV the cleaned, balanced dataset is written to and read back from.
    PROCESSED_DATA_PATH = "amazon_review_preprocessed_sentiment_analysis.csv"


class Word2VecConfig:
    """Constants for the pretrained and the custom-trained Word2Vec embeddings."""

    # Model name handed to the gensim downloader for the pretrained vectors.
    PRETRAINED_MODEL = "word2vec-google-news-300"
    # Local copy of the pretrained vectors; its existence is checked before downloading.
    PRETRAINED_MODEL_SAVE_PATH = f"./{PRETRAINED_MODEL}/{PRETRAINED_MODEL}.gz"
    # `window` argument used when training the custom Word2Vec model.
    WINDOW_SIZE = 13
    # NOTE: despite the name, this value is passed as `vector_size` (the embedding
    # dimension) when the custom model is trained; it is not a sequence length.
    MAX_LENGTH = 300
    # `min_count` argument used when training the custom Word2Vec model.
    MIN_WORD_COUNT = 9
    # File the custom-trained model is saved to and later loaded from.
    CUSTOM_MODEL_PATH = "word2vec-custom.model"

time: 2.07 ms (started: 2023-10-14 08:52:55 +00:00)


# Dataset Preparation

## Read and Process

In [33]:
class LoadData:
    """Reader for the raw, tab-separated reviews file."""

    @staticmethod
    def load_data(path):
        """Read the review text and rating columns from a tab-separated file.

        Args:
            path: Location of the TSV file to read.

        Returns:
            DataFrame restricted to the ``review_headline``, ``review_body``
            and ``star_rating`` columns. Lines pandas reports as bad are
            skipped instead of raising.
        """
        wanted_columns = ["review_headline", "review_body", "star_rating"]
        return pd.read_csv(
            path,
            sep="\t",
            usecols=wanted_columns,
            on_bad_lines="skip",
            memory_map=True,
        )


class ProcessData:
    """Stateless helpers that turn the raw review frame into a balanced, labelled sample.

    None of the helpers modify the DataFrame they receive; each one returns a
    new frame, so re-running a notebook cell never sees half-processed input.
    """

    @staticmethod
    def filter_columns(df):
        """Return a new frame holding only ``review_body`` and ``star_rating``."""
        return df.loc[:, ["review_body", "star_rating"]].copy()

    @staticmethod
    def convert_star_rating(df):
        """Coerce ``star_rating`` to numeric and drop rows whose rating is not a number.

        Args:
            df: Frame with a ``star_rating`` column (strings or numbers).

        Returns:
            New frame with a numeric ``star_rating``; rows whose rating could
            not be parsed are removed. The original index labels are kept.
        """
        numeric_rating = pd.to_numeric(df["star_rating"], errors="coerce")
        return df.assign(star_rating=numeric_rating).dropna(subset=["star_rating"])

    @staticmethod
    def classify_sentiment(df):
        """Return a new frame with a ``sentiment`` column added.

        Ratings less than or equal to 3 are labelled 1, every other rating 2.
        """
        sentiment = np.where(df["star_rating"] <= 3, 1, 2)
        return df.assign(sentiment=sentiment)

    @staticmethod
    def sample_data(df, n_samples, random_state):
        """Draw ``n_samples`` rows per sentiment class and shuffle the result.

        Args:
            df: Frame with ``sentiment`` (1 or 2) and ``star_rating`` columns.
            n_samples: Number of rows to draw from each class.
            random_state: Seed used for both the per-class draw and the shuffle.

        Returns:
            Shuffled frame with a fresh 0..N-1 index and ``star_rating`` removed.

        Raises:
            ValueError: Propagated from ``DataFrame.sample`` when a class has
                fewer than ``n_samples`` rows.
        """
        per_class = [
            df.loc[df["sentiment"] == label].sample(n=n_samples, random_state=random_state)
            for label in (1, 2)
        ]
        shuffled = pd.concat(per_class, ignore_index=True).sample(
            frac=1, random_state=random_state, ignore_index=True
        )
        return shuffled.drop(columns=["star_rating"])


class CleanText:
    """Text-normalisation helpers applied to raw review bodies."""

    # Patterns are compiled once when the class is created.
    _EMAIL_RE = re.compile(r"[a-zA-Z0-9_\-\.]+@[a-zA-Z0-9_\-\.]+\.[a-zA-Z]{2,5}")
    _URL_RE = re.compile(r"\bhttps?:\/\/\S+|www\.\S+")
    _HTML_TAG_RE = re.compile(r"<.*?>")
    _PUNCTUATION_RE = re.compile(r"([?.!,¿])")
    # The character class holds both the double quote and the space, so runs
    # of either are collapsed into one space (double quotes disappear too).
    _QUOTE_OR_SPACE_RUN_RE = re.compile(r'[" "]+')

    @staticmethod
    def unicode_to_ascii(s):
        """Strip combining marks (category ``Mn``) after NFD decomposition."""
        decomposed = unicodedata.normalize("NFD", s)
        kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
        return "".join(kept_chars)

    @staticmethod
    def expand_contractions(text):
        """Expand contractions via ``contractions.fix`` (e.g. wouldn't => would not)."""
        return contractions.fix(text)

    @staticmethod
    def remove_email_addresses(text):
        """Delete every substring that matches the e-mail address pattern."""
        return CleanText._EMAIL_RE.sub("", text)

    @staticmethod
    def remove_urls(text):
        """Delete ``http(s)://...`` and ``www....`` substrings."""
        return CleanText._URL_RE.sub("", text)

    @staticmethod
    def remove_html_tags(text):
        """Delete every ``<...>`` span (shortest match, within a single line)."""
        return CleanText._HTML_TAG_RE.sub("", text)

    @staticmethod
    def clean_text(text):
        """Normalise one review string.

        Pipeline: lowercase and trim, strip combining marks, remove HTML tags,
        expand contractions, pad ``? . ! , ¿`` with spaces, collapse runs of
        spaces/double quotes, trim again. E-mail and URL removal,
        non-alphabetic filtering and extra-space removal exist as helpers or
        ideas but are intentionally not applied here.
        """
        normalised = CleanText.unicode_to_ascii(text.lower().strip())
        without_tags = CleanText.remove_html_tags(normalised)
        expanded = CleanText.expand_contractions(without_tags)

        # Put a space on both sides of the selected punctuation marks ...
        padded = CleanText._PUNCTUATION_RE.sub(r" \1 ", expanded)
        # ... then squeeze the resulting runs of spaces (and double quotes).
        squeezed = CleanText._QUOTE_OR_SPACE_RUN_RE.sub(" ", padded)

        return squeezed.strip()


class PreprocessText:
    """Stop-word removal and lemmatisation helpers for cleaned review text."""

    # Shared lemmatizer instance, created once when the class is defined.
    lemmatizer = WordNetLemmatizer()

    @staticmethod
    def get_stopwords_pattern():
        """Compile a regex matching English stop words, keeping negations.

        Negation words carry sentiment, so they are excluded from the stop-word
        list before the pattern is built.

        Returns:
            Compiled pattern matching any remaining stop word as a whole word,
            together with the whitespace that follows it.
        """
        negation_words = {"no", "not", "nor", "neither", "none", "never", "nobody", "nowhere"}
        custom_stopwords = set(stopwords.words("english")) - negation_words

        # Sort (longest first) so the pattern is deterministic across runs, and
        # escape each word so it is always matched literally.
        ordered_words = sorted(custom_stopwords, key=lambda word: (-len(word), word))
        alternation = "|".join(re.escape(word) for word in ordered_words)
        return re.compile(r"\b(" + alternation + r")\b\s*")

    @staticmethod
    def pos_tagger(tag):
        """Map a POS tag to the matching WordNet constant by its first letter.

        Returns ``None`` when the tag starts with none of ``J``, ``V``, ``N``, ``R``.
        """
        if tag.startswith("J"):
            return wordnet.ADJ
        elif tag.startswith("V"):
            return wordnet.VERB
        elif tag.startswith("N"):
            return wordnet.NOUN
        elif tag.startswith("R"):
            return wordnet.ADV
        else:
            return None

    @staticmethod
    def lemmatize_text_using_pos_tags(text):
        """Lemmatise ``text`` token by token, using each token's POS tag when it maps to WordNet.

        Tokens whose tag has no WordNet equivalent are kept unchanged.
        """
        lemmas = []
        for token, treebank_tag in nltk.pos_tag(word_tokenize(text)):
            wordnet_pos = PreprocessText.pos_tagger(treebank_tag)
            if wordnet_pos:
                lemmas.append(PreprocessText.lemmatizer.lemmatize(token, wordnet_pos))
            else:
                lemmas.append(token)
        return " ".join(lemmas)

    @staticmethod
    def lemmatize_text(text):
        """Lemmatise every token of ``text`` with the lemmatizer's default POS."""
        return " ".join(
            PreprocessText.lemmatizer.lemmatize(token) for token in word_tokenize(text)
        )

    # Inside the class body the name is still bound to the staticmethod wrapper,
    # which is only directly callable on Python 3.10+. Going through __func__
    # works on every Python 3 version.
    pattern = get_stopwords_pattern.__func__()

    @staticmethod
    def preprocess_text(text):
        """Return ``text`` unchanged.

        Stop-word removal and lemmatisation are currently disabled; the two
        commented lines below re-enable them.
        """
        # replacing all the stopwords
        # text = PreprocessText.pattern.sub("", text)
        # text = PreprocessText.lemmatize_text(text)
        return text


# np.vectorize wrappers around the per-string cleaning / preprocessing entry
# points, so they can be called on array-like inputs as well as single values.
clean_text_vect = np.vectorize(CleanText.clean_text)
preprocess_text_vect = np.vectorize(PreprocessText.preprocess_text)


def clean_and_process_data(path):
    """Build the balanced, cleaned review dataset from the raw file at ``path``.

    Steps: load the file, keep the review body and rating, coerce ratings to
    numbers, label sentiment, draw ``DatasetConfig.N_SAMPLES_EACH_CLASS`` rows
    per class, clean the review text and drop reviews that end up empty. The
    stop-word / lemmatisation preprocessing step is not applied.

    Args:
        path: Location of the raw tab-separated reviews file.

    Returns:
        DataFrame with the cleaned ``review_body`` and the ``sentiment`` label.
    """
    labelled_df = ProcessData.classify_sentiment(
        ProcessData.convert_star_rating(
            ProcessData.filter_columns(LoadData.load_data(path))
        )
    )

    balanced_df = ProcessData.sample_data(
        labelled_df,
        n_samples=DatasetConfig.N_SAMPLES_EACH_CLASS,
        random_state=DatasetConfig.RANDOM_STATE,
    )

    # Force every review body to a string before cleaning it.
    review_text = balanced_df["review_body"].astype(str)
    balanced_df["review_body"] = review_text.apply(clean_text_vect)

    # Keep only reviews that still contain text after cleaning.
    has_text = balanced_df["review_body"].str.strip() != ""
    return balanced_df.loc[has_text]


def get_reviews_dataset(new=False):
    """Return the balanced review dataset, rebuilding the cached CSV when needed.

    Args:
        new: When True, always re-process the raw data and overwrite the cache.

    Returns:
        The frame produced by ``clean_and_process_data`` when the dataset is
        (re)built, otherwise the frame read from
        ``DatasetConfig.PROCESSED_DATA_PATH``.
    """
    # The processed CSV is the cache, so its presence (not the raw file's)
    # decides whether processing can be skipped.
    cache_path = DatasetConfig.PROCESSED_DATA_PATH

    if new or not os.path.exists(cache_path):
        balanced_df = clean_and_process_data(DatasetConfig.DATA_PATH)
        balanced_df.to_csv(cache_path, index=False)
    else:
        balanced_df = pd.read_csv(cache_path)
    return balanced_df

time: 4.94 ms (started: 2023-10-14 07:04:57 +00:00)


In [77]:
# Fetch the dataset without forcing a rebuild (new=False).
balanced_df = get_reviews_dataset(new=False)
# Remove rows with missing values.
# NOTE(review): presumably a few review bodies come back as NaN when the CSV is
# re-read — confirm against the cached file.
balanced_df.dropna(inplace=True)
print("Total Records:", balanced_df.shape)
balanced_df.head(10)

Total Records: (99998, 2)


Unnamed: 0,review_body,sentiment
0,i set up a photo booth at my sister's wedding ...,2
1,"like everyone else , i like saving money , so ...",1
2,the pen is perfect what i want ! however the i...,2
3,i think they are too expensive for their quali...,1
4,"black is working wonderfully , and both cartri...",1
5,i have problems with the moveable tab ! it see...,1
6,this printer sucks ! it started out working wo...,1
7,the ink on these cartridges leak . i would ret...,1
8,it gets points for working as designed; from w...,2
9,"i ordered these and they work just fine , but ...",1


time: 963 ms (started: 2023-10-14 08:38:41 +00:00)


## Create a PyTorch Dataset

In [49]:
class AmazonReviewsSentimentDataset(Dataset):
    """Torch dataset yielding one mean-pooled word-embedding vector per review.

    Args:
        df: DataFrame with a ``review_body`` text column and a ``sentiment``
            label column.
        word2vec_model: Keyed-vector style object supporting ``word in model``
            and ``model[word]`` and exposing ``vector_size`` (for example a
            gensim ``KeyedVectors`` instance).
        tokenizer: Optional callable turning a string into a list of tokens.
            Defaults to NLTK's ``word_tokenize``.
    """

    def __init__(self, df, word2vec_model, tokenizer=None):
        self.data = df
        self.word2vec_model = word2vec_model
        self.tokenizer = tokenizer if tokenizer is not None else word_tokenize

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pooled embedding and label of the review at position ``idx``.

        Returns:
            dict with ``"embeddings"`` (float32 tensor of length
            ``vector_size``) and ``"target"`` (long tensor holding the label).

        Raises:
            IndexError: If ``idx`` is past the end of the dataset.
        """
        # len(self) must be called; comparing against the bound method
        # self.__len__ raises TypeError.
        if idx >= len(self):
            raise IndexError(f"index {idx} is out of range for dataset of size {len(self)}")

        row = self.data.iloc[idx]
        tokens = self.tokenizer(row["review_body"])
        label = int(row["sentiment"])

        # Retrieve word embeddings for the tokens the model knows about.
        vectors = [self.word2vec_model[token] for token in tokens if token in self.word2vec_model]
        if vectors:
            pooled = np.mean(vectors, axis=0)
        else:
            # No known token: fall back to a zero vector so every item has the
            # same shape (the mean of an empty list would be a NaN scalar).
            pooled = np.zeros(self.word2vec_model.vector_size, dtype=np.float32)

        return {
            # Embeddings are real-valued; casting to long would truncate them.
            "embeddings": torch.tensor(pooled, dtype=torch.float32),
            "target": torch.tensor(label, dtype=torch.long),
        }

time: 1.42 ms (started: 2023-10-14 07:57:07 +00:00)


## Train and Test Splits

In [79]:
# Hold out DatasetConfig.TEST_SPLIT of the reviews as the test set. The split is
# seeded with DatasetConfig.RANDOM_STATE and stratified on the sentiment label.
train_df, test_df = train_test_split(
    balanced_df,
    test_size=DatasetConfig.TEST_SPLIT,
    random_state=DatasetConfig.RANDOM_STATE,
    stratify=balanced_df["sentiment"]
)

time: 93.2 ms (started: 2023-10-14 08:39:00 +00:00)


# Word Embedding
- `api.load()` was run once, and the downloaded model was then copied from its temporary path to the local drive so that it loads into memory faster on later runs.

### References:
1. [Faster way to load word2vec model](https://github.com/RaRe-Technologies/gensim/issues/2642)
2. [Tutorial](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py)

## Download & Save Word2Vec pretrained model

In [16]:
# Copy model to current directory
# !mkdir word2vec-google-news-300
# !cp /root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz ./word2vec-google-news-300

time: 14.8 s (started: 2023-10-14 06:03:02 +00:00)


In [21]:
def load_pretrained_model():
    """Load the pretrained Word2Vec vectors as a gensim ``KeyedVectors`` object.

    The local copy at ``Word2VecConfig.PRETRAINED_MODEL_SAVE_PATH`` is used
    when it exists; otherwise the gensim downloader is asked for the path of
    the downloaded file and the vectors are loaded from there.

    Returns:
        The vectors loaded with ``KeyedVectors.load_word2vec_format``.
    """
    model_path = Word2VecConfig.PRETRAINED_MODEL_SAVE_PATH
    if not os.path.exists(model_path):
        # With return_path=True the downloader returns a file path, not a
        # model, so the vectors still have to be loaded from it below.
        model_path = api.load(Word2VecConfig.PRETRAINED_MODEL, return_path=True)

    return gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(
        model_path, binary=True
    )

# Load the pretrained vectors once per session; the recorded run of this cell
# took over a minute.
pretrained_model = load_pretrained_model()

time: 1min 17s (started: 2023-10-14 06:15:55 +00:00)


### Semantic similarity examples

In [44]:
# Probes run against the pretrained vectors, listed in display order.
#   ("analogy", positive_words, negative_words) -> prints the top `most_similar` word
#   ("similarity", word_a, word_b)              -> prints the similarity score
semantic_probes = [
    ("analogy", ["woman", "king"], ["man"]),      # Example 1: King - Man + Woman = Queen
    ("similarity", "excellent", "outstanding"),   # Example 2: excellent ~ outstanding
    ("analogy", ["Italy", "Paris"], ["France"]),  # Example 3: Paris - France + Italy = Milan
    ("analogy", ["Boat", "Car"], ["Wheel"]),      # Example 4: Car - Wheel + Boat = Yacht
    ("similarity", "Delicious", "Tasty"),         # Example 5: Delicious ~ Tasty
    ("similarity", "Computer", "Plant"),          # Example 6: Computer ~ Plant
    ("similarity", "Cat", "Dog"),                 # Example 7: Cat ~ Dog
]

for probe_kind, first_arg, second_arg in semantic_probes:
    if probe_kind == "analogy":
        result = pretrained_model.most_similar(positive=first_arg, negative=second_arg)
        print(f"Semantic Similarity: {result[0][0]}")
    else:
        result = pretrained_model.similarity(first_arg, second_arg)
        print(f"Semantic Similarity: {result}")

Semantic Similarity: queen
Semantic Similarity: 0.5567485690116882
Semantic Similarity: Milan
Semantic Similarity: Yacht
Semantic Similarity: 0.5718502402305603
Semantic Similarity: 0.04445184767246246
Semantic Similarity: 0.6061107516288757
time: 2.9 s (started: 2023-10-14 07:42:09 +00:00)


## Custom Word2Vec Embeddings Generation

In [80]:
# Tokenize every training review; the resulting token lists are the `sentences`
# the custom Word2Vec model is trained on. Only the training split is used.
sentences = train_df["review_body"].apply(word_tokenize)

time: 56.5 s (started: 2023-10-14 08:39:06 +00:00)


In [82]:
# Train Word2Vec model on the tokenized training reviews.
# Hyper-parameters come from Word2VecConfig; note that MAX_LENGTH is used as
# the embedding dimension (`vector_size`).
w2v_model_custom = Word2Vec(
    sentences=sentences,
    vector_size=Word2VecConfig.MAX_LENGTH,
    window=Word2VecConfig.WINDOW_SIZE,
    min_count=Word2VecConfig.MIN_WORD_COUNT
)

# Save the model so later cells can reload it from Word2VecConfig.CUSTOM_MODEL_PATH
w2v_model_custom.save(Word2VecConfig.CUSTOM_MODEL_PATH)

time: 1min 20s (started: 2023-10-14 08:42:08 +00:00)


### Test Custom Embeddings

In [83]:
# Load the custom model back from the file it was saved to
w2v_model_custom = Word2Vec.load(Word2VecConfig.CUSTOM_MODEL_PATH)

# Example 1: King - Man + Woman = Queen
# Prints the first (word, score) pair returned by most_similar.
res = w2v_model_custom.wv.most_similar(positive=['woman', 'king'], negative=['man'])
print(f"Semantic Similarity (Custom Model): {res[0]}")

# Example 2: excellent ~ outstanding
# Prints the similarity score between the two words.
res = w2v_model_custom.wv.similarity('excellent', 'outstanding')
print(f"Semantic Similarity (Custom Model): {res}")

Semantic Similarity (Custom Model): ('queen', 0.5496216416358948)
Semantic Similarity (Custom Model): 0.7910025715827942
time: 140 ms (started: 2023-10-14 08:43:50 +00:00)


# Simple Models

# Feedforward Neural Networks

# Recurrent Neural Networks

# End of File