In [None]:
import os
import re
import ast
from pathlib import Path
import glob
import json
import pickle
import random
from tqdm import tqdm
import itertools
from collections import Counter
from dataclasses import dataclass, field
import contextlib

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.corpus.reader import CorpusReader
from nltk.internals import deprecated
from nltk.probability import FreqDist
from nltk.util import binary_search_file as _binary_search_file
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from textblob import TextBlob

import matplotlib.pyplot as plt

import torch
import torch.optim
import torch.nn as nn
import torch.utils.data
import h5py
from gensim.models.keyedvectors import KeyedVectors


# GloVe

In [None]:
### Define classes and functions for GloVe
@dataclass
class Vocabulary:
    """Two-way mapping between tokens and integer indices, with occurrence counts.

    Looking up a token that is not in the vocabulary yields ``unk_token``
    (``-1``) instead of raising.
    """
    # token -> index
    token2index: dict = field(default_factory=dict)
    # index -> token (inverse of token2index)
    index2token: dict = field(default_factory=dict)
    # token_counts[index] is how many times the token at that index was added
    token_counts: list = field(default_factory=list)
    # index reported for tokens that are not part of the vocabulary
    _unk_token: int = field(init=False, default=-1)

    def add(self, token):
        """Register one occurrence of ``token``, assigning the next free index if it is new."""
        try:
            slot = self.token2index[token]
        except KeyError:
            slot = len(self)
            self.token2index[token] = slot
            self.index2token[slot] = token
            self.token_counts.append(0)
        self.token_counts[slot] += 1

    def get_topk_subset(self, k):
        """Return a new vocabulary holding the ``k`` most frequent tokens.

        Tokens are re-indexed from 0 in descending order of count; ties keep
        their original insertion order because the sort is stable.
        """
        ranked = list(self.token2index)
        ranked.sort(key=lambda tok: self.token_counts[self[tok]], reverse=True)
        forward, backward, counts = {}, {}, []
        for position, tok in enumerate(ranked[:k]):
            forward[tok] = position
            backward[position] = tok
            counts.append(self.token_counts[self.token2index[tok]])
        return type(self)(token2index=forward,
                          index2token=backward,
                          token_counts=counts)

    def shuffle(self):
        """Randomly permute the token indices in place, keeping counts aligned."""
        size = len(self)
        permutation = list(range(size))
        random.shuffle(permutation)
        reordered_counts = [None] * size
        # Iterate over a snapshot of the keys; self[tok] still returns the old
        # index here because each token is reassigned only once, below.
        for tok, target in zip(list(self.token2index.keys()), permutation):
            reordered_counts[target] = self.token_counts[self[tok]]
            self.token2index[tok] = target
            self.index2token[target] = tok
        self.token_counts = reordered_counts

    def get_index(self, token):
        """Return the index of ``token`` (``unk_token`` when it is unknown)."""
        return self[token]

    def get_token(self, index):
        """Return the token stored at ``index``; raise ``Exception`` if there is none."""
        if index in self.index2token:
            return self.index2token[index]
        raise Exception("Invalid index.")

    @property
    def unk_token(self):
        """Index used for out-of-vocabulary tokens."""
        return self._unk_token

    def __getitem__(self, token):
        """Map ``token`` to its index, falling back to ``unk_token``."""
        return self.token2index.get(token, self._unk_token)

    def __len__(self):
        """Number of distinct tokens in the vocabulary."""
        return len(self.token2index)


### Define Vectorizer
@dataclass
class Vectorizer:
    """Converts a sequence of tokens into vocabulary indices."""
    # Vocabulary used for the token -> index lookup
    vocab: Vocabulary

    @classmethod
    def from_corpus(cls, corpus, vocab_size):
        """Build a vectorizer from ``corpus``.

        Every token of ``corpus`` is counted, only the ``vocab_size`` most
        frequent tokens are kept, and their indices are then shuffled.
        """
        full_vocab = Vocabulary()
        for word in corpus:
            full_vocab.add(word)
        trimmed_vocab = full_vocab.get_topk_subset(vocab_size)
        trimmed_vocab.shuffle()
        return cls(trimmed_vocab)

    def vectorize(self, corpus):
        """Return the list of vocabulary indices for the tokens in ``corpus``.

        Tokens missing from the vocabulary map to the vocabulary's unknown index.
        """
        lookup = self.vocab
        return [lookup[word] for word in corpus]

### Define Cooccurrence Entries
@dataclass
class CooccurrenceEntries:
    """Computes distance-weighted co-occurrence counts and stores them in HDF5."""
    # Corpus as vocabulary indices; out-of-vocabulary tokens carry the vocab's unk_token
    vectorized_corpus: list
    # Vectorizer (and therefore vocabulary) that produced vectorized_corpus
    vectorizer: Vectorizer

    @classmethod
    def setup(cls, corpus, vectorizer):
        """Vectorize ``corpus`` with ``vectorizer`` and wrap the result."""
        return cls(vectorized_corpus=vectorizer.vectorize(corpus),
                   vectorizer=vectorizer)

    def validate_index(self, index, lower, upper):
        """Tell whether ``index`` is a usable token index.

        Returns False for the unknown-token index. When ``lower`` is negative
        no range check is applied; otherwise ``index`` must also lie in the
        inclusive range ``[lower, upper]``.
        """
        is_unk = index == self.vectorizer.vocab.unk_token
        if lower < 0:
            return not is_unk
        return not is_unk and index >= lower and index <= upper

    def _count_partition(self, window_size, index_lower, index_upper):
        """Count co-occurrences whose centre token index lies in [index_lower, index_upper]."""
        corpus = self.vectorized_corpus
        cooccurr_counts = Counter()
        for i in tqdm(range(len(corpus))):
            if not self.validate_index(corpus[i], index_lower, index_upper):
                continue
            context_lower = max(i - window_size, 0)
            context_upper = min(i + window_size + 1, len(corpus))
            for j in range(context_lower, context_upper):
                # Skip the centre token itself and unknown context tokens.
                if i == j or not self.validate_index(corpus[j], -1, -1):
                    continue
                # Closer context tokens contribute more (inverse distance).
                cooccurr_counts[(corpus[i], corpus[j])] += 1 / abs(i - j)
        return cooccurr_counts

    def build(self,
              window_size,
              num_partitions,
              chunk_size,
              output_directory=".",
              hdf5_directory = "."):
        """Compute co-occurrence counts and persist them together with the vocabulary.

        The vocabulary index range is split into partitions and the corpus is
        scanned once per partition, so only one partition's counts are held in
        memory at a time. Each partition's ``(i, j, count)`` rows are appended
        to the ``"cooccurrence"`` dataset of the HDF5 file.

        Args:
            window_size: number of tokens considered on each side of a centre token.
            num_partitions: requested number of vocabulary partitions (must be >= 1).
                The actual number can differ slightly because of integer division.
            chunk_size: HDF5 chunk length along the row axis.
            output_directory: despite its name, the *file path* the vocabulary
                is pickled to (opened with ``open(..., "wb")``).
            hdf5_directory: despite its name, the *file path* of the HDF5 file
                to create (any existing file is overwritten).

        Raises:
            ValueError: if ``num_partitions`` is smaller than 1.
        """
        if num_partitions < 1:
            raise ValueError("num_partitions must be a positive integer.")
        vocab_size = len(self.vectorizer.vocab)
        # Floor division yields 0 when the vocabulary is smaller than
        # num_partitions, which would make the loop below run forever.
        partition_step = max(vocab_size // num_partitions, 1)
        split_points = [0]
        while split_points[-1] + partition_step <= vocab_size:
            split_points.append(split_points[-1] + partition_step)
        # The last partition absorbs the remainder of the integer division.
        split_points[-1] = vocab_size
        # The context manager closes the file even if a partition fails, and the
        # dataset exists (possibly empty) even when there is nothing to write.
        with h5py.File(hdf5_directory, "w") as hdf5_file:
            dataset = hdf5_file.create_dataset("cooccurrence",
                                               (0, 3),
                                               maxshape=(None, 3),
                                               chunks=(chunk_size, 3))
            for partition_id in tqdm(range(len(split_points) - 1)):
                index_lower = split_points[partition_id]
                index_upper = split_points[partition_id + 1] - 1
                cooccurr_counts = self._count_partition(window_size,
                                                        index_lower,
                                                        index_upper)
                if not cooccurr_counts:
                    continue
                cooccurr_dataset = np.zeros((len(cooccurr_counts), 3))
                for row, ((i, j), cooccurr_count) in enumerate(cooccurr_counts.items()):
                    cooccurr_dataset[row] = (i, j, cooccurr_count)
                prev_len = dataset.len()
                dataset.resize(prev_len + len(cooccurr_counts), axis=0)
                dataset[prev_len: dataset.len()] = cooccurr_dataset
        with open(output_directory, "wb") as vocab_file:
            pickle.dump(self.vectorizer.vocab, vocab_file)

### Define Cooccurrence Dataset
@dataclass
class CooccurrenceDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing token-id entries with their co-occurrence counts.

    Both tensors are indexed along their first dimension; item ``k`` is
    ``[token_ids[k], cooccurr_counts[k]]``.
    """
    # Token indices; HDF5DataLoader passes the first two columns of each HDF5 row
    token_ids: torch.Tensor
    # Co-occurrence count for the entry at the same position in token_ids
    cooccurr_counts: torch.Tensor

    def __getitem__(self, index):
        """Return ``[token_ids[index], cooccurr_counts[index]]`` as a two-element list."""
        ids_at_index = self.token_ids[index]
        count_at_index = self.cooccurr_counts[index]
        return [ids_at_index, count_at_index]

    def __len__(self):
        """Number of entries, i.e. the size of ``token_ids`` along its first dimension."""
        return self.token_ids.shape[0]

### Define Calculate Cooccurrence
def calculate_cooccurrence(corpus, config):
    """Build the vocabulary for ``corpus`` and write its co-occurrence data to disk.

    ``config`` must provide ``vocab_size``, ``window_size``, ``num_partitions``,
    ``chunk_size``, ``cooccurrence_dir`` (passed to ``build`` as
    ``output_directory``) and ``hdf5_file`` (passed as ``hdf5_directory``).
    """
    # Fit the vocabulary on the corpus, keeping the most frequent tokens
    fitted_vectorizer = Vectorizer.from_corpus(corpus=corpus,
                                               vocab_size=config["vocab_size"])
    # Vectorize the corpus with that vocabulary
    entries = CooccurrenceEntries.setup(corpus=corpus,
                                        vectorizer=fitted_vectorizer)
    build_options = {
        "window_size": config["window_size"],
        "num_partitions": config["num_partitions"],
        "chunk_size": config["chunk_size"],
        "output_directory": config["cooccurrence_dir"],
        "hdf5_directory": config["hdf5_file"],
    }
    entries.build(**build_options)


### Define data loader
@dataclass
class HDF5DataLoader:
    """Streams shuffled mini-batches from a chunked HDF5 dataset.

    ``dataset`` is only set inside the ``open()`` context manager, so
    ``iter_batches()`` must be called within ``with loader.open():``.
    """
    # Path of the HDF5 file to read
    filepath: str
    # Name of the dataset inside the HDF5 file
    dataset_name: str
    # Number of rows per yielded batch
    batch_size: int
    # Device every batch tensor is moved to
    device: str
    # Handle to the open HDF5 dataset; assigned by open()
    dataset: h5py.Dataset = field(init=False)

    def iter_batches(self):
        """Yield ``[token_ids, cooccurr_counts]`` batches on ``self.device``.

        HDF5 chunks are visited in random order and the rows inside each chunk
        are shuffled again by the ``DataLoader``; only one chunk is loaded
        into memory at a time.
        """
        chunk_slices = list(self.dataset.iter_chunks())
        random.shuffle(chunk_slices)
        for chunk_slice in chunk_slices:
            rows = self.dataset[chunk_slice]
            # Columns 0-1 hold the token index pair, column 2 the count.
            pair_ids = torch.from_numpy(rows[:,:2]).long()
            pair_counts = torch.from_numpy(rows[:,2]).float()
            chunk_data = CooccurrenceDataset(token_ids=pair_ids,
                                             cooccurr_counts=pair_counts)
            chunk_loader = torch.utils.data.DataLoader(dataset=chunk_data,
                                                       batch_size=self.batch_size,
                                                       shuffle=True,
                                                       pin_memory=True)
            for batch in chunk_loader:
                yield [tensor.to(self.device) for tensor in batch]

    @contextlib.contextmanager
    def open(self):
        """Open the HDF5 file read-only and expose its dataset as ``self.dataset``.

        The file is closed when the ``with`` block exits.
        """
        with h5py.File(self.filepath, "r") as hdf5_handle:
            self.dataset = hdf5_handle[self.dataset_name]
            yield


### Define GloVe
class GloVe(nn.Module):
    """GloVe model: two embedding tables plus per-token biases, trained on co-occurrence counts.

    Args:
        vocab_size: number of rows in each embedding table and bias vector.
        embedding_size: dimensionality of the embeddings.
        x_max: co-occurrence count at which the loss weight saturates at 1.
        alpha: exponent of the weighting function.
    """
    def __init__(self, vocab_size, embedding_size, x_max, alpha):
        super().__init__()
        # Keep the hyperparameters on the module so the weighting function can
        # be a regular method; a lambda attribute would make the module
        # impossible to pickle.
        self.x_max = x_max
        self.alpha = alpha
        self.weight = nn.Embedding(num_embeddings=vocab_size,
                                   embedding_dim=embedding_size,
                                   sparse=True)
        self.weight_tilde = nn.Embedding(num_embeddings=vocab_size,
                                         embedding_dim=embedding_size,
                                         sparse=True)
        self.bias = nn.Parameter(torch.randn(vocab_size,
                                             dtype=torch.float,))
        self.bias_tilde = nn.Parameter(torch.randn(vocab_size,
                                                   dtype=torch.float,))

    def weighting_func(self, x):
        """Return ``(x / x_max) ** alpha`` clamped to ``[0, 1]``.

        ``float_power`` computes in double precision, so the result (and the
        loss built from it) is a float64 tensor.
        """
        return (x / self.x_max).float_power(self.alpha).clamp(0, 1)

    def forward(self, i, j, x):
        """Return the mean weighted squared error for a batch of co-occurrences.

        Args:
            i: index tensor selecting rows of ``weight`` / ``bias``.
            j: index tensor selecting rows of ``weight_tilde`` / ``bias_tilde``.
            x: co-occurrence counts for the ``(i, j)`` pairs; must be positive
                because the loss takes ``x.log()``.

        Returns:
            Scalar tensor: mean over the batch of
            ``weighting_func(x) * (w_i . w~_j + b_i + b~_j - log x) ** 2``.
        """
        loss = torch.mul(self.weight(i), self.weight_tilde(j)).sum(dim=1)
        loss = (loss + self.bias[i] + self.bias_tilde[j] - x.log()).square()
        loss = torch.mul(self.weighting_func(x), loss).mean()
        return loss

### Define train glove
def train_glove(config):
    """Train a GloVe model on the co-occurrence data described by ``config``.

    Reads the ``"cooccurrence"`` dataset from ``config["hdf5_file"]``, trains
    for ``config["num_epochs"]`` epochs with Adagrad, saves the model's
    ``state_dict`` to ``config["output_filepath"]`` after every epoch and
    finally plots the per-epoch loss (sum of the batch losses).
    """
    loader = HDF5DataLoader(filepath=config["hdf5_file"],
                            dataset_name="cooccurrence",
                            batch_size=config["batch_size"],
                            device=config["device"])
    glove = GloVe(vocab_size=config["vocab_size"],
                  embedding_size=config["embedding_size"],
                  x_max=config["x_max"],
                  alpha=config["alpha"])
    glove.to(config["device"])
    adagrad = torch.optim.Adagrad(glove.parameters())
    with loader.open():
        glove.train()
        loss_history = []
        for epoch in tqdm(range(config["num_epochs"])):
            running_loss = 0
            for token_pairs, counts in tqdm(loader.iter_batches()):
                # Column 0 is the centre token index, column 1 the context token index.
                batch_loss = glove(token_pairs[:, 0],
                                   token_pairs[:, 1],
                                   counts)
                running_loss += batch_loss.detach().item()
                batch_loss.backward()
                adagrad.step()
                adagrad.zero_grad()
            loss_history.append(running_loss)
            print(f"Epoch {epoch}: loss = {running_loss}")
            # Checkpoint after every epoch so an interrupted run keeps its progress.
            torch.save(glove.state_dict(), config["output_filepath"])
    plt.plot(loss_history)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()
        

# Open corpus_prepost json as dictionary

In [None]:
# Load the pre/post corpus and its GloVe configuration.
# `with` closes each file even if parsing fails; JSON text is read as UTF-8
# instead of relying on the platform's default encoding.
with open("corpus_prepost.json", encoding="utf-8") as f:
    result_dict = json.load(f)

with open("config_prepost.json", encoding="utf-8") as f:
    config = json.load(f)

# Train GloVe

In [None]:
# for key in result_dict.keys():
#     corpus = result_dict[key]["corpus"]
#     config["input_filepath"] = f"{key}.csv"
#     config["output_filepath"] = f"output/{key}.pkl"
#     config["vocab_size"] = result_dict[key]["length"]
#     config["chunk_size"] = result_dict[key]["length"]
#     config["cooccurrence_dir"] = f"cooccurrence_directory/{key}.pkl"
#     config["hdf5_file"] = f"hdf5_directory/{key}.hdf5"
#     # only train for files that haven't been trained yet
#     if not os.path.isfile(config["output_filepath"]):
#         calculate_cooccurrence(corpus, config)
#         train_glove(config)


# Open corpus_full json as dictionary

In [None]:
# Load the full corpus and its GloVe configuration.
# NOTE: this rebinds `result_dict` and `config` from the pre/post cell above.
# `with` closes each file even if parsing fails; JSON text is read as UTF-8
# instead of relying on the platform's default encoding.
with open("corpus_full.json", encoding="utf-8") as f:
    result_dict = json.load(f)

with open("config_full.json", encoding="utf-8") as f:
    config = json.load(f)

# Train GloVe

In [None]:
# for key in result_dict.keys():
#     corpus = result_dict[key]["corpus"]
#     config["input_filepath"] = f"{key}.csv"
#     config["output_filepath"] = f"output/{key}.pkl"
#     config["vocab_size"] = result_dict[key]["length"]
#     config["chunk_size"] = result_dict[key]["length"]
#     config["cooccurrence_dir"] = f"cooccurrence_directory/{key}.pkl"
#     config["hdf5_file"] = f"hdf5_directory/{key}.hdf5"
#     # only train for files that haven't been trained yet
#     if not os.path.isfile(config["output_filepath"]):
#         calculate_cooccurrence(corpus, config)
#         train_glove(config)


# Open corpus_asian json as dictionary

In [None]:
# Load the Asian corpus and its GloVe configuration.
# NOTE: this rebinds `config` from the earlier loading cells.
# `with` closes each file even if parsing fails; JSON text is read as UTF-8
# instead of relying on the platform's default encoding.
with open("corpus_asian.json", encoding="utf-8") as f:
    result_dict_asian = json.load(f)

with open("config_asian.json", encoding="utf-8") as f:
    config = json.load(f)

# Train GloVe

In [None]:
# for key in result_dict_asian.keys():
#     corpus = result_dict_asian[key]["corpus"]
#     config["input_filepath"] = f"{key}.csv"
#     config["output_filepath"] = f"output/{key}_asian.pkl"
#     config["vocab_size"] = result_dict_asian[key]["length"]
#     config["chunk_size"] = result_dict_asian[key]["length"]
#     config["cooccurrence_dir"] = f"cooccurrence_directory/{key}_asian.pkl"
#     config["hdf5_file"] = f"hdf5_directory/{key}_asian.hdf5"
#     # only train for files that haven't been trained yet
#     if not os.path.isfile(config["output_filepath"]):
#         calculate_cooccurrence(corpus, config)
#         train_glove(config)



# Open corpus_covid json as dictionary

In [None]:
# f = open("corpus_covid.json")
# result_dict_covid = json.load(f)
# f.close()

# Train GloVe

In [None]:
# config = {"device": "cpu",
#           "window_size": 15,
#           "num_partitions": 15,
#           "x_max": 10,
#           "alpha": 0.75,
#           "batch_size": 32,
#           "num_epochs": 10,
#           "embedding_size": 50}

# for key in result_dict_covid.keys():
#     corpus = result_dict_covid[key]["corpus"]
#     config["input_filepath"] = f"{key}.csv"
#     config["output_filepath"] = f"output/{key}_covid.pkl"
#     config["vocab_size"] = result_dict_covid[key]["length"]
#     config["chunk_size"] = result_dict_covid[key]["length"]
#     config["cooccurrence_dir"] = f"cooccurrence_directory/{key}_covid.pkl"
#     config["hdf5_file"] = f"hdf5_directory/{key}_covid.hdf5"
#     # only train for files that haven't been trained yet
#     if not os.path.isfile(config["output_filepath"]):
#         calculate_cooccurrence(corpus, config)
#         train_glove(config)


# Collect the vectors

In [None]:
# Open the stereotypes json

# f = open("stereotypes.json")
# stereotypes = json.load(f)
# f.close()

# Open the config jsons

# f = open("config_prepost.json")
# config_prepost = json.load(f)
# f.close()

# f = open("config_full.json")
# config_full = json.load(f)
# f.close()

# f = open("config_asian.json")
# config_asian = json.load(f)
# f.close()

# Pre Post corpus

In [None]:
# embeddings_prepost = {}

# path_to_folder = "output/"
# matching_files = glob.glob(path_to_folder+"*.pkl")

# for matching_file in matching_files:
#     new_name = matching_file[7:-4]
#     embeddings_prepost[new_name] = {}
#     embeddings_prepost[new_name]['database'] = []
#     embeddings_prepost[new_name]['category'] = []
#     embeddings_prepost[new_name]['word'] = []
#     embeddings_prepost[new_name]['vectors'] = []
    
#     # Load cooccurrence directory
#     cooc_outfile = config_prepost[new_name]["cooccurrence_dir"]
#     with open(cooc_outfile, "rb") as f:
#         vocab = pickle.load(f)
    
#     # Load model
#     model = GloVe(vocab_size=config_prepost[new_name]["vocab_size"],
#                   embedding_size=config_prepost[new_name]["embedding_size"],
#                   x_max=config_prepost[new_name]["x_max"],
#                   alpha=config_prepost[new_name]["alpha"])
#     outfile = config_prepost[new_name]["output_filepath"]
#     model.load_state_dict(torch.load(outfile)) 

#     # Get Keyed Vectors
#     keyed_vectors = KeyedVectors(vector_size=config_prepost[new_name]["embedding_size"])
#     keyed_vectors.add_vectors(keys=[vocab.get_token(index) for index in range(config_prepost[new_name]["vocab_size"])],
#                               weights=(model.weight.weight.detach() + model.weight_tilde.weight.detach()).numpy())

#     # Go through all the categories
#     for key in stereotypes.keys():
#         # Go through all the words
#         for word in stereotypes[key]:
#             word = word.lower()
#             # Get vectors
#             if keyed_vectors.__contains__(word):
#                 weights = keyed_vectors.get_vector(word, norm=False)
#                 weights = weights.tolist()
#                 # Now append everything
#                 embeddings_prepost[new_name]['database'].append("glove")
#                 embeddings_prepost[new_name]['category'].append(key)
#                 embeddings_prepost[new_name]['word'].append(word)
#                 embeddings_prepost[new_name]['vectors'].append(weights)
                

## Full corpus

In [None]:
# embeddings_full = {}

# path_to_folder = "output/Full/"
# matching_files = glob.glob(path_to_folder+"*.pkl")
# for matching_file in matching_files:
#     new_name = matching_file[-14:-4]
#     config_name = f"{matching_file[-23:-4]}"
#     embeddings_full[new_name] = {}
#     embeddings_full[new_name]['database'] = []
#     embeddings_full[new_name]['category'] = []
#     embeddings_full[new_name]['word'] = []
#     embeddings_full[new_name]['vectors'] = []
    
#     # Load cooccurrence directory
#     cooc_outfile = config_full[config_name]["cooccurrence_dir"]
#     vocab_file = f"{cooc_outfile[0:23]}Full{cooc_outfile[22:]}"
#     with open(vocab_file, "rb") as f:
#         vocab = pickle.load(f)
#     # Load model
#     model = GloVe(vocab_size=config_full[config_name]["vocab_size"],
#                   embedding_size=config_full[config_name]["embedding_size"],
#                   x_max=config_full[config_name]["x_max"],
#                   alpha=config_full[config_name]["alpha"])
#     outfile = config_full[config_name]["output_filepath"]
#     model_file = f"{outfile[0:7]}Full{outfile[6:]}"
#     model.load_state_dict(torch.load(model_file)) 

#     # Get Keyed Vectors
#     keyed_vectors = KeyedVectors(vector_size=config_full[config_name]["embedding_size"])
#     keyed_vectors.add_vectors(keys=[vocab.get_token(index) for index in range(config_full[config_name]["vocab_size"])],
#                               weights=(model.weight.weight.detach() + model.weight_tilde.weight.detach()).numpy())
#     # Go through all the categories
#     for key in stereotypes.keys():
#         # Go through all the words
#         for word in stereotypes[key]:
#             word = word.lower()
#             # Get vectors
#             if keyed_vectors.__contains__(word):
#                 weights = keyed_vectors.get_vector(word, norm=False)
#                 weights = weights.tolist()
#                 # Now append everything
#                 embeddings_full[new_name]['database'].append("glove")
#                 embeddings_full[new_name]['category'].append(key)
#                 embeddings_full[new_name]['word'].append(word)
#                 embeddings_full[new_name]['vectors'].append(weights)
                

## Asian corpus

In [None]:
# embeddings_asian = {}

# path_to_folder = "output/Asian/"
# matching_files = glob.glob(path_to_folder+"*.pkl")
# for matching_file in matching_files:
#     new_name = matching_file[-20:-10]
#     config_name = f"{matching_file[-29:-10]}"
#     embeddings_asian[new_name] = {}
#     embeddings_asian[new_name]['database'] = []
#     embeddings_asian[new_name]['category'] = []
#     embeddings_asian[new_name]['word'] = []
#     embeddings_asian[new_name]['vectors'] = []
    
#     # Load cooccurrence directory
#     cooc_outfile = config_asian[config_name]["cooccurrence_dir"]
#     vocab_file = f"{cooc_outfile[0:23]}Asian{cooc_outfile[22:-4]}_asian.pkl"
#     with open(vocab_file, "rb") as f:
#         vocab = pickle.load(f)
#     # Load model
#     model = GloVe(vocab_size=config_asian[config_name]["vocab_size"],
#                   embedding_size=config_asian[config_name]["embedding_size"],
#                   x_max=config_asian[config_name]["x_max"],
#                   alpha=config_asian[config_name]["alpha"])
#     outfile = config_asian[config_name]["output_filepath"]
#     model_file = f"{outfile[0:7]}Asian{outfile[6:-4]}_asian.pkl"
#     model.load_state_dict(torch.load(model_file)) 

#     # Get Keyed Vectors
#     keyed_vectors = KeyedVectors(vector_size=config_asian[config_name]["embedding_size"])
#     keyed_vectors.add_vectors(keys=[vocab.get_token(index) for index in range(config_asian[config_name]["vocab_size"])],
#                               weights=(model.weight.weight.detach() + model.weight_tilde.weight.detach()).numpy())
#     # Go through all the categories
#     for key in stereotypes.keys():
#         # Go through all the words
#         for word in stereotypes[key]:
#             word = word.lower()
#             # Get vectors
#             if keyed_vectors.__contains__(word):
#                 weights = keyed_vectors.get_vector(word)
#                 weights = weights.tolist()
#                 # Now append everything
#                 embeddings_asian[new_name]['database'].append("glove")
#                 embeddings_asian[new_name]['category'].append(key)
#                 embeddings_asian[new_name]['word'].append(word)
#                 embeddings_asian[new_name]['vectors'].append(weights)
    

# Convert to dataframes

In [None]:
# df_prepost = pd.DataFrame(columns = ['day', 'database', 'category', 'word', 'vectors'])

# for day in embeddings_prepost.keys():
#     df_prepost_day = pd.DataFrame.from_dict(embeddings_prepost[day], orient="columns")
#     df_prepost_day['day'] = day
#     df_prepost = df_prepost.append(df_prepost_day)
    
# df_prepost = df_prepost.sort_values(by="day")

# df_prepost.to_csv('embeddings_prepost.csv', index = False)

## Check duplicates as sanity check
# df_prepost.loc[df_prepost['vectors'].astype(str).drop_duplicates().index]

In [None]:
# df_full = pd.DataFrame(columns = ['day', 'database', 'category', 'word', 'vectors'])

# for day in embeddings_full.keys():
#     df_full_day = pd.DataFrame.from_dict(embeddings_full[day], orient="columns")
#     df_full_day['day'] = day
#     df_full = df_full.append(df_full_day)
    
# df_full = df_full.sort_values(by="day")

# df_full.to_csv('embeddings_full.csv', index = False)

## Check duplicates as sanity check
# df_full.loc[df_full['vectors'].astype(str).drop_duplicates().index]

In [None]:
# df_asian = pd.DataFrame(columns = ['day', 'database', 'category', 'word', 'vectors'])

# for day in embeddings_asian.keys():
#     df_asian_day = pd.DataFrame.from_dict(embeddings_asian[day], orient="columns")
#     df_asian_day['day'] = day
#     df_asian = df_asian.append(df_asian_day)

# df_asian = df_asian.sort_values(by="day")

# df_asian.to_csv('embeddings_asian.csv', index = False)

## Check duplicates as sanity check
# df_asian.loc[df_asian['vectors'].astype(str).drop_duplicates().index]