In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

import numpy as np
import pandas as pd
from gensim.models import word2vec

from google.colab import drive
drive.mount('/content/drive')

import re # For regular expressions

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


## (a) Load the dataset

In [2]:
# Load the raw tweets CSV into a DataFrame whose column is named 'text'.
# NOTE(review): load_data() reads the same file again on its own, so this frame
# appears to be used only for the preview in the next cell — confirm it is still needed.
tweets = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Corona_Tweets.csv', names=['text'])

In [3]:
# Preview the first rows of the raw tweets (bare expression so the notebook renders the frame).
tweets.head()


Unnamed: 0,text
0,TRENDING: New Yorkers encounter empty supermar...
1,When I couldn't find hand sanitizer at Fred Me...
2,Find out how you can protect yourself and love...
3,#Panic buying hits #NewYork City as anxious sh...
4,#toiletpaper #dunnypaper #coronavirus #coronav...


In [4]:
def load_data(path='/content/drive/MyDrive/Colab Notebooks/Corona_Tweets.csv', stop_words=None):
    """ Read tweets from a CSV file and turn each one into a list of cleaned words.

        For every tweet: URLs (anything starting with 'http') are removed, every
        character that is not an ASCII letter or a space is dropped, the text is split
        on single spaces, and each word is lower-cased. Empty strings and stop words
        are discarded.

        Params:
            path (string): CSV file to read; it is loaded with one column named 'text'.
                Defaults to the Google Drive location used by this notebook.
            stop_words (iterable of strings or None): lower-case words to discard.
                When None, NLTK's English stop-word list is used.
        Return:
            list of lists (list_words), with words from each of the processed tweets
    """
    tweets = pd.read_csv(path, names=['text'])

    # Build the stop-word lookup once: re-reading the NLTK list for every single
    # word is loop-invariant work, and a set gives constant-time membership tests.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    list_words = []
    ### iterate over all tweets from the dataset
    for raw_text in tweets['text']:
        ### strip URLs first, so their letters do not survive the non-letter filter
        text = re.sub(r'http\S+', '', raw_text)
        ### remove non-letter.
        text = re.sub('[^A-Za-z ]', '', text)
        ### tokenize and lower-case; consecutive spaces produce '' tokens, dropped below
        words = [token.lower() for token in text.split(" ")]
        list_words.append([w for w in words if w != '' and w not in stop_words])
    return list_words

# Build the tokenised corpus (one list of cleaned words per tweet) and
# print the first five processed tweets as a sanity check.
twitter_corpus = load_data()
print(twitter_corpus[:5])

[['trending', 'new', 'yorkers', 'encounter', 'empty', 'supermarket', 'shelves', 'pictured', 'wegmans', 'brooklyn', 'soldout', 'online', 'grocers', 'foodkick', 'maxdelivery', 'coronavirusfearing', 'shoppers', 'stock'], ['couldnt', 'find', 'hand', 'sanitizer', 'fred', 'meyer', 'turned', 'amazon', 'pack', 'purellcheck', 'coronavirus', 'concerns', 'driving', 'prices'], ['find', 'protect', 'loved', 'ones', 'coronavirus'], ['panic', 'buying', 'hits', 'newyork', 'city', 'anxious', 'shoppers', 'stock', 'foodampmedical', 'supplies', 'healthcare', 'worker', 'becomes', 'bigapple', 'st', 'confirmed', 'coronavirus', 'patient', 'bloomberg', 'staged', 'eventqanon', 'qanon', 'qanon', 'election', 'cdc'], ['toiletpaper', 'dunnypaper', 'coronavirus', 'coronavirusaustralia', 'coronavirusupdate', 'covid', 'news', 'corvid', 'newsmelb', 'dunnypapergate', 'costco', 'one', 'week', 'everyone', 'buying', 'baby', 'milk', 'powder', 'next', 'everyone', 'buying', 'toilet', 'paper']]


## (b) Create co-occurrence matrix

In [5]:
def distinct_words(corpus):
    """ get a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    # De-duplicate through a set: testing membership against a growing list
    # would make this quadratic in the vocabulary size.
    corpus_words = sorted({term for sentence in corpus for term in sentence})
    num_corpus_words = len(corpus_words)
    return corpus_words, num_corpus_words

# Vocabulary of the corpus: show the first ten words of the sorted list and the vocabulary size.
words, num_words = distinct_words(twitter_corpus)
print(words[:10], num_words)

['aadya', 'aamiin', 'aapl', 'abajam', 'abandon', 'abandoning', 'abc', 'abceyewitness', 'abeg', 'abid'] 12984


In [6]:
def compute_co_occurrence_matrix(corpus, window_size=5):
    """ Build a symmetric word/word co-occurrence count matrix for a corpus.

        Two words co-occur when they appear in the same document at most
        `window_size` positions apart. Every such pair adds one to both M[a, b]
        and M[b, a]; a pair made of the same word twice is not counted.

        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy array of shape = [number of corpus words x number of corpus words]):
                Co-occurrence matrix of word counts.
                Rows/columns follow the word order returned by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    vocabulary, vocab_size = distinct_words(corpus)
    word2Ind = {token: position for position, token in enumerate(vocabulary)}
    M = np.zeros((vocab_size, vocab_size), dtype=int)

    for document in corpus:
        doc_length = len(document)
        for centre, centre_word in enumerate(document):
            # Only look forward: the mirrored update below covers the backward direction.
            last_neighbour = min(centre + window_size, doc_length - 1)
            for neighbour in range(centre + 1, last_neighbour + 1):
                neighbour_word = document[neighbour]
                if neighbour_word == centre_word:
                    continue
                row = word2Ind[centre_word]
                col = word2Ind[neighbour_word]
                M[row, col] += 1
                M[col, row] += 1

    return M, word2Ind

# Build the co-occurrence counts for the whole corpus with the default window_size of 5.
# NOTE(review): M is a dense vocabulary x vocabulary integer array, so memory grows
# quadratically with the vocabulary size.
M, word2Ind = compute_co_occurrence_matrix(twitter_corpus)

## (c) SVD

In [None]:
# -----------------------------
# Run SVD
# Note: This may take several minutes (~20-30 minutes)
# SVD and obtain word embeddings of size 75. [2 marks]
# ------------------------------

# Dense SVD of the co-occurrence matrix; full_matrices=False requests the reduced factorisation.
# NOTE(review): only the first `dim` columns of U are used afterwards, so a truncated
# solver (e.g. scipy.sparse.linalg.svds) would likely be much cheaper — confirm before switching,
# since it changes the ordering/sign conventions of the factors.
U, s, Vh = np.linalg.svd(M, full_matrices = False)
dim = 75  # number of leading singular vectors kept as the embedding size

In [8]:
# Keep the first `dim` columns of U as the word embeddings: one row per row of M.
# NOTE(review): the singular values `s` are not applied, so every kept direction has
# equal weight — confirm that unscaled U (rather than U * s) is intended.
SVD_embeddings = U[:,:dim]
print(SVD_embeddings)

[[-2.98620315e-04 -3.84838798e-04  1.07257783e-03 ... -1.79690763e-03
  -1.26833970e-03  4.31516980e-03]
 [-3.03451388e-05  9.23632367e-06 -1.18706208e-05 ...  8.42969015e-04
  -1.16071723e-03  2.03855851e-03]
 [-2.10217284e-04  9.37793155e-04  2.42032372e-04 ...  8.91741421e-03
  -6.29989717e-03 -3.65969416e-04]
 ...
 [-4.44332667e-10 -2.52553047e-10  4.92604905e-10 ...  1.75982430e-05
  -1.66319485e-07  5.84566748e-06]
 [-2.97272153e-05  1.42203802e-05 -2.98434676e-05 ... -2.07913107e-03
  -4.11575974e-03 -5.22232518e-04]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]


In [11]:
# Sanity check: the embeddings should have one row per row of M and `dim` columns.
print(SVD_embeddings.shape)

(12984, 75)


## (d1) Word2Vec

In [12]:
# Train a Word2Vec model on the tokenised tweets.

# Hyper-parameters
num_features = 75   # Word vector dimensionality
min_word_count = 40  # Minimum word count. You can change it also.
num_workers = 4     # Number of parallel threads, can be changed
context = 10         # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words, can be changed

# Initializing the train model
print("Training Word2Vec model....")
model = word2vec.Word2Vec(
    twitter_corpus,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# model.init_sims(replace=True) is intentionally not called: gensim 4 (the API this
# cell already targets via `vector_size`) deprecates it and no longer needs it to
# save memory; most_similar() normalises the vectors itself when computing cosines.

Training Word2Vec model....


  model.init_sims(replace=True)


## (d2) Compare SVD word embeddings with Word2Vec

In [60]:
from sklearn.metrics.pairwise import cosine_similarity

def svd_most_similar(query_word, n=10, embeddings=None, word_to_index=None):
    """ Return the 'n' words most similar to a query word under the SVD word embeddings,
        similar to word2vec's most_similar.

        Similarity is the cosine between embedding rows. A row that is all zeros
        gets similarity 0 to everything.

        Params:
            query_word (string): a query word; must be a key of the word index
            n (int): number of neighbours to return (default 10)
            embeddings (2-D numpy array or None): one embedding per row. Defaults to the
                notebook-level SVD_embeddings.
            word_to_index (dict or None): maps each word to its row in `embeddings`.
                Defaults to the notebook-level word2Ind.
        Return:
            most_similar (list of (string, float) tuples): up to 'n' (word, cosine similarity)
                pairs ordered from most to least similar; the query word itself is excluded
        Raises:
            KeyError: if query_word is not in the word index
    """
    if embeddings is None:
        embeddings = SVD_embeddings
    if word_to_index is None:
        word_to_index = word2Ind

    # get index of a query_word
    query_word_idx = word_to_index[query_word]
    vectors = np.asarray(embeddings, dtype=float)

    # Normalise every row once; zero rows keep norm 1 so they stay zero instead of dividing by 0.
    norms = np.linalg.norm(vectors, axis=1)
    norms[norms == 0] = 1.0
    unit_vectors = vectors / norms[:, np.newaxis]

    # Cosine similarity of every word with the query in one matrix-vector product.
    similarities = unit_vectors @ unit_vectors[query_word_idx]

    index_to_word = {idx: word for word, idx in word_to_index.items()}
    # Stable ascending sort, then reversed, gives descending similarity.
    ranked = np.argsort(similarities, kind='stable')[::-1].tolist()

    most_similar = [
        (index_to_word[idx], float(similarities[idx]))
        for idx in ranked
        if idx != query_word_idx
    ]
    # Honour the requested neighbour count (a negative n yields an empty list).
    return most_similar[:max(n, 0)]
    

## SVD vs Word2Vec: nearest-neighbour comparison for sample query words

In [54]:
# Nearest neighbours of "covid" according to the SVD embeddings.
svd_most_similar("covid")

[('worldwarz', 0.1657200136350725),
 ('dayslater', 0.16542200057067208),
 ('iamlegend', 0.16534769853953288),
 ('mkendallbayareanewsgroupcom', 0.1583721799923436),
 ('insidejoke', 0.15267889123197742),
 ('massgovernor', 0.14342102979876528),
 ('youtuber', 0.1390676214007522),
 ('youtubechannel', 0.13826021340314062),
 ('trumppresser', 0.136026401351604),
 ('nowcoronapocalypse', 0.1345596531182594)]

In [55]:
# Same query against the Word2Vec model trained on the tweets, for comparison.
model.wv.most_similar("covid") #this word2vec trained model on tweets

[('country', 0.9987647533416748),
 ('china', 0.9987171292304993),
 ('virus', 0.9986136555671692),
 ('ppl', 0.9985262155532837),
 ('products', 0.9985156655311584),
 ('first', 0.9984710812568665),
 ('soap', 0.9984516501426697),
 ('low', 0.9984477758407593),
 ('pandemic', 0.9984383583068848),
 ('corona', 0.9984352588653564)]

In [56]:
# Nearest neighbours of "grocery" according to the SVD embeddings.
svd_most_similar("grocery")

[('accusationsht', 0.4511142908938376),
 ('dashpay', 0.4175264740339228),
 ('elys', 0.41711651697408986),
 ('effing', 0.3651110785251136),
 ('coindesk', 0.35647753349239264),
 ('llama', 0.3483185427475506),
 ('malware', 0.3295038384056452),
 ('pajama', 0.3241695638006173),
 ('maggi', 0.3160654991030448),
 ('nicroman', 0.3132467395475801)]

In [57]:
# Same "grocery" query against the Word2Vec model, for comparison.
model.wv.most_similar("grocery")

[('went', 0.9961752891540527),
 ('tp', 0.995117723941803),
 ('ive', 0.9950107932090759),
 ('didnt', 0.9949583411216736),
 ('coronapocalypse', 0.9949297904968262),
 ('got', 0.9948993921279907),
 ('crazy', 0.9948153495788574),
 ('empty', 0.9947752952575684),
 ('every', 0.9947537183761597),
 ('lines', 0.9947482943534851)]