Look into https://dylancastillo.co/nlp-snippets-clean-and-tokenize-text-with-python/ for better tokenization and cleaning.

In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')

import numpy as np
import pandas as pd
from gensim.models import word2vec

from google.colab import drive
drive.mount('/content/drive')

import re # For regular expressions

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


## (a) Load the dataset

In [2]:
tweets = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Corona_Tweets.csv', names=['text'])

In [3]:
tweets.head()


Unnamed: 0,text
0,TRENDING: New Yorkers encounter empty supermar...
1,When I couldn't find hand sanitizer at Fred Me...
2,Find out how you can protect yourself and love...
3,#Panic buying hits #NewYork City as anxious sh...
4,#toiletpaper #dunnypaper #coronavirus #coronav...


In [83]:
def load_data(path='/content/drive/MyDrive/Colab Notebooks/Corona_Tweets.csv'):
    """ Read tweets from the file and turn each tweet into a list of cleaned words.
        Cleaning steps, in order: strip URLs (http...), replace every character that is
        not an ASCII letter or a space with a space, sentence-tokenize with punkt,
        lower-case each word, and drop empty strings and English stop words.
        Params:
            path (string): CSV file with one tweet per row; it is read with a single
                           column named 'text'. Defaults to the Colab Drive location.
        Return:
            list of lists (list_words), with words from each of the processed tweets.
            A tweet for which the tokenizer returns no sentences contributes no entry.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Build the stop-word set once: calling stopwords.words() for every single word
    # is loop-invariant work, and a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    tweets = pd.read_csv(path, names=['text'])
    list_words = []
    ### iterate over all tweets from the dataset
    for raw_text in tweets['text']:
        ### remove URLs first, then every non-letter.
        text = re.sub('[^A-Za-z ]', ' ', re.sub(r'http\S+', '', raw_text))
        ### tokenize
        sentences = tokenizer.tokenize(text.strip())

        new_words = []
        ### iterate over all words of a tweet
        for s in sentences:
            for w in s.split(" "):
                w = w.lower()
                if w == '' or w in stop_words:
                    continue
                new_words.append(w)

        # Append once per tweet (not once per sentence) so a tweet that splits into
        # several sentences is never added to the corpus more than once.
        if sentences:
            list_words.append(new_words)
    return list_words

# check a few samples of twitter corpus
twitter_corpus = load_data()
print(twitter_corpus[:5])

[['trending', 'new', 'yorkers', 'encounter', 'empty', 'supermarket', 'shelves', 'pictured', 'wegmans', 'brooklyn', 'sold', 'online', 'grocers', 'foodkick', 'maxdelivery', 'coronavirus', 'fearing', 'shoppers', 'stock'], ['find', 'hand', 'sanitizer', 'fred', 'meyer', 'turned', 'amazon', 'pack', 'purell', 'check', 'coronavirus', 'concerns', 'driving', 'prices'], ['find', 'protect', 'loved', 'ones', 'coronavirus'], ['panic', 'buying', 'hits', 'newyork', 'city', 'anxious', 'shoppers', 'stock', 'food', 'amp', 'medical', 'supplies', 'healthcare', 'worker', 'becomes', 'bigapple', 'st', 'confirmed', 'coronavirus', 'patient', 'bloomberg', 'staged', 'event', 'qanon', 'qanon', 'qanon', 'election', 'cdc'], ['toiletpaper', 'dunnypaper', 'coronavirus', 'coronavirusaustralia', 'coronavirusupdate', 'covid', 'news', 'corvid', 'newsmelb', 'dunnypapergate', 'costco', 'one', 'week', 'everyone', 'buying', 'baby', 'milk', 'powder', 'next', 'everyone', 'buying', 'toilet', 'paper']]


## (b) Create co-occurrence matrix

In [84]:
def distinct_words(corpus):
    """ get a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    # Collect the vocabulary in a set: membership checks against a growing list
    # made the original quadratic in the vocabulary size.
    vocabulary = set()
    for document in corpus:
        vocabulary.update(document)

    corpus_words = sorted(vocabulary)
    num_corpus_words = len(corpus_words)
    return corpus_words, num_corpus_words

words, num_words = distinct_words(twitter_corpus)
print(words[:10], num_words)

['aadya', 'aadyasitara', 'aamiin', 'aapl', 'abajam', 'abandon', 'abandoning', 'abc', 'abeg', 'abid'] 11329


In [85]:
def compute_co_occurrence_matrix(corpus, window_size=5):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 5).
        Two words co-occur when they appear in the same document at most window_size
        positions apart. Each such pair adds 1 to both M[a, b] and M[b, a], so M is
        symmetric. A word paired with another occurrence of itself is not counted,
        which leaves the diagonal at zero.
        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape = [number of corpus words x number of corpus words]):
                Co-occurence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    distinctWords, numberOfWords = distinct_words(corpus)
    M = np.zeros((numberOfWords, numberOfWords), dtype=int)
    word2Ind = {word: index for index, word in enumerate(distinctWords)}

    # A non-positive window yields no pairs; clamping keeps the slice below from
    # wrapping around with a negative end index.
    span = max(window_size, 0)

    for document in corpus:
        # Resolve every token to its row/column once per document instead of four
        # dictionary lookups per matrix update.
        indices = [word2Ind[token] for token in document]
        for position, center in enumerate(indices):
            # Only look forward: the symmetric update below covers the backward context.
            for neighbour in indices[position + 1: position + 1 + span]:
                if neighbour == center:
                    continue
                M[center, neighbour] += 1
                M[neighbour, center] += 1

    return M, word2Ind

M, word2Ind = compute_co_occurrence_matrix(twitter_corpus)

## (c) SVD

In [86]:
# -----------------------------
# Run SVD
# Note: This may take several minutes (~20-30 minutes)
# SVD and obtain word embeddings of size 75. [2 marks]
# ------------------------------

# Decompose the co-occurrence matrix M; full_matrices=False asks numpy for the
# reduced-size factors rather than the full square U and Vh.
# NOTE(review): only the leading `dim` columns of U are used afterwards, so a
# truncated solver might avoid most of this cost - confirm before switching.
U, s, Vh = np.linalg.svd(M, full_matrices = False)
# Number of leading columns of U kept as the embedding for each word.
dim = 75

In [87]:
SVD_embeddings = U[:,:dim]
print(SVD_embeddings)

[[-2.89359061e-04  8.94298798e-04 -2.00384421e-04 ... -1.41461670e-03
  -2.83131694e-03 -4.51807535e-04]
 [-5.78648725e-05 -2.60518319e-05  7.59015348e-07 ... -1.47781419e-03
  -2.66428199e-03 -4.76970458e-04]
 [-2.77121697e-05 -1.87857091e-05 -3.33057849e-06 ... -6.17710485e-03
  -2.50990161e-03  4.59517934e-04]
 ...
 [-2.29894553e-04 -9.92902168e-05 -9.54370402e-06 ... -2.22686922e-03
   3.02238595e-03  3.54157679e-03]
 [-1.69065264e-04 -1.35788345e-04  1.94673289e-05 ... -7.71412145e-04
  -2.07956828e-04 -9.30110533e-04]
 [-6.83880735e-06 -9.96293171e-06  5.06565824e-06 ... -1.83436799e-04
   5.19580968e-05  1.35731535e-04]]


In [88]:
print(SVD_embeddings.shape)

(11329, 75)


## (d1) Word2Vec

In [89]:
# Creating the word2vec model and setting values for the various parameters

# Hyper-parameters for training.
num_features = 75   # Word vector dimensionality
min_word_count = 40  # Minimum word count. You can change it also.
num_workers = 4     # Number of parallel threads, can be changed
context = 10         # Context window size
downsampling = 1e-3 # (0.001) Downsample setting for frequent words, can be changed

# Constructing Word2Vec with a corpus trains the model immediately.
print("Training Word2Vec model....")
model = word2vec.Word2Vec(
    twitter_corpus,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# model.init_sims(replace=True) was removed: gensim 4 deprecates it (it only emits a
# warning) and no longer needs it for memory efficiency. model.wv.most_similar()
# ranks by cosine similarity, so its results do not depend on that call.

Training Word2Vec model....


  model.init_sims(replace=True)


## (d2) Compare SVD word embeddings with Word2Vec

In [90]:
from sklearn.metrics.pairwise import cosine_similarity

def svd_most_similar(query_word, n=10):
    """ return 'n' most similar words of a query word using the SVD word embeddings, similar to word2vec's most_similar
        Params:
            query_word (string): a query word; it must be a key of word2Ind
            n (int): how many neighbours to return (default 10)
        Return:
            most_similar (list of (string, float) tuples): up to 'n' entries (w, cos), where w is a
                word other than query_word and cos is the cosine similarity between the SVD
                embeddings of w and query_word, ordered from most to least similar
        Raises:
            KeyError: if query_word is not in word2Ind
    """
    # get index of a query_word
    query_word_idx = word2Ind[query_word]
    # get word embedding for a query_word
    word = SVD_embeddings[query_word_idx]
    # One vectorised call scores every vocabulary word against the query; the
    # (vocabulary, 1) result is flattened so it can be indexed by word index.
    cos_similarity = cosine_similarity(SVD_embeddings, word.reshape(1, -1)).ravel()

    # word2Ind maps each word to its row in SVD_embeddings; skip the query itself.
    most_similar = [
        (w, float(cos_similarity[idx]))
        for w, idx in word2Ind.items()
        if w != query_word
    ]
    most_similar.sort(key=lambda pair: pair[1])
    most_similar.reverse()
    # Honour the requested count (the previous version always returned 10).
    return most_similar[:max(n, 0)]
    

## SVD vs Word2Vec: "???"

In [91]:
svd_most_similar("covid")

[('dayslater', 0.15211207951811148),
 ('bayareanewsgroup', 0.1518003966546849),
 ('iamlegend', 0.15143269034472234),
 ('insidejoke', 0.14743165680859183),
 ('mkendall', 0.14216628913308402),
 ('worldwarz', 0.13651493010346166),
 ('trumppresser', 0.12901769515039543),
 ('ventilators', 0.12666162004513276),
 ('canadaprmasterclass', 0.11511970555996316),
 ('youtuber', 0.11106161290140464)]

In [92]:
model.wv.most_similar("covid") #this word2vec trained model on tweets

[('much', 0.998649537563324),
 ('going', 0.9984346628189087),
 ('left', 0.9984264373779297),
 ('back', 0.9982723593711853),
 ('virus', 0.9982688426971436),
 ('selling', 0.998207688331604),
 ('good', 0.9981772899627686),
 ('right', 0.9981626868247986),
 ('like', 0.9981588125228882),
 ('stocked', 0.9981558918952942)]

In [93]:
svd_most_similar("grocery")

[('ht', 0.4620264362211107),
 ('accusations', 0.45600531657149773),
 ('elys', 0.41677304369122675),
 ('dashpay', 0.3996438569491531),
 ('pleaselike', 0.36164498195548106),
 ('effing', 0.36082555543703004),
 ('malware', 0.35626459489036155),
 ('llama', 0.34977194944182416),
 ('coindesk', 0.34647039787325523),
 ('pajama', 0.34066901693332785)]

In [94]:
model.wv.most_similar("grocery")

[('went', 0.9970299005508423),
 ('coronapocalypse', 0.9969602823257446),
 ('got', 0.9968372583389282),
 ('empty', 0.9966487288475037),
 ('supermarket', 0.9966257214546204),
 ('gone', 0.9966016411781311),
 ('store', 0.9964069128036499),
 ('yesterday', 0.9963367581367493),
 ('coronaviruspandemic', 0.9961442947387695),
 ('still', 0.9961057901382446)]