In [2]:
# Word2vec
import gensim
# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
from keras.preprocessing.text import Tokenizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

import numpy as np
import pandas as pd
import re
import os
import string
import json

Using TensorFlow backend.


In [3]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [28]:
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [20]:
#stemmer = WordNetLemmatizer()
#rona, c19, c-19, cv19, cv-19, corona

def text_process(text):
    # Remove all the special characters
    #document = re.sub(r'\W', ' ', str(text))
    document = re.sub(r'https?:\S+|http?:\S', '', str(text))
    # remove all single characters
    #document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove single characters from the start
    #document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    # remove punctuations
    document = document.translate(str.maketrans('', '', string.punctuation))
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # remove first space
    document = re.sub(r'^\s+', '', document)
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()
    # change covid 19 to covid19
    document = re.sub(r'covid(\s+|\-|\_)19', 'covid19', document)
    # change covid to covid19
    document = re.sub(r'covid(\s+|\W|$)', 'covid19', document)
    return(document)

In [29]:
preprocess("covid 19 and covid-19. and covid_19, and #covid?")

'covid 19 covid 19 covid 19 covid'

In [22]:
text_process("covid 19 and covid-19. and covid_19, and #covid?")

'covid19 and covid19 and covid19 and covid19'

In [108]:
# read dataset
df = pd.read_csv("../data/covid_usa_jan24-mar12_21pm-23pm_onethird_text.csv", encoding = "ISO-8859-1")
#df = pd.read_csv("../data/covid_usa_mar13-may25_21pm-23pm_onethird_text.csv", encoding = "ISO-8859-1")

In [109]:
df.textp = df.text.apply(lambda x: preprocess(x))
df.textp = df.textp.apply(lambda x: text_process(x))

  """Entry point for launching an IPython kernel.


In [52]:
%%time
documents = [_text.split() for _text in df.textp] 

CPU times: user 5.74 s, sys: 572 ms, total: 6.31 s
Wall time: 6.3 s


In [110]:
words = []
for t in df.textp:
    words += t.split()

In [111]:
c = Counter(words)

In [112]:
c50 = c.most_common(50)

In [113]:
with open("jan24-mar12_common_words50.txt", "w") as fp:
    json.dump(c50, fp)

In [53]:
w2v_model = gensim.models.word2vec.Word2Vec(size=200, 
                                            window=5, 
                                            min_count=10, 
                                            workers=8)

In [54]:
w2v_model.build_vocab(documents)
words = w2v_model.wv.vocab.keys()
#print(len(w2v_model.wv["u"]))
#print(list(w2v_model.wv.vocab.items())[0:5])
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 56605


In [55]:
%%time
w2v_model.train(documents, total_examples=len(documents), epochs=32)

CPU times: user 48min 41s, sys: 11.7 s, total: 48min 53s
Wall time: 14min 16s


(1226241288, 1325684512)

In [56]:
w2v_model.save("mar13-may25_w2vmodel.w2v")

In [57]:
w2v_model.most_similar("covid19")

  """Entry point for launching an IPython kernel.


[('coronavirus', 0.8685397505760193),
 ('virus', 0.5763905048370361),
 ('also', 0.48496437072753906),
 ('amp', 0.4203587770462036),
 ('disease', 0.3924241065979004),
 ('many', 0.3548542559146881),
 ('means', 0.3501068949699402),
 ('cancer', 0.3478909432888031),
 ('cv', 0.32926028966903687),
 ('secondary', 0.3270239233970642)]

In [39]:
cosine_similarity([[1, 0, -1]], [[-1,-1, 0]])
type([[1, 0, -1]])

list

In [3]:
w2v_model1 = gensim.models.Word2Vec.load("jan24-mar12_w2vmodel.w2v")
w2v_model2 = gensim.models.Word2Vec.load("mar13-may25_w2vmodel.w2v")

In [4]:
def simi_compare(word1, word2):
    wd11 = w2v_model1.wv[word1]
    wd12 = w2v_model1.wv[word2]
    wd21 = w2v_model2.wv[word1]
    wd22 = w2v_model2.wv[word2]
    print("jan24-mar12: ", cosine_similarity([wd11], [wd12]))
    print("mar13-may25: ", cosine_similarity([wd21], [wd22]))

In [18]:
# china, cases, death, asian, black, distance, hoax
# metaphors, distrust, misinformation


simi_compare("covid19", "mistrust")

jan24-mar12:  [[-0.02310156]]
mar13-may25:  [[-0.00771967]]


In [60]:
wd1 = w2v_model.wv["covid19"]
wd2 = w2v_model.wv["china"]
cosine_similarity([wd1], [wd2])

array([[0.01677959]], dtype=float32)

In [33]:
%%time
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.textp)
print(list(tokenizer.word_index.items())[0:10])

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

[('coronavirus', 1), ('covid19', 2), ('trump', 3), ('people', 4), ('health', 5), ('amp', 6), ('us', 7), ('virus', 8), ('china', 9), ('cases', 10)]
Total words 101545
CPU times: user 9.04 s, sys: 3.64 ms, total: 9.04 s
Wall time: 9.04 s


In [None]:
embedding_matrix = np.zeros((vocab_size, W2V_SIZE)) #Return a new array of given shape and type, filled with zeros.
for word, i in tokenizer.word_index.items():
  if word in w2v_model.wv:
    embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)
print(len(embedding_matrix[0]))