#Implement word imbeddings

# Preprocess

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

df = pd.read_json('/content/drive/MyDrive/NLP 7th sem/dataset/Sarcasm_Headlines_Dataset_v2.json', lines = True)
print(df.shape)

df.head()

(28619, 3)


Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return filtered_tokens

In [4]:
text_corpus = df['headline'].apply(preprocess)

In [5]:
text_corpus[0]

['thirtysomething', 'scientist', 'unveil', 'doomsday', 'clock', 'hair', 'loss']

In [6]:
!pip install gensim
!pip install python-Levenshtein



#Word2vec

In [7]:
import gensim

model = gensim.models.Word2Vec(
    window=10,
    min_count=4,
)

In [8]:
model.build_vocab(text_corpus, progress_per=1000)

In [9]:
model.train(text_corpus, total_examples=model.corpus_count, epochs=model.epochs)


(871196, 994290)

In [27]:
model.save("/content/drive/MyDrive/NLP 7th sem/weights/word2vec/word2vec-sarcasm.model")


In [10]:
model.wv.most_similar("good")



[('time', 0.9987227320671082),
 ('friend', 0.9986704587936401),
 ('get', 0.9986450672149658),
 ('mom', 0.9986127614974976),
 ('way', 0.9984536170959473),
 ('girlfriend', 0.998231828212738),
 ('idea', 0.9981838464736938),
 ('dad', 0.998181164264679),
 ('die', 0.9981737732887268),
 ('like', 0.9981303811073303)]

In [11]:
model.wv.similarity(w1="good", w2="great")


0.9937815

In [12]:
vector = model.wv['computer']
vector

array([-0.1321151 ,  0.26139477,  0.02344489,  0.02013884,  0.12211651,
       -0.36286184,  0.12769471,  0.52304447, -0.21190079, -0.1339838 ,
       -0.10965713, -0.38602453, -0.03672924,  0.05580724, -0.01306243,
       -0.21030512,  0.08371829, -0.23504812,  0.00543527, -0.51343125,
        0.1642728 ,  0.04787692,  0.17775221, -0.0913579 , -0.08565787,
        0.04591036, -0.17014422, -0.07498486, -0.20780666,  0.01926671,
        0.27008662,  0.07481878,  0.09133296, -0.14739335, -0.0351727 ,
        0.23752563,  0.04132713, -0.2194138 , -0.1877265 , -0.4448602 ,
       -0.05550065, -0.16501512, -0.08559819,  0.01589165,  0.19367823,
       -0.13315625, -0.26763365, -0.00228771,  0.08594432,  0.2402044 ,
        0.10317809, -0.17094825, -0.13325942, -0.04172974, -0.24377188,
        0.12396557,  0.14149739, -0.04867545, -0.21981972, -0.00239788,
        0.03532778,  0.15495205, -0.17486745,  0.09516638, -0.28033414,
        0.21229498,  0.07279653,  0.14425418, -0.2567069 ,  0.26

In [13]:
vector.shape

(100,)

#Fasttext

###skipgram
By default fasttext uses skip gram

In [14]:
!pip install fasttext



In [15]:
def preprocess_fasttext(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [16]:
fasttext_corpus = df['headline'].apply(preprocess_fasttext)

In [17]:
fasttext_corpus[1]

'dem rep totally nail congress fall short gender racial equality'

In [18]:

fast_corpus_text = "".join(fasttext_corpus)

In [None]:
with open("/content/drive/MyDrive/NLP 7th sem/dataset/fast_corpus_text.txt" , 'w') as fast_corpus_file:
  fast_corpus_file.write(fast_corpus_text)

In [19]:
import fasttext

fastmodel = fasttext.train_unsupervised("/content/drive/MyDrive/NLP 7th sem/dataset/fast_corpus_text.txt")

In [20]:
fastmodel.get_nearest_neighbors("science")

[(0.9996328949928284, 'audience'),
 (0.9994791746139526, 'fence'),
 (0.999470591545105, 'experience'),
 (0.9993274211883545, 'enhance'),
 (0.9991028308868408, 'chance'),
 (0.9990776777267456, 'advance'),
 (0.9990459084510803, 'reference'),
 (0.9990207552909851, 'ounce'),
 (0.9989998936653137, 'balance'),
 (0.9989815354347229, 'dance')]

In [21]:
fastmodel.get_word_vector("science")

array([-0.07292272,  0.02354077,  0.18547443,  0.02405453, -0.09048659,
       -0.09180829, -0.2472559 ,  0.07412647, -0.09810843,  0.07890358,
       -0.0893105 ,  0.15991105, -0.23472032,  0.13739204, -0.2122095 ,
        0.14127624, -0.02770304,  0.0027014 , -0.32371104, -0.20016691,
        0.04124147,  0.24766947, -0.25216553, -0.10965463,  0.09074289,
        0.13983034,  0.17381883, -0.17223184,  0.08798582,  0.01220096,
       -0.12760983,  0.03996927,  0.10744526, -0.16854136,  0.07105999,
       -0.06316182,  0.06149823,  0.00726604, -0.20382604, -0.1211672 ,
        0.17385168, -0.11767498,  0.10752564,  0.2698556 ,  0.13161547,
       -0.12371317,  0.3456494 ,  0.03264324, -0.09154684,  0.12761827,
       -0.04987136,  0.11286843,  0.22366503,  0.04593937, -0.15154985,
       -0.05879388,  0.0106386 , -0.20200875,  0.10353652,  0.04513806,
        0.00366625,  0.14153092, -0.02950748,  0.16537812,  0.04029742,
        0.06589752, -0.14529446, -0.00367883,  0.04718598, -0.06

In [22]:
fastmodel.get_word_vector("science").shape


(100,)

###Using CBOW
continous bag of words

In [23]:
fastmodelCbow = fasttext.train_unsupervised("/content/drive/MyDrive/NLP 7th sem/dataset/fast_corpus_text.txt","cbow")

In [24]:
fastmodelCbow.get_nearest_neighbors("science")

[(0.9999983906745911, 'silence'),
 (0.9999977946281433, 'experience'),
 (0.9999977350234985, 'absence'),
 (0.999997615814209, 'audience'),
 (0.9999972581863403, 'reference'),
 (0.9999972581863403, 'balance'),
 (0.9999970197677612, 'lawrence'),
 (0.9999969601631165, 'advance'),
 (0.9999969601631165, 'independence'),
 (0.9999968409538269, 'consequence')]

#Glove


In [25]:
import numpy as np

embedding_index = {}
with open('/content/drive/MyDrive/NLP 7th sem/dataset/glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [26]:
embedding_index['good'].shape


(100,)

In [27]:
import tensorflow as tf
import keras

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(text_corpus)


In [28]:
embedding_matrix = np.zeros((len(tokenizer.word_index)+1, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [29]:
embedding_matrix.shape

(20975, 100)