In [6]:
#Word2Vec and FastText Word Embedding with Gensim

import gzip
import gensim
import nltk

In [22]:
# Path of the gzip-compressed review corpus; defined once and reused below.
data_file = "reviews_data.gz"

# Load every line of the archive as a raw ``bytes`` object.
# The path comes from `data_file` (instead of a second copy of the literal)
# so that changing the dataset only requires touching one line.
with gzip.open(data_file, 'rb') as f:
    review_data = list(f)
    
       


In [23]:
# Peek at the first raw record: a bytes line whose date, title and review body
# are separated by tab characters.
review_data[0]

b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

In [41]:
#Tokenize

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

    
# Clean and preprocess the dataset

def preprocess(dataset):
    """Clean a collection of reviews into lists of lemmatised tokens.

    For every review the text is lower-cased, split with NLTK's
    ``word_tokenize``, stripped of English stop words and finally passed
    token by token through ``WordNetLemmatizer.lemmatize``.

    Parameters
    ----------
    dataset : iterable of str or bytes
        The reviews to process. ``bytes`` items are decoded as UTF-8
        (undecodable bytes are replaced). A single ``str``/``bytes`` value is
        treated as a one-review dataset rather than iterated character by
        character.

    Returns
    -------
    list of list of str
        One token list per review, in input order.
    """
    # A lone document would otherwise be iterated per character / per byte.
    if isinstance(dataset, (str, bytes)):
        dataset = [dataset]

    # Set membership is O(1); the stop-word list is scanned once per token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    review_data_processed = []
    for review in dataset:
        # str(b"...") would yield the "b'...'" repr, so decode bytes explicitly.
        if isinstance(review, bytes):
            text = review.decode('utf-8', errors='replace')
        else:
            text = str(review)

        tokens = word_tokenize(text.lower())

        # Filter and lemmatise individual tokens (not whole token lists).
        # Stemming is not applied: the stemmed tokens were never part of the
        # returned value, so computing them was wasted work.
        review_data_processed.append(
            [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        )

    return review_data_processed


# Clean and preprocess the dataset



In [32]:
# Synonyms, Antonyms

from nltk.corpus import wordnet


# Flatten every lemma of every WordNet synset of 'tall' into one list,
# keeping the synset order and the lemma order inside each synset.
lemmas = [lemma for syn in wordnet.synsets('tall') for lemma in syn.lemmas()]

# Each lemma name counts as a synonym (duplicates are kept on purpose).
synonyms = [lemma.name() for lemma in lemmas]

# Only lemmas that have at least one antonym contribute; take the first one.
antonyms = [lemma.antonyms()[0].name() for lemma in lemmas if lemma.antonyms()]

print(synonyms)
print(antonyms)

['tall', 'tall', 'grandiloquent', 'magniloquent', 'tall', 'tall', 'improbable', 'marvelous', 'marvellous', 'tall']
['short']


In [40]:
# POS and NER
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

sentence = "Mark and John are working at Google."

# Run the pipeline one stage at a time: tokens -> POS tags -> entity chunks.
tokens = word_tokenize(sentence)
tagged_tokens = pos_tag(tokens)
entity_tree = ne_chunk(tagged_tokens)

print(entity_tree)



(S
  (PERSON Mark/NNP)
  and/CC
  (PERSON John/NNP)
  are/VBP
  working/VBG
  at/IN
  (ORGANIZATION Google/NNP)
  ./.)


In [43]:
def read_input(input_file):
    """Lazily yield one token list per line of a gzip-compressed file.

    Every raw line read from ``input_file`` is handed to
    ``gensim.utils.simple_preprocess`` and the result is yielded unchanged.
    """
    with gzip.open(input_file, 'rb') as handle:
        for raw_line in handle:
            yield gensim.utils.simple_preprocess(raw_line)


# Materialise the generator: the corpus is iterated more than once below
# (model construction, training) and its length is needed as well.
documents = list(read_input(data_file))


In [44]:
# Train Word2Vec on the tokenised reviews.
# NOTE(review): `size=` looks like the pre-4.0 gensim keyword (later renamed
# `vector_size`) — confirm against the installed gensim version.
model = gensim.models.Word2Vec(
    documents,
    size=150,      # embedding dimensionality
    window=10,     # context window
    min_count=2,   # minimum word count passed to gensim
    workers=10,    # worker threads
)

# Explicit training pass: 10 epochs over the same corpus.
model.train(documents, total_examples=len(documents), epochs=10)

(303494708, 415193550)

In [45]:
# Nearest neighbours of a query word in the trained embedding space
w1 = "dirty"
model.wv.most_similar(positive=w1)

[('filthy', 0.8731397390365601),
 ('unclean', 0.7831881642341614),
 ('stained', 0.7769122123718262),
 ('dusty', 0.7751280665397644),
 ('grubby', 0.760492742061615),
 ('smelly', 0.758076548576355),
 ('dingy', 0.7364238500595093),
 ('mouldy', 0.7207521200180054),
 ('gross', 0.7180871367454529),
 ('soiled', 0.7178699970245361)]

In [46]:
# Similarity score between two specific words
model.wv.similarity("dirty", "smelly")

0.7580765030398822

In [47]:
# Odd one out: which word fits least with the others?
model.wv.doesnt_match("cat dog france".split())

'france'

In [51]:
#Word2Vec Mathematics

def A_is_to_B_as_C_is_to(a, b, c, topn=1):
    """Solve the analogy "a is to b as c is to ?" with the global ``model``.

    The query is ``most_similar(positive=b + c, negative=a)`` on the model's
    keyed vectors (``model.wv``), the same accessor the other cells use.

    Parameters
    ----------
    a, b, c : str or list of str
        Analogy terms; a bare string is wrapped into a one-element list.
    topn : int, default 1
        Number of candidates requested from ``most_similar``.

    Returns
    -------
    str, list of str, or None
        The single best word when ``topn == 1``, the list of candidate words
        otherwise, or ``None`` when ``most_similar`` returns nothing.
    """
    # Normalise every term to a list so that `b + c` is list concatenation.
    a, b, c = (x if isinstance(x, list) else [x] for x in (a, b, c))

    # Query the KeyedVectors directly; calling most_similar on the model
    # object itself is deprecated and emits a DeprecationWarning.
    res = model.wv.most_similar(positive=b + c, negative=a, topn=topn)

    if not res:
        return None
    if topn == 1:
        return res[0][0]
    return [word for word, _score in res]

# man : woman :: king : ?
A_is_to_B_as_C_is_to(a='man', b='woman', c='king')


  """


'queen'

In [None]:
#FastText

from gensim.models import FastText
# Train a FastText model on the same tokenised corpus; sg=1 is passed through
# as in the Word2Vec-style API.
model_ft = FastText(
    documents,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)




In [56]:
# The FastText model can return results even for a word that does not exist
# in the vocabulary.
model_ft.wv.most_similar("Gastroenteritis")

[('catastrophy', 0.7369320392608643),
 ('desastroso', 0.7364808320999146),
 ('granduer', 0.7288857698440552),
 ('struttura', 0.7264410257339478),
 ('katastrophe', 0.7233595848083496),
 ('catastrophe', 0.7171571254730225),
 ('xixeme', 0.7164624333381653),
 ('schizophrenic', 0.7157965898513794),
 ('bohme', 0.7148301601409912),
 ('beind', 0.7146252393722534)]

[('hotell', 0.9006769061088562),
 ('hotelhas', 0.8679174184799194),
 ('hotelthis', 0.8461465239524841),
 ('hotelbut', 0.8390347957611084),
 ('hotelwas', 0.8342684507369995),
 ('hotelwith', 0.8297032117843628),
 ('hotelat', 0.8279556035995483),
 ('shotel', 0.8165179491043091),
 ('hotelgood', 0.8164716958999634),
 ('hotelwould', 0.8118799924850464)]