In [1]:
import numpy as np
import pandas as pd
import re
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader, random_split
from collections import Counter, OrderedDict
from tqdm import tqdm
import matplotlib.pyplot as plt
#from nltk.tokenize import word_tokenize

torch.manual_seed(1)

<torch._C.Generator at 0x7f3f0ad4e5b8>

# **Define some parameters**

In [10]:
WINDOW_SIZE = 2
EMBEDDING_SIZE = 50
BATCH_SIZE = 256
EPOCH=1

# **Define classes and functions**

**Note: I don't split dataset into training and validation sets because the cuda memory isn't enough to hold the training. In order to get the best epoch, I check the performance with different iterations (see result in report).** 

In [3]:
# define class Text2Dataset 
class Text2DataSet(Dataset):

    def __init__(self, data_file_path, content_type='reviews', window_size=2):

        # read files and lower the case
        if content_type=='reviews':
          f = pd.read_csv(data_file_path)
          review = f['Review'].tolist()
          text = ' '.join(review).lower()
        elif content_type=='scifi': 
          with open(data_file_path,'r',encoding='utf-8') as f:
            text = f.read().lower()
        else:
          print('Define correct content_type to select the file.')
        
        # remove numbers
        text = re.sub(r'[0-9]+','',text)
        
        # remove punctuation
        table = str.maketrans('', '', string.punctuation)
        words = [w.translate(table) for w in text.split()]

        self.count_word = Counter(words)
        self.vocab = set(words)
        self.vocab_size = len(self.vocab)
        self.word_to_idx = {word: i for i, word in enumerate(self.vocab)}
        self.idx_to_word = list(self.word_to_idx.keys()) 
        self.window_size = window_size

        self.context_target =  [([words[i-(j+1)] for j in range(window_size)] +\
                                 [words[i+(j+1)] for j in range(window_size)],
                                words[i])
                                for i in range(window_size, len(words)-window_size)]

    def __getitem__(self, idx):
        context = torch.tensor([self.word_to_idx[w] for w in self.context_target[idx][0]])
        target = torch.tensor([self.word_to_idx[self.context_target[idx][1]]])
        return context, target

    def __len__(self):
        return len(self.context_target)
      


# define class of CBOW
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size, window_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)
    
    def forward(self, inputs):
        embeds = torch.sum(self.embeddings(inputs), dim=1)     # sum of input tensors
        out = self.linear(embeds)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


# define function to train the cbow model
def train_cbow(data, vocab_size, vocab_to_idx, embedding_size, window_size, epoch=EPOCH):
    model = CBOW(vocab_size,embedding_size, window_size)
    
    nll_loss = nn.NLLLoss()    # negative log loss
    #optimizer = optim.SGD(cbow.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    cuda_available = torch.cuda.is_available()
    data_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)

    for epo in range(epoch):
        total_loss = 0
        for context, target in tqdm(data_loader):
          if context.size()[0] != BATCH_SIZE:
            continue
          # continue training with last several instances if the number of them less than batch size
          if cuda_available:
            context = context.cuda()
            target = target.squeeze(1).cuda()
            model = model.cuda()          

          model.zero_grad()
          log_prob = model(context)
          loss = nll_loss(log_prob, target)
          loss.backward()
          optimizer.step()
          total_loss += loss    
        average_loss = total_loss / len(data_loader)  

        print("{}/{} average loss {:.4f}".format(epo+1, epoch, average_loss))
    
    return model


# search words with different frequencies
def search_words_in_different_frequencies(words_dict, high_bar=20000, 
                                  low_bar=50, median_bar=[1000,5000]):
  high = []
  median = []
  rare = []
  for word, count in words_dict.items():
    if count > high_bar:
      high.append(word)
    elif count > median_bar[0] and count < median_bar[1]:
      median.append(word)
    elif count < low_bar:
      rare.append(word)
  return high, median, rare


# K-nearest neighbor to get top similar words of certain word
def get_closest_word(model, idx_to_word, idx, topn=5): 
  embed_matrix = model.embeddings.weight.detach().cpu().numpy()
  dists = np.dot((embed_matrix - embed_matrix[idx])**2, np.ones(embed_matrix.shape[1]))
  idxs = np.argsort(dists)[1:topn+1]
  #print('The {} nearest neighbour of {} are: '.format(str(topn), idx_to_word[idx]))
  closest_words = []
  for i in idxs:
    closest_words.append(idx_to_word[i])
  print(closest_words)
  return closest_words




# ***Corpus: tripadvisor hotel reviews***

---


# **Preprocess and load data of tripadvisor hotel reviews**
**Preprocess and load data**

In [13]:
# load dataset hotel reviews
reviews = Text2DataSet('tripadvisor_hotel_reviews.csv',content_type='reviews',window_size=WINDOW_SIZE)

**get variables needed for model training**

In [14]:
vocab_size_rev = reviews.vocab_size
word_to_idx_rev = reviews.word_to_idx
idx_to_word_rev = reviews.idx_to_word
print(vocab_size_rev)

76101


# **Train model on the hotel reviews**

In [15]:
# train model on dataset of hotel reviews, 10 epochs
model_rev = train_cbow(reviews, vocab_size_rev, word_to_idx_rev, EMBEDDING_SIZE, WINDOW_SIZE, epoch=EPOCH)

100%|██████████| 8204/8204 [01:27<00:00, 93.82it/s]

1/1 average loss 7.5667





# **Search words in different frequencies in corpus hotel reviews**

In [16]:
# get the statistics of word counts
rev_words_dict = reviews.count_word
pd.DataFrame(rev_words_dict.values()).describe()

Unnamed: 0,0
count,76101.0
mean,27.596944
std,372.244207
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,48864.0


In [None]:
# looking for words with different frequencies
high_rev, low_rev, rare_rev = search_words_in_different_frequencies(rev_words_dict,high_bar=15000 ,
                                                          median_bar=[300,500], low_bar=5)
print("high frequency:", high_rev)
print("low frequency:", low_rev)
print("rare:", rare_rev)

high frequency: ['hotel', 'good', 'stay', 'room', 'not', 'great', 'staff', 'nt']
low frequency: ['anniversary', 'valet', 'talking', 'advantage', 'longer', 'member', 'chain', 'showed', 'oh', 'mixed', 'calls', 'asking', 'provide', 'furnished', 'concerned', 'neighborhood', 'picky', 'saying', 'champagne', 'chocolate', 'trips', 'screen', 'rates', 'sound', 'filled', 'w', 'driver', 'downside', 'picked', 'broken', 'face', 'play', 'hair', 'sleeping', 'wireless', 'comments', 'language', 'company', 'lights', 'hit', 'online', 'word', 'design', 'opinion', 'welcoming', 'art', 'chance', 'interesting', 'options', 'museum', 'experienced', 'exceptional', 'sheets', 'foot', 'bell', 'travelers', 'slept', 'july', 'pleasantly', 'reserved', 'hall', 'waited', 'noticed', 'dollar', 'credit', 'checkout', 'gift', 'earlier', 'man', 'didnçé', 'bag', 'gets', 'sent', 'sunday', 'apparently', 'downstairs', 'closer', 'unfortunately', 'lady', 'aware', 'treated', 'lucky', 'entrance', 'nearly', 'finding', 'thanks', 'spendin

# **Predict top 5 closest words by model trained on hotel reviews**

In [17]:
# top 5 closest words predicted by model trained on dataset of hotel reviews
chosen_rev_words = ['hotel','good','stay','hair','exceptional','imagine','emotions','overpaying','shocks']
for word in chosen_rev_words:
  print("{}:{}".format(word, rev_words_dict[word]))
  get_closest_word(model_rev, idx_to_word_rev, word_to_idx_rev[word])
  print('\n')

hotel:48864
['resort', 'place', 'property', 'thought', 'rooms']


good:16986
['great', 'fine', 'excellent', 'better', 'average']


stay:15158
['staying', 'return', 'come', 'actually', 'visit']


hair:339
['specific', 'dark', 'missing', 'furniture', 'fact']


exceptional:380
['outstanding', 'excellent', 'personal', 'nice', 'superb']


imagine:355
['believe', 'afford', 'remember', 'comment', 'say']


emotions:3
['lobbyentertainmentfood', 'disapointment', 'rolls', 'clips', 'ap']


overpaying:2
['continually', 'refurbished', 'whats', 'goodnatured', 'gluten']


shocks:3
['washrooms', 'walls', 'restraunt', 'permanently', 'border']




# ***Corpus: Scifi***

---


# **Preprocess and load the data of scifi**
**Preprocess and load data**

In [4]:
# load scifi dataset
scifi = Text2DataSet('scifi.txt',content_type='scifi',window_size=WINDOW_SIZE)

**get variables needed for model training**

In [5]:
vocab_size_scifi = scifi.vocab_size
word_to_idx_scifi = scifi.word_to_idx
idx_to_word_scifi = scifi.idx_to_word
print(vocab_size_scifi)

200808


# **Train model on scifi**

In [11]:
# train model on scifi dataset
model_scifi = train_cbow(scifi, vocab_size_scifi, word_to_idx_scifi, EMBEDDING_SIZE, WINDOW_SIZE)

100%|██████████| 60110/60110 [25:32<00:00, 39.21it/s]

1/1 average loss 6.7511





# **Search words in different frequencies in corpus scifi**

In [7]:
# get the statistics of word counts
scifi_words_dict = scifi.count_word
pd.DataFrame(scifi_words_dict.values()).describe()

Unnamed: 0,0
count,200808.0
mean,76.630508
std,3077.862353
min,1.0
25%,1.0
50%,1.0
75%,5.0
max,883836.0


In [8]:
# looking for words with different frequencies
high_sci, low_sci, rare_sci = search_words_in_different_frequencies(scifi_words_dict,high_bar= 10000,
                                                          median_bar=[500,800], low_bar=5)
print("high frequency:", high_sci)
print("low frequency:", low_sci)
print("rare:", rare_sci)

high frequency: ['', 'all', 'new', 'and', 'if', 'is', 'by', 'no', 'for', 'as', 'at', 'in', 'us', 'of', 'this', 'are', 'any', 'to', 'a', 'with', 'the', 'i', 'was', 'after', 'much', 'thought', 'because', 'its', 'on', 'it', 'will', 'be', 'that', 'just', 'had', 'into', 'we', 'have', 'very', 'people', 'which', 'them', 'their', 'now', 'one', 'our', 'why', 'you', 'say', 'would', 'not', 'but', 'get', 'from', 'or', 'think', 'some', 'could', 'then', 'so', 'has', 'dont', 'than', 'other', 'like', 'own', 'way', 'what', 'make', 'while', 'still', 'must', 'an', 'who', 'when', 'they', 'do', 'over', 'his', 'he', 'without', 'around', 'good', 'old', 'there', 'only', 'thats', 'want', 'out', 'little', 'been', 'last', 'nothing', 'more', 'time', 'too', 'years', 'go', 'look', 'back', 'your', 'come', 'him', 'eyes', 'looked', 'made', 'away', 'man', 'up', 'face', 'three', 'men', 'were', 'knew', 'room', 'said', 'voice', 'off', 'before', 'went', 'head', 'turned', 'came', 'again', 'im', 'these', 'first', 'asked', 'w

# **Predict top 5 closest words by model trained on scifi**

In [12]:
# top 5 closest words predicted by model trained on scifi
chosen_scifi_words = ['hand','good','know','cup','amazing','aid','timetables','thankless','trys']
for word in chosen_scifi_words:
  print("{}:{}".format(word, scifi_words_dict[word]))
  get_closest_word(model_scifi, idx_to_word_scifi, word_to_idx_scifi[word])
  print('\n')

hand:10996
['shoulder', 'hands', 'finger', 'heart', 'chin']


good:15435
['fine', 'bad', 'great', 'nice', 'small']


know:28539
['mean', 'understand', 'think', 'say', 'remember']


cup:596
['pack', 'stream', 'bag', 'bottle', 'flare']


amazing:528
['impossible', 'possible', 'imperfect', 'rapid', 'abrupt']


aid:679
['violence', 'victory', 'satisfaction', 'fear', 'fugue']


timetables:1
['ally', 'absurdly', 'intending', 'rousseau', 'tickets']


thankless:4
['ecological', 'quantum', 'highpower', 'screwing', 'antiaircraft']


trys:2
['gibson', 'kaifri', 'everything', 'vix', 'resigned']




# **Looking for common words in the two corpus**

In [18]:
# looking for common words in scifi and reviews
#common_words = []
#for word in set(high_rev).intersection(set(high_sci)):
  #common_words.append(word)
#print(common_words)

word_dic_rev = set(reviews.count_word)
word_dic_scifi = set(scifi.count_word)
print("Common words and their frequency in scifi and tripadvisor hotel reviews:")
common_words =[]
for word in word_dic_rev.intersection(word_dic_scifi):
  if word!='' and reviews.count_word[word]>5000 and scifi.count_word[word]>5000:
    common_words.append([word, reviews.count_word[word], scifi.count_word[word]])
print(common_words)


Common words and their frequency in scifi and tripadvisor hotel reviews:
[['really', 7765, 7108], ['room', 34324, 10662], ['did', 13895, 22897], ['small', 6527, 8346], ['night', 9729, 6149], ['not', 31526, 70746], ['day', 9542, 8679], ['like', 8114, 35710], ['good', 16986, 15435], ['no', 11620, 50173], ['place', 7693, 8730], ['great', 21094, 8308], ['time', 9824, 31971], ['people', 6762, 13190], ['just', 12592, 27961], ['little', 6242, 17199], ['got', 6186, 17900]]


# **Predict top 3 closest words for certain common words by the two models**

In [20]:
# choose common words from both files, predicted closest words by the two models
chosen_common_words = ['day','place','good']
print('----Predicted by model trained on <scifi.txt>----')
for word in chosen_common_words:
  print("{}:{}".format(word, scifi_words_dict[word]))
  get_closest_word(model_scifi, idx_to_word_scifi, word_to_idx_scifi[word],topn=3)
  print('\n')

print('----Predicted by model trained on <tripadvisor_hotel_reviews.txt>----')
for word in chosen_common_words:
  print("{}:{}".format(word, rev_words_dict[word]))
  get_closest_word(model_rev, idx_to_word_rev, word_to_idx_rev[word], topn=3)
  print('\n')

----Predicted by model trained on <scifi.txt>----
day:8679
['time', 'night', 'world']


place:8730
['world', 'country', 'town']


good:15435
['fine', 'bad', 'great']


----Predicted by model trained on <tripadvisor_hotel_reviews.txt>----
day:9542
['everyday', 'week', 'days']


place:7693
['hotel', 'resort', 'overall']


good:16986
['great', 'fine', 'excellent']


