# Prepare

In [4]:
import numpy as np 
import re
import os
from tqdm import tqdm_notebook, tqdm
import itertools
import pickle
from collections import Counter
import string
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from glove import Corpus, Glove
from sklearn.feature_extraction.text import CountVectorizer
import io

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Cornell Movie--Dialogs Corpus

## Extract text 

( Along with ids for future usage)

In [0]:
!unzip cornell_movie_dialogs_corpus.zip

Archive:  cornell_movie_dialogs_corpus.zip
   creating: cornell movie-dialogs corpus/
  inflating: cornell movie-dialogs corpus/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/cornell movie-dialogs corpus/
  inflating: __MACOSX/cornell movie-dialogs corpus/._.DS_Store  
  inflating: cornell movie-dialogs corpus/chameleons.pdf  
  inflating: __MACOSX/cornell movie-dialogs corpus/._chameleons.pdf  
  inflating: cornell movie-dialogs corpus/movie_characters_metadata.txt  
  inflating: cornell movie-dialogs corpus/movie_conversations.txt  
  inflating: cornell movie-dialogs corpus/movie_lines.txt  
  inflating: cornell movie-dialogs corpus/movie_titles_metadata.txt  
  inflating: cornell movie-dialogs corpus/raw_script_urls.txt  
  inflating: cornell movie-dialogs corpus/README.txt  
  inflating: __MACOSX/cornell movie-dialogs corpus/._README.txt  


In [0]:
cornell_movie_dialogs = os.path.join(os.getcwd(), 'cornell movie-dialogs corpus')
dialog_files = [os.path.join(cornell_movie_dialogs, path) for path in os.listdir(cornell_movie_dialogs)]
dialog_files

['/content/drive/My Drive/Cinnamon/a6/cornell movie-dialogs corpus/movie_conversations.txt',
 '/content/drive/My Drive/Cinnamon/a6/cornell movie-dialogs corpus/movie_titles_metadata.txt',
 '/content/drive/My Drive/Cinnamon/a6/cornell movie-dialogs corpus/movie_lines.txt',
 '/content/drive/My Drive/Cinnamon/a6/cornell movie-dialogs corpus/movie_characters_metadata.txt',
 '/content/drive/My Drive/Cinnamon/a6/cornell movie-dialogs corpus/raw_script_urls.txt',
 '/content/drive/My Drive/Cinnamon/a6/cornell movie-dialogs corpus/chameleons.pdf',
 '/content/drive/My Drive/Cinnamon/a6/cornell movie-dialogs corpus/.DS_Store',
 '/content/drive/My Drive/Cinnamon/a6/cornell movie-dialogs corpus/README.txt']

In [0]:
with open('./cornell movie-dialogs corpus/movie_lines.txt', 'r', encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()
len(lines)

304713

In [0]:
id2text = {}

for line in tqdm_notebook(lines):
    terms = line.split('+++$+++')
    ids = terms[0].strip()
    text = terms[-1].strip()
    id2text[ids] = text

HBox(children=(IntProgress(value=0, max=304713), HTML(value='')))




In [0]:
with open('./cornell movie-dialogs corpus/movie_conversations.txt', 'r', encoding='utf-8', errors='ignore') as f:
    #lines = f.read().split('\n')
    lines = f.readlines()
len(lines)

83097

In [0]:
conversations = []

for line in tqdm_notebook(lines[::-1]):
    terms = line.split('+++$+++')
    ids = literal_eval(terms[-1].strip())
    conversations.append([id2text[id_] for id_ in ids])

HBox(children=(IntProgress(value=0, max=83097), HTML(value='')))




## Build corpus

In [0]:
dialog = list(itertools.chain.from_iterable(conversations))
pickle.dump(dialog, open('./data/dialog.pkl', 'wb'))

# Wikidump

## Download and extract

In [0]:
! wget https://dumps.wikimedia.org/enwiki/20200201/enwiki-20200201-pages-articles-multistream25.xml-p35452817p36952817.bz2
! python WikiExtractor.py enwiki-20200201-pages-articles-multistream25.xml-p35452817p36952817.bz2 --processes 4 -o ./data2/ --json

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO: 36898268	The Cardboard Village
INFO: 36898279	SMS Albatross (1871)
INFO: 36898280	Norwegian Centre for Research Data
INFO: 36898281	Self-medication (disambiguation)
INFO: 36898293	Kyrtis MacKenzie
INFO: 36898294	2012 Shanghai Challenger – Doubles
INFO: 36898300	Annanur railway station
INFO: 36898307	Gorgie City Farm
INFO: 36898315	Mr. Morgan's Last Love
INFO: 36898325	Global Partnership Against the Spread of Weapons and Materials of Mass Destruction
INFO: 36898331	Heavenly Body (film)
INFO: 36898344	Elin Rosseland
INFO: 36898351	Meir Javedanfar
INFO: 36898361	Marcel Di Domenico
INFO: 36898363	Sergey Punko
INFO: 36898367	1912 All England Badminton Championships
INFO: 36898372	South African women's cricket team in the Netherlands in 2007
INFO: 36898379	Jayati Bhatia
INFO: 36898393	1913 All England Badminton Championships
INFO: 36898414	Koenders
INFO: 36898395	Eighteen History Books of Jin
INFO: 36898425	Peter Strickla

In [0]:
wiki_data = []

wiki_folder = os.path.join(os.getcwd(), 'wiki_data')

for folder_path in tqdm_notebook([os.path.join(wiki_folder, folder_name) for folder_name in os.listdir(wiki_folder)]):
    for text_file in [os.path.join(folder_path, file_name) for file_name in os.listdir(folder_path)]:
        with open(text_file, 'r') as f:
            contents = f.read()
            contents = re.sub('(\s)+', r'\1', contents)
            contents = contents.split('\n')
            for content in contents:
                try:
                    content = eval(content)
                except:
                    continue
                text =  ' '. join(content['text'].split('\n'))
                wiki_data.append(text)

HBox(children=(IntProgress(value=0, max=14), HTML(value='')))




## Build corpus

In [0]:
pickle.dump(wiki_data, open('./data/wiki.pkl', 'wb'))

# Preprocess 

In [0]:
dialog = pickle.load(open('./data/dialog.pkl', 'rb'))
wiki = pickle.load(open('./data/wiki1.pkl', 'rb'))

wiki[0]

'Pritam Singh (gymnast)  Pritam Singh (born 1924) was an Indian gymnast. He competed in seven events at the 1956 Summer Olympics. '

In [0]:
wiki.extend(dialog)
len(wiki)

517824

In [0]:
def preprocess(documents, min_tf=3):
    
    def get_text(docs):
        print('Getting started...')
        text = ' '.join(docs)
        return text.lower()               

    def denoise(text, min_tf):
        print('Denoising...')
        count = Counter(text)
        noise_chars = [char for char in count.keys() if count[char]<min_tf]
        print('Some noise characters', noise_chars[:5])
        noise_chars = ''.join(noise_chars)
        text = text.translate(str.maketrans('', '', noise_chars))          # Remove noisy characters
        return text

    def tokenize(text):
        sentences = re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)             # sentence tokenize
        sentences = [re.findall('[a-zA-z]+', sentence) for sentence in tqdm(sentences, desc='Tokenizing')]        
        return sentences 

    def lemmatize(tokens):
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tqdm(tokens, desc='Lematizing')]          
        return tokens
    

    corpus = get_text(documents)        
    corpus = denoise(corpus, min_tf)
    words = tokenize(corpus)
    words = lemmatize(words)

    return words

words = preprocess(wiki)

In [0]:
pickle.dump(words, open('words.pkl', 'wb'))

In [0]:
words = pickle.load(open('words.pkl', 'rb'))

# Implement word embedding

## Bag of words

In [0]:
sentences = [' '.join(word) for word in words]

In [9]:
model = CountVectorizer()
model.fit(sentences)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [13]:
model.transform(['i am twelve years old'])

<1x591916 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [0]:
pickle.dump(model, open('./model/Bag_of_words.pkl', 'wb'))

## Skip Gram

### Training

In [0]:
model = Word2Vec(words, size=150, window=10, min_count=2, workers=4, sg=0)
model.wv.save('./model/skip_gram.model')

In [0]:
#model.save('./model/skip_gram.bin')
model = Word2Vec.load('./model/skip_gram.bin')

In [0]:
out_v = io.open('./tensorboard/vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('./tensorboard/meta.tsv', 'w', encoding='utf-8')

for word in tqdm(model.wv.vocab):
  vec = model.wv.get_vector(word)
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  
out_v.close()
out_m.close()

### Tensorboard visualization

![Picture1](https://user-images.githubusercontent.com/52401767/75088010-e2a3a280-5579-11ea-8945-9654fe9ca7ed.png)

'beautiful' with 5 nearest neighbors (cosine similarity)

## Glove

### Training

In [0]:
corpus = Corpus() 
corpus.fit(words, window=10)

In [0]:
glove = Glove(no_components=150, learning_rate=0.05)
 
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('./model/glove.model')

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [0]:
out_v = io.open('./tensorboard/glove_vec.tsv', 'w', encoding='utf-8')
out_m = io.open('./tensorboard/glove_word.tsv', 'w', encoding='utf-8')

for word, index in tqdm(glove.dictionary.items()):
  vec = glove.word_vectors[index]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  
out_v.close()
out_m.close()

100%|██████████| 598707/598707 [01:43<00:00, 5808.94it/s]


### Tensorboard visualization
![Picture1](https://user-images.githubusercontent.com/52401767/75092441-14ccf880-55aa-11ea-9992-cf02a1464c9b.png)

'beautiful' with 10 nearest neighbors (cosine similarity)
