<a href="https://colab.research.google.com/github/JishnuJayaraj/ML/blob/master/NLP/Word2vec/Word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word2Vec

[Gensim Word2Vec tutorial (Kaggle)](https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial)

## Cleaning samples

In [None]:
# Build a normalised copy of every document in df['text'].
# The Series is iterated directly so rows are visited by position; the former
# `df['text'][w]` lookup with `w in range(len(df))` was label-based and raised
# KeyError (or picked other rows) whenever the index was not 0..n-1, e.g.
# after `sample()`.
clean_txt = []
for raw_text in df['text']:
    desc = raw_text.lower()

    # replace every non-letter character with a space
    desc = re.sub('[^a-zA-Z]', ' ', desc)

    # remove tags
    # NOTE(review): '&' and ';' were already replaced by spaces above, so this
    # pattern can no longer match; move it before the previous step if tag
    # removal is really wanted.
    desc = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", desc)

    # collapse runs of digits / non-word characters into a single space
    desc = re.sub("(\\d|\\W)+", " ", desc)
    clean_txt.append(desc)
df['clean'] = clean_txt
df.head()

# tokenization: split every cleaned document on single spaces
corpus = [document.split(" ") for document in df.clean]
# show first value
corpus[0:1]


## -----------------------------------------------------------------------
# http://ethen8181.github.io/machine-learning/deep_learning/word2vec/word2vec_detailed.html


def export_unigrams(unigram_path, texts, stop_words):
    """
    Normalize every raw document and write the results to a .txt file,
    one document per line. The normalization itself is delegated to
    the `normalize_text` function.

    Parameters
    ----------
    unigram_path : str
        path of the output file that receives the preprocessed text.

    texts : iterable
        the raw documents. A plain list works; for larger corpora
        consider an iterable that streams the documents from
        disk/network, e.g. Gensim's LineSentence.

    stop_words : set
        words that are dropped from every document.
    """
    with open(unigram_path, 'w', encoding='utf_8') as out_file:
        out_file.writelines(
            normalize_text(raw_text, stop_words) + '\n' for raw_text in texts
        )


def normalize_text(text, stop_words):
    """
    Strip non-letters, lower-case, and drop stop words from a document.

    Parameters
    ----------
    text : str
        raw document.

    stop_words : set
        tokens that are removed from the result.

    Returns
    -------
    str
        the remaining lower-cased tokens joined by single spaces.
    """
    # remove everything except ASCII letters and whitespace.
    # The flags MUST be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so the previous positional `re.I | re.A` limited the
    # call to the first 258 replacements and applied no flags at all.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)

    # lower case & tokenize text
    tokens = re.split(r'\s+', text.lower().strip())

    # filter stopwords out of text &
    # re-create text from filtered tokens
    return ' '.join(token for token in tokens if token not in stop_words)

# a set of stopwords built-in to various packages
# we can always expand this set for the
# problem that we are working on, here we also included
# python built-in string punctuation mark
STOPWORDS = set(stopwords.words('english')) | set(punctuation) | set(ENGLISH_STOP_WORDS)

# create a directory called 'model' to
# store all outputs in later section.
# makedirs(exist_ok=True) replaces the check-then-mkdir pair: it does not
# fail when the directory already exists.
MODEL_DIR = 'model'
os.makedirs(MODEL_DIR, exist_ok=True)

# preprocess the raw corpus once; the exported file acts as a cache
UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
if not os.path.exists(UNIGRAM_PATH):
    start = time()
    export_unigrams(UNIGRAM_PATH, texts=newsgroups_train.data, stop_words=STOPWORDS)
    elapse = time() - start
    print('text preprocessing, elapse', elapse)

# load the phrase model from its checkpoint, or train and save it
PHRASE_MODEL_CHECKPOINT = os.path.join(MODEL_DIR, 'phrase_model')
if os.path.exists(PHRASE_MODEL_CHECKPOINT):
    phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
else:
    # use LineSentence to stream text as opposed to
    # loading it all into memory
    unigram_sentences = LineSentence(UNIGRAM_PATH)
    start = time()
    phrase_model = Phrases(unigram_sentences)
    elapse = time() - start
    print('training phrase model, elapse', elapse)
    phrase_model.save(PHRASE_MODEL_CHECKPOINT)

def export_bigrams(unigram_path, bigram_path, phrase_model):
    """
    Use the learned phrase model to create (potential) bigrams,
    and output the text that contains bigrams to disk

    Parameters
    ----------
    unigram_path : str
        input file path of the preprocessed unigram text
        (one document per line)

    bigram_path : str
        output file path of the transformed bigram text

    phrase_model : gensim's Phrase model object

    References
    ----------
    Gensim Phrase Detection
    - https://radimrehurek.com/gensim/models/phrases.html
    """

    # after training the Phrase model, create a performant
    # Phraser object to transform any sentence (list of
    # token strings) and glue unigrams together into bigrams
    phraser = Phraser(phrase_model)

    # both files are opened as utf_8 explicitly: export_unigrams writes the
    # unigram file with that encoding, and the platform default may differ
    with open(bigram_path, 'w', encoding='utf_8') as fout, \
            open(unigram_path, encoding='utf_8') as fin:
        for text in fin:
            unigram = text.split()
            bigram = phraser[unigram]
            fout.write(' '.join(bigram) + '\n')

# Export the bigram-transformed corpus once; the work is skipped when the
# output file already exists.
BIGRAM_PATH = os.path.join(MODEL_DIR, 'bigram.txt')
if not os.path.exists(BIGRAM_PATH):
    start = time()
    export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)
    elapse = time() - start
    print('converting words to phrases, elapse', elapse)

# Train Word2Vec directly from the bigram text file, using as many worker
# threads as cpu_count() reports.
word2vec = Word2Vec(corpus_file=BIGRAM_PATH, workers=cpu_count())

## load data from drive

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

# orig = pd.read_pickle('/content/drive/My Drive/RokinData/newOnly.pkl')
orig = pd.read_json('/content/drive/My Drive/RokinData/ToBeCleaned.json.gz')

# Work on a random subset of at most 50000 rows. The size is capped at
# len(orig) because sample() without replacement raises ValueError when more
# rows are requested than exist.
df = orig.sample(min(50000, len(orig)))
del orig  # release the full frame, only the sample is used below

print('length of df is :', len(df))
df.isnull().sum()

## creating custom .txt files from it

In [None]:
# Split df by (year, month) of the 'date' column and write the text of one
# month to a txt file, one article per line.

dfs = dict(tuple(df.groupby([df['date'].dt.year, df['date'].dt.month])))
len(dfs[(2020, 6)])


# --------------------------> select month & Year here <------------------------
# .copy() gives an independent frame: the previous in-place replace on a
# column of the groupby slice was a chained assignment that pandas may apply
# to a temporary (SettingWithCopyWarning) and therefore silently lose.
a = dfs[(2020, 6)].copy()

# collapse every whitespace run (including newlines) into a single space so
# that each article occupies exactly one line of the output file
a['text'] = a['text'].replace(r'\s+|\n', ' ', regex=True)

# a['text'].to_csv(r'/content/drive/My Drive/RokinData/word2vec/3.txt', header=None, index=None, sep=' ', mode='a')
with open('/content/drive/My Drive/RokinData/word2vec/output.txt', 'w') as f:
    f.write(a['text'].str.cat(sep='\n'))

## iterator for folder

In [None]:
# go thr all files in folder
import os

class WordTrainer(object):
    """Re-iterable stream of tokenised lines from every file in a directory.

    Each iteration lists `dir_name` again and yields one token list per line,
    so the object can be passed to consumers that iterate over the corpus
    more than once.
    """

    def __init__(self, dir_name):
        # directory whose files are read line by line
        self.dir_name = dir_name

    def __iter__(self):
        for file_name in os.listdir(self.dir_name):   # go through each file
            # `with` closes every file as soon as it has been consumed
            with open(os.path.join(self.dir_name, file_name), 'r') as handle:
                for line in handle:
                    # the token list was previously computed but never
                    # yielded, so __iter__ returned None instead of an iterator
                    yield simple_preprocess(line, deacc=True)

# Stream the corpus from the folder and train a skip-gram (sg=1) model on it.
articles1 = WordTrainer('/content/drive/My Drive/RokinData/word2vec')
# the iterator is named `articles1`; the former `articles` was undefined
model = Word2Vec(articles1, min_count=1,size= 50,workers=3, window =3, sg = 1)
#  word_vector_model = gensim.models.Word2Vec(articles1, size=100, window=8, min_count=5)

## iterator for file

In [None]:
# go thr give file
import gensim
from gensim.utils import simple_preprocess

class SentenceIterator:
    """Re-iterable stream of tokenised lines from a single text file."""

    def __init__(self, filepath):
        # path of the text file, one sentence/document per line
        self.filepath = filepath

    def __iter__(self):
        # the file is reopened on every iteration, so the object can be
        # iterated several times; `with` guarantees the handle is closed
        # (the bare open() in the for-header was never closed)
        with open(self.filepath) as handle:
            for line in handle:
                # yield line.split()     # add code here to make list of list
                yield simple_preprocess(line, deacc=True)

# define model (untrained, 100-dimensional vectors)
model = gensim.models.Word2Vec(size=100)

# build the vocabulary from the first file, then train for 3 epochs on it
sentences = SentenceIterator('/content/drive/My Drive/RokinData/word2vec/sample/1.txt')
model.build_vocab(sentences)
model.train(sentences,epochs=3,total_examples=model.corpus_count)

In [None]:
model.wv.most_similar(positive='study')

# Snapshot of the vocabulary after the first training round. A copy is taken
# because a plain reference to model.wv.vocab would also show any words added
# by a later build_vocab(update=True), making a before/after comparison empty.
list1 = dict(model.wv.vocab)
model.wv.vectors.shape


In [None]:
# train on the 2nd article set
sentences2 = SentenceIterator('/content/drive/My Drive/RokinData/word2vec/sample/2.txt')

# update=True extends the existing vocabulary instead of rebuilding it
model.build_vocab(sentences2, update=True)

model.train(sentences2,epochs=3,total_examples=model.corpus_count)

In [None]:
# vocabulary after the second training round, and the vector matrix shape
list2 = model.wv.vocab
model.wv.vectors.shape

In [None]:
# difference of 2 vocabularies:
# every entry of list2 whose key does not occur in list1
value = {key: entry for key, entry in list2.items() if key not in list1}

In [None]:
# difference of 2 vocabularies:
# every entry of list2 whose key does not occur in list1
value = {key: entry for key, entry in list2.items() if key not in list1}

## Preprocessing

In [None]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [None]:
nlp = spacy.load('en', disable=['ner', 'parser']) # disabling Named Entity Recognition and the parser for speed

def cleaning(doc):
    """Lemmatize `doc` and drop its stop words.

    `doc` is iterated token by token; every token must expose `lemma_` and
    `is_stop` (a spaCy Doc does).

    Returns the remaining lemmas joined by single spaces, or None when two
    or fewer lemmas are left: Word2Vec learns from context words, so such
    short sentences add very little to the training.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Lazily replace every run of characters other than letters and apostrophes
# with one space and lower-case the result (generator: nothing is computed
# until it is consumed).
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['text'])
# [^a-zA-Z0-9\u00E4\u00F6\u00FC\u00C4\u00D6\u00DC\u00df]   to include German characters
# \u00F0-\u02AF             all characters from European languages

In [None]:
t = time()

# Run the spaCy pipeline over the pre-cleaned texts in batches of 500 and
# lemmatize each resulting Doc; entries are None for documents that
# cleaning() rejects as too short.
# NOTE(review): n_threads looks like a legacy spaCy argument - confirm it is
# still accepted by the installed version.
%time txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=500, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Collect the cleaned texts in a one-column DataFrame, then discard the
# missing values and the duplicated rows.
cleaned_frame = pd.DataFrame({'clean': txt})
df_clean = cleaned_frame.dropna().drop_duplicates()
df_clean.shape

In [None]:
# Free the raw DataFrame and the intermediate list of cleaned strings
del df
del txt

from gensim.models.phrases import Phrases, Phraser

#As Phrases() takes a list of list of words as input:
sent = [row.split() for row in df_clean['clean']]

#Creates the relevant phrases from the list of sentences
%time phrases = Phrases(sent, min_count=30, progress_per=10000)

# Play a short beep in the browser to signal that the cell has finished
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [None]:
# Transform the corpus based on the bigrams detected
# (indexing the Phrases model with the whole list of token lists)
sentences = phrases[sent]

# sentences[:2]

In [None]:
# Count how often every token (unigram or detected bigram) occurs.
# The loop variables are deliberately not called `sent`: that name holds the
# list of token lists that `phrases[sent]` is built from, and reusing it here
# replaced the corpus with its last sentence after the loop.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
len(word_freq)

In [None]:
# the 10 most frequent tokens, most frequent first
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

## Training model

In [None]:
# Training Model
import multiprocessing

from gensim.models import Word2Vec
cores = multiprocessing.cpu_count() # Count the number of cores in a computer


# add seed= 12345, sg=1 for skip gram,               :: an empty model, no training yet
w2v_model = Word2Vec(min_count=20,
                     window=2,            # The maximum distance between the target word and its neighboring word
                     size=300,            # The size of the dense vector to represent each token, Bigger size values require more training data, but can lead to better (more accurate) models
                     sample=6e-5,         # lower value discards more frequently occurring words
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,         # Negative Sampling: training sample to update only a small percentage of the model's weights
                     workers=max(1, cores - 1))  # leave one core free, but never request 0 worker threads (single-core machine)

In [None]:
t = time()

# Vocab table: simply digesting all the words and filtering out the unique words, and doing some basic counts on them
%time w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

# Play a short beep in the browser to signal that the cell has finished
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [None]:
# train model for 30 epochs on the bigram-transformed corpus;
# total_examples is the sentence count recorded by build_vocab
t = time()

%time w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

# Play a short beep in the browser to signal that the cell has finished
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

## Load/ Save model

If you no longer need the full model state (i.e. you do not plan to continue training), the model can be discarded. Keep only the keyed vectors (`model.wv`) instead, which are faster to use.

- It is impossible to continue training the vectors loaded from the C format because the hidden weights, vocabulary frequencies and the binary tree are missing. To continue training, you’ll need the full Word2Vec object state, as stored by save(), not just the KeyedVectors.

Example code for loading keyed vectors
```
from gensim.models import KeyedVectors

model.wv.save("model.wv")
wv = KeyedVectors.load("model.wv", mmap='r')
vector = wv['computer']
```

Example code for loading text/bin format


```
wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False)  # C text format

wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True)  # C bin format
```

In [None]:
# -----------> saving
# w2v_model.save("/content/drive/My Drive/RokinData/models/word2vec_50k.model")


In [None]:
# ------------> loading
from gensim.models import Word2Vec
w2v_model = Word2Vec.load("/content/drive/My Drive/RokinData/models/word2vec_50k.model")
# this model can be used to continue training

# also, trained word vectors are stored in a KeyedVectors instance in model.wv:

# number of words in the vocabulary
len(w2v_model.wv.vocab)

In [None]:
# We do not plan to train the model any further, so we call init_sims(), which will make the model much more memory-efficient:
w2v_model.init_sims(replace=True)

In [None]:
# is the word known to the model?
word = 'covid'
if word in w2v_model.wv.vocab:
    print(word)

# corpus frequency of a single word
vocab_obj = w2v_model.wv.vocab["machine"]
vocab_obj.count

# word -> corpus frequency for the whole vocabulary
w2c = {item: entry.count for item, entry in w2v_model.wv.vocab.items()}
print(w2c)
# the same mapping, ordered from most to least frequent
w2cSorted = dict(sorted(w2c.items(), key=lambda pair: pair[1], reverse=True))
print(w2cSorted)

In [None]:
# nearest neighbours of "covid" in the embedding space
w2v_model.wv.most_similar(positive=["covid"])

In [None]:
# the 6 nearest neighbours of "robot"
w2v_model.wv.most_similar(positive=["robot"], topn = 6)

## Playing with model

In [None]:
# TRY this
# One row per vocabulary word, one column per embedding dimension.
# Vectors and row labels must come from the same model: the index was
# previously taken from `word2vec`, a different model than `w2v_model`.
word_vectors = pd.DataFrame(w2v_model.wv.vectors, index=w2v_model.wv.index2word)
print('word vector dimension: ', word_vectors.shape)
word_vectors.head()



In [None]:
# list of all words known to the model in model.wv.index2word
import random
# get a random key, w2v_model.wv.index2word
# (the closing parenthesis of print() was missing)
print(random.choice(model.wv.index2entity))
# get the vector of a random key, w2v_model.wv[w2v_model.wv.index2word]
# (the brackets were unbalanced)
print(model.wv[random.choice(model.wv.index2entity)])


# Get a list of words in the vocabulary
words = model.wv.vocab.keys()
# Make a dictionary word -> vector
we_dict = {word: model.wv[word] for word in words}

# get vectors, model[word]
model.wv.vectors
# list of words in the same order as the rows of model.wv.vectors;
# accessed through .wv like every other lookup in this cell
model.wv.index2word

In [None]:
# look for words: print the first 10 components of the vector, if known
if 'country' in my_model:
    print(my_model['country'][0:10])

#or
try:
    print(my_model['country'][0:10])
except KeyError:
    # only the "unknown word" case is ignored; a bare `except:` would also
    # hide unrelated errors (and KeyboardInterrupt)
    pass

In [None]:
# nearest neighbours of "beer"
w2v_model.wv.most_similar(positive=["beer"])
# the word that fits least with the others
w2v_model.wv.doesnt_match("apple microsoft samsung tesla".split())
# similarity of two words (note: this query uses `model`, not `w2v_model`)
model.wv.similarity(w1="dirty",w2="smelly")

# Keyword Extraction

In [None]:
import re
import numpy
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from numpy import *

import nltk
nltk.download('stopwords')

# read the whole document whose keywords are to be extracted
with open("/content/drive/My Drive/RokinData/word2vec/b.txt") as file:
    text_review = file.read()

#if you want to use Google original vectors from Google News corpora
# model = word2vec.Word2Vec.load_word2vec_format('/Users/Downloads/GoogleNews-vectors-negative300.bin', binary=True)
#if you want to use your own vector
model = Word2Vec.load("/content/drive/My Drive/RokinData/models/word2vec_50k.model")

def text_to_wordlist(text, remove_stopwords=True):
    """Turn a raw text into a list of lower-cased words.

    Parameters
    ----------
    text : str
        raw document.
    remove_stopwords : bool
        when True (default) the NLTK English stop words are dropped.
        The flag used to be ignored and stop words were always removed.

    Returns
    -------
    list of str
        the words in their original order (duplicates are kept).
    """
    # Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", text)

    # Convert words to lower case and split them
    words = review_text.lower().split()
    if not remove_stopwords:
        return words

    # a set gives O(1) membership tests while filtering
    stops = set(stopwords.words('english'))
    return [w for w in words if w not in stops]


# Function to get feature vec of words
def get_feature_vec(words, model):
    """Return the vectors of all `words` that the model knows.

    Parameters
    ----------
    words : iterable of str
    model : object exposing `wv.index2word` and `wv[word]`
        (a gensim Word2Vec model in this notebook).

    Returns
    -------
    list
        one vector per known word, in the order of `words`; unknown words
        are skipped, so the result can be shorter than `words`.
    """
    keyed_vectors = model.wv
    # index2word is a list with the names of the words in the model's
    # vocabulary; convert it to a set for speed
    known_words = set(keyed_vectors.index2word)
    # vectors are read through model.wv, like the vocabulary above; the
    # former direct model[word] access is deprecated
    return [keyed_vectors[word] for word in words if word in known_words]


# bag of word list without stopwords
clean_train_text = text_to_wordlist(text_review, remove_stopwords=True)

# keep every word once, in order of first appearance.
# dict.fromkeys de-duplicates in O(n); the former loop tested membership in a
# growing list (O(n^2)) and contained a dead `words = +1` assignment.
clean_train = list(dict.fromkeys(clean_train_text))

# vectors of the unique words the model knows (unknown words are skipped)
trainDataVecs = get_feature_vec(clean_train, model)
trainData = numpy.asarray(trainDataVecs)

# Cosine-similarity matrix of the word vectors, used as the graph for PageRank.
# Alternative for dense matrices (not fast for sparse ones):
# sim_matrix = 1-pairwise_distances(trainData, metric="cosine")

# matrix of dot products between every pair of rows of trainData
# (for a sparse representation use A.dot(A.T).todense() instead)
similarity = numpy.dot(trainData, trainData.T)

# squared magnitude of every row vector (the diagonal holds v.v)
square_mag = numpy.diag(similarity)

# inverse squared magnitude
inv_square_mag = 1 / square_mag

# a zero vector gives a division by zero above; set its inverse magnitude to zero (instead of inf)
inv_square_mag[numpy.isinf(inv_square_mag)] = 0

# inverse of the magnitude
inv_mag = numpy.sqrt(inv_square_mag)

# cosine similarity: scale the columns, then (after transposing) the rows,
# by the inverse magnitudes
cosine = similarity * inv_mag
cosine = cosine.T * inv_mag


# pagerank powermethod
def powerMethod(A, x0, m, iter):
    """Run `iter` power-method steps x <- (1 - m) * A.x + m / n.

    A is the (n x n) matrix, x0 the start vector, m the weight of the
    uniform term. With iter == 0 the start vector is returned unchanged.
    """
    size = A.shape[1]
    uniform_term = m * (numpy.ones(size, dtype='float64') / size)
    keep = 1 - m
    rank = x0
    for _ in range(iter):
        rank = keep * numpy.dot(A, rank) + uniform_term
    return rank


n = cosine.shape[1]  # A is n x n
m = 0.15
x0 = [1] * n

pagerank_values = powerMethod(cosine, x0, m, 130)

# indices of the 10 highest-ranked rows
srt = numpy.argsort(-pagerank_values)
a = srt[0:10]

# Row i of trainData / cosine belongs to the i-th *unique, in-vocabulary* word
# of clean_train, because get_feature_vec skips unknown words. The indices
# must therefore be looked up in that same filtered list; indexing
# clean_train_text (with duplicates and unknown words) returned unrelated words.
known_words = set(model.wv.index2word)
ranked_words = [w for w in clean_train if w in known_words]

keywords_list = [ranked_words[idx] for idx in a]

print(keywords_list)

## Restricting to custom vectors

In [None]:
import numpy as np

def restrict_w2v(w2v, restricted_word_set):
    """Shrink a keyed-vectors object in place to the given words.

    Parameters
    ----------
    w2v : object with `vocab`, `index2entity`, `index2word`, `vectors` and
        `vectors_norm` attributes (gensim 3.x KeyedVectors in this notebook).
        It is modified in place; nothing is returned.
    restricted_word_set : iterable of str
        words to keep. Words unknown to `w2v` are ignored.

    The kept words retain their relative order and each vocab entry's
    `index` is renumbered to its new row. `vectors_norm` stays None when it
    had not been computed (it was previously indexed unconditionally, which
    raised TypeError), and an empty selection yields a (0, dim) matrix.
    """
    # a set makes the membership test O(1) even if a list is passed in
    allowed = set(restricted_word_set)

    kept_rows = []
    new_vocab = {}
    new_index2entity = []
    for i in range(len(w2v.vocab)):
        word = w2v.index2entity[i]
        if word not in allowed:
            continue
        vocab = w2v.vocab[word]
        vocab.index = len(new_index2entity)   # new row number of this word
        new_index2entity.append(word)
        new_vocab[word] = vocab
        kept_rows.append(i)

    rows = np.asarray(kept_rows, dtype=int)
    norms = w2v.vectors_norm   # read before `vectors` is replaced

    w2v.vocab = new_vocab
    w2v.vectors = w2v.vectors[rows]
    w2v.index2entity = np.array(new_index2entity)
    w2v.index2word = np.array(new_index2entity)
    w2v.vectors_norm = None if norms is None else norms[rows]

In [None]:
# Words of interest for the restricted model.
# A comma was missing after 'chip_set', so Python concatenated it with the
# next literal into the single entry 'chip_setamerica'.
# NOTE(review): some entries look misspelled ('cofee', 'vitualization',
# 'severless') or use a space instead of '_' ('quantum technology'); they are
# left untouched here because they may mirror tokens in the corpus - confirm.
words = ['microsoft', 'bing', 'windows','google', 'python', 'scala', 'siemens', 'erlangen','germany','france','spain','tesla','thomas','john','chun','chan','china',
         'internet_security','internet','intel','amd','qualcomm','nvidia','cable','modem','chip_set',
         'america','us','north_america','canada','europe','germany','dortmund','munich','france','italy','aachen','austria','regensburg','bavaria','bremen',
         'electric_car','energy_storage','nissan_leaf','tesla','roadster',
         'stanford_university','university_washington','university_toronto','mit',
         'coronavirus','covid','pandemic','ebola',
         'quantum technology','astrophysics','qubit','quantum computer','electron', 'quantum',
         'industrial_ethernet', 'ethernet_protocol','kirigami','origami','sculpture','architecture','japanese',
         'robotaxi','san_francisco', 'general_motor', 'magnetic_sensor', 'flexible_electronics','organic_electronics','electronic_skin','elastic_surface','active_matrix','artificial_skin',
         'robot','artificial_intelligence', 'workplace_robot','digital_transformation','iot','connected_device','smart_home','industrial_iot','cloud_computing','vitualization','severless',
         'starbucks','cofee','carbon_nanotube','aerospace','aerospace_manufacturing','composite_manufacturing','fuselage', 'aircraft','wing','airplane',
         'honda','fuel_cell','zero_emission','hydrogen_fuel','duty_truck','sustainable_energy',
         'phishing_attack','ransomware','inbox_infiltration','phishing_email','malicious','scam','login_page','attack','fraud']


In [None]:
restricted_word_set = {"beer", "wine", "computer", "python", "bash", "lagers"}
# restricted_word_set = set(words)

# keep only the requested words that the model actually knows
new = [w for w in restricted_word_set if w in w2v_model.wv.vocab]

# shrink the model's vectors to those words, then query it
# NOTE(review): "cofee" is not part of the literal set above - confirm that
# querying a word removed by the restriction is intended here.
restrict_w2v(w2v_model.wv, new)
w2v_model.wv.most_similar(positive=["cofee"])

In [None]:
# restricted_word_set = {"beer", "wine", "computer", "python", "bash", "lagers"}

# Same as the previous cell, but passes the requested set directly instead
# of first filtering it against the model's vocabulary.
restrict_w2v(w2v_model.wv, restricted_word_set)
w2v_model.wv.most_similar(positive=["cofee"])

## Extracting vector for given word

In [None]:
# extracting vectors for only the given words.
# A comma was missing after 'chip_set', so Python concatenated it with the
# next literal into the single entry 'chip_setamerica'.
words = ['microsoft', 'bing', 'windows','google', 'python', 'scala', 'siemens', 'erlangen','germany','france','spain','tesla','thomas','john','chun','chan','china',
         'internet_security','internet','intel','amd','qualcomm','nvidia','cable','modem','chip_set',
         'america','us','north_america','canada','europe','germany','dortmund','munich','france','italy','aachen','austria','regensburg','bavaria','bremen',
         'electric_car','energy_storage','nissan_leaf','tesla','roadster',
         'stanford_university','university_washington','university_toronto','mit',
         'coronavirus','covid','pandemic','ebola',
         'quantum technology','astrophysics','qubit','quantum computer','electron', 'quantum',
         'industrial_ethernet', 'ethernet_protocol','kirigami','origami','sculpture','architecture','japanese',
         'robotaxi','san_francisco', 'general_motor', 'magnetic_sensor', 'flexible_electronics','organic_electronics','electronic_skin','elastic_surface','active_matrix','artificial_skin',
         'robot','artificial_intelligence', 'workplace_robot','digital_transformation','iot','connected_device','smart_home','industrial_iot','cloud_computing','vitualization','severless',
         'starbucks','cofee','carbon_nanotube','aerospace','aerospace_manufacturing','composite_manufacturing','fuselage', 'aircraft','wing','airplane',
         'honda','fuel_cell','zero_emission','hydrogen_fuel','duty_truck','sustainable_energy',
         'phishing_attack','ransomware','inbox_infiltration','phishing_email','malicious','scam','login_page','attack','fraud']
words[:2]

# load model

## Check dimension of word vectors
w2v_model.vector_size

# pass words through this to get their vectors
# also make sure each word is in the vocab of the model

keyed_vectors = w2v_model.wv

# Keep only the words that Word2Vec has a vector for (one vocabulary scan
# instead of two)
words_filtered = [word for word in words if word in keyed_vectors.vocab]

# Vectors of those words, in the same order; read through .wv because the
# direct w2v_model[word] access is deprecated
vector_list = [keyed_vectors[word] for word in words_filtered]

# Zip the words together with their vector representations
word_vec_zip = zip(words_filtered, vector_list)

# Cast to a dict so we can turn it into a DataFrame (one row per word)
word_vec_dict = dict(word_vec_zip)
import pandas as pd
df = pd.DataFrame.from_dict(word_vec_dict, orient='index')

print(df.info())
df.head(3)


In [None]:
words = df.index
words

# df1 = df.rename_axis(None)
# df1: the words become a regular 'index' column
df1 = df.reset_index()
# df2: the vector components only. Dropping the 'index' column directly keeps
# every dimension; the former `iloc[:, 0:300]` counted the 'index' column as
# one of the 300 and so cut off the last component of 300-dimensional vectors
# (and the in-place drop then acted on a slice).
df2 = df1.drop(columns=['index'])

df2

# Clustering

In [None]:
# Word vectors for each word in the vocab, one row per word
# (`vectors` is the same matrix that the deprecated `syn0` alias pointed to)
Z = w2v_model.wv.vectors
print(Z[0].shape)
Z[0]
# model.syn1[model.vocab[word].index]

## Setting word and corresponding vector together

# zip the two lists containing words and vectors. The pairs are materialised
# in a list because a bare zip() is exhausted after the first pass, which left
# the second comprehension below always empty. `w2v_model` replaces the
# undefined `nmodel`.
zipped = list(zip(w2v_model.wv.index2word, Z))

# placeholders: replace with the word/vector you're looking for
word = w2v_model.wv.index2word[0]
vector = Z[0]

# the resulting list contains `(word, wordvector)` tuples. We can extract the
# entry for any `word` or `vector` using a list comprehension:
wordresult = [i for i in zipped if i[0] == word]
# arrays must be compared as a whole; `i[1] == vector` is element-wise and
# cannot be used as a condition
vecresult = [i for i in zipped if np.array_equal(i[1], vector)]

In [None]:
# kmeans
from sklearn.cluster import KMeans
def clustering_on_wordvecs(word_vectors, num_clusters):
    """Cluster the word vectors with k-means (k-means++ initialisation).

    Returns a pair: the cluster centres and, for every input vector, the
    index of the cluster it was assigned to.
    """
    model = KMeans(init='k-means++', n_clusters=num_clusters)
    assignments = model.fit_predict(word_vectors)
    return model.cluster_centers_, assignments

# 50 clusters
centers, clusters = clustering_on_wordvecs(Z, 50)
# word -> id of the cluster it belongs to
word_cluster_pairs = zip(w2v_model.wv.index2word, clusters)
centroid_map = dict(word_cluster_pairs)


In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KDTree
import numpy as np
import pandas as pd
from itertools import cycle
from wordcloud import WordCloud, ImageColorGenerator

#get words in each cluster that are closest to the cluster center.
def get_top_words(index2word, k, centers, wordvecs):
    """List, per cluster, the words closest to the cluster centre.

    Parameters
    ----------
    index2word : sequence of str
        word for every row of `wordvecs`.
    k : int
        number of words wanted per cluster; capped at the number of word
        vectors, since a KDTree query for more neighbours than points fails.
    centers : array-like, one row per cluster centre.
    wordvecs : array-like, one row per word.

    Returns
    -------
    pandas.DataFrame
        one column 'Cluster #i' per centre, rows numbered from 1, nearest
        word first.
    """
    tree = KDTree(wordvecs)
    k = min(k, len(wordvecs))

    # one batched query: row i holds the indices of the k points nearest to
    # centre i, closest first
    _, neighbour_idxs = tree.query(np.asarray(centers), k=k)

    closest_words = {
        'Cluster #' + str(i): [index2word[j] for j in row]
        for i, row in enumerate(neighbour_idxs)
    }

    df = pd.DataFrame(closest_words)
    df.index = df.index + 1   # rank 1 = closest word
    return df

# up to 5000 words per cluster, closest to the centre first
top_words = get_top_words(
    w2v_model.wv.index2word,
    5000,
    centers,
    Z,
)

# Word cloud visualization
def display_cloud(cluster_num, cmap):
    """Draw the word cloud of one cluster and save it as 'cluster_<n>'.

    The words come from the global `top_words` table; `cmap` is the name of
    the matplotlib colormap used for the words.
    """
    column = 'Cluster #' + str(cluster_num)
    cloud_text = ' '.join(top_words[column])
    generator = WordCloud(background_color="black", max_words=2000, max_font_size=80, colormap=cmap)
    wordcloud = generator.generate(cloud_text)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig('cluster_' + str(cluster_num), bbox_inches='tight')

# colormap names, reused cyclically so that every cluster gets one
cmaps = cycle([
            'flag', 'prism', 'ocean', 'gist_earth', 'terrain', 'gist_stern',
            'gnuplot', 'gnuplot2', 'CMRmap', 'cubehelix', 'brg', 'hsv',
            'gist_rainbow', 'rainbow', 'jet', 'nipy_spectral', 'gist_ncar'])
# one word cloud per cluster (50 clusters)
for i, col in zip(range(50), cmaps):
    display_cloud(i, col)


In [None]:
# deleting the png files
!rm /content/cluster_{0..49}.png

In [None]:
# do not truncate long cell contents when displaying DataFrames
pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)

# show the per-cluster word table with Colab's data-table widget
from google.colab import data_table
data_table.DataTable(top_words)
# top_words
# top_words

# Visualization

## Tsne

In [None]:
# t-sne visualization
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def tsne_plot(model):
    """Create a t-SNE model of all word vectors and plot it.

    Every word in `model.wv.vocab` is projected to 2 dimensions and drawn as
    a labelled point. Vectors are read through `model.wv`; the direct
    `model[word]` access is deprecated.
    """
    keyed_vectors = model.wv
    labels = list(keyed_vectors.vocab)
    tokens = [keyed_vectors[word] for word in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # one scatter call per point, so each point gets the next colour of the
    # colour cycle, with its word placed next to it
    for label, value in zip(labels, new_values):
        x, y = value[0], value[1]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

# plot the whole vocabulary of the trained model
tsne_plot(w2v_model)

## Tensorboard

In [None]:
# /content/drive/My Drive/RokinData/models/word2vec.model
# export the vectors in word2vec format; this file is the input of the
# word2vec2tensor conversion in the next cell
w2v_model.wv.save_word2vec_format('/content/logs/myW2V')

In [None]:
!python -m gensim.scripts.word2vec2tensor -i /content/logs/myW2V -o /content/logs/Mymodel 

In [None]:
import io
# ONE LINER
# !python -m gensim.scripts.word2vec2tensor -i ~/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz

w2v = Word2Vec.load("/content/drive/My Drive/RokinData/models/word2vec_50k.model")

# Vector file: `\t` separates the components, `\n` separates the words, e.g.
#   0.1\t0.2\t0.5\t0.9
#   0.2\t0.1\t5.0\t0.2
#   0.4\t0.1\t7.0\t0.8
#
# Meta data file: one word per line, in the same order, e.g.
#   token1
#   token2
#   token3
#
# Both files are opened in one `with` statement so they are flushed and
# closed even if writing fails part-way.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    # Write meta file and vector file; row i of `vectors` belongs to
    # index2word[i]
    for word, vec in zip(w2v.wv.index2word, w2v.wv.vectors):
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")


# Then we can use `http://projector.tensorflow.org/` to visualize those two files.

# 1. Open the Embedding Projector.
# 2. Click on "Load data".
# 3. Upload the two files we created above: vecs.tsv and meta.tsv.

### Experiment: visualize in colab

In [None]:
# test 1
# youtube similar to abv, remaining below
# model = KeyedVectors.load_word2vec_format('/content/drive/My Drive/RokinData/models/word2vec.model')
import os

tsv_file_path = "/content/tensorboard/metadata.tsv"
path = '/content/tensorboard'

model = gensim.models.keyedvectors.KeyedVectors.load('/content/drive/My Drive/RokinData/models/word2vec.model')
# NOTE(review): the "-1" leaves the last vocabulary word out of the
# projection - confirm this is intended.
max_size = len(model.wv.vocab)-1
w2v = np.zeros((max_size,model.layer1_size))

# create the log directory that is actually written to below; the previous
# check used the relative name 'tensorboard', which is a different directory
# unless the working directory happens to be /content
os.makedirs(path, exist_ok=True)

# fill the embedding matrix and write the matching labels, one word per line
with open(tsv_file_path,'w+') as file_metadata:
    for i,word in enumerate(model.wv.index2word[:max_size]):
        w2v[i] = model.wv[word]
        file_metadata.write(word+'\n')

sess = tf.InteractiveSession()

with tf.device("/cpu:0"):
    embedding = tf.Variable(w2v, trainable=False, name='embedding')

tf.global_variables_initializer().run()


saver = tf.train.Saver()

writer = tf.summary.FileWriter(path, sess.graph)

config = projector.ProjectorConfig()
# the repeated field is called `embeddings`; `config.embedding` does not exist
embed = config.embeddings.add()
# take the name from the variable itself so the projector entry and the
# checkpointed tensor cannot drift apart (the literal 'embeddings' did not
# match the variable named 'embedding')
embed.tensor_name = embedding.name
embed.metadata_path = 'metadata.tsv' #/content/tensorboard/metadata.tsv


projector.visualize_embeddings(writer,config)
saver.save(sess,path + '/model.ckpt', global_step=max_size)

# now run in a terminal: tensorboard --logdir='..../tensorboard' --port=8080
# http://localhost:8080/

# go to tensorboard projector

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
# %load_ext tensorboard.notebook

import tensorflow as tf
import datetime

import gensim
from gensim.models import Word2Vec,KeyedVectors


# load the model
model = KeyedVectors.load_word2vec_format('/content/drive/My Drive/RokinData/models/GoogleNews-vectors-negative300-SLIM.bin', binary=True)

print("Vocabulary Size: {0}".format(len(model.vocab)))
model["for"].shape


# numpy array to store vocab
import os
import numpy as np
#Important Parameters
VOCAB_SIZE = len(model.vocab)
EMBEDDING_DIM = model["is"].shape[0]
w2v = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

tsv_file_path = "/content/tensorboard/metadata.tsv"
# make sure the target directory exists, so this cell also works on a fresh
# runtime where no earlier cell has created it
os.makedirs(os.path.dirname(tsv_file_path), exist_ok=True)
# copy every vector into row i of the matrix and write the matching label,
# one word per line, in the same order
with open(tsv_file_path,'w+', encoding='utf-8') as file_metadata:
    for i,word in enumerate(model.index2word[:VOCAB_SIZE]):
        w2v[i] = model[word]
        file_metadata.write(word+'\n')

import tensorflow as tf
# from tensorflow.contrib.tensorboard.plugins import projector
from tensorboard.plugins import projector
TENSORBOARD_FILES_PATH = "/content/tensorboard/tensorboard"

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

#Tensorflow Placeholders
# the embedding matrix is fed in through a placeholder when the variable is
# initialised (see the feed_dict below)
X_init = tf.placeholder(tf.float32, shape=(VOCAB_SIZE, EMBEDDING_DIM), name="embedding")
X = tf.Variable(X_init)
#Initializer
init = tf.global_variables_initializer()
#Start Tensorflow Session
sess = tf.Session()
sess.run(init, feed_dict={X_init: w2v})
#Instance of Saver, save the graph.
saver = tf.train.Saver()
writer = tf.summary.FileWriter(TENSORBOARD_FILES_PATH, sess.graph)


#Configure a Tensorflow Projector
# NOTE(review): no tensor_name is set on this entry - confirm the projector
# still associates the metadata with the variable X.
config = projector.ProjectorConfig()
embed = config.embeddings.add()
embed.metadata_path = tsv_file_path
#Write a projector_config
projector.visualize_embeddings(writer,config)
#save a checkpoint
saver.save(sess, TENSORBOARD_FILES_PATH+'/model.ckpt', global_step = VOCAB_SIZE)
#close the session
sess.close()

In [None]:
!python -m tensorboard.main --logdir='/content/tensorboard'

In [None]:
!pip install tb-nightly

import numpy as np
from torch.utils.tensorboard import SummaryWriter

import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

# tiny demo: four 3-dimensional vectors, each labelled with its own bits
vectors = np.array([[0,0,1], [0,1,0], [1,0,0], [1,1,1]])
metadata = ['001', '010', '100', '111']   # labels

# write the embedding with its labels, then close the writer
writer = SummaryWriter()
writer.add_embedding(vectors, metadata)
writer.close()

# !kill 444
%load_ext tensorboard
%tensorboard --logdir=runs

# Pre trained models

In [None]:
import gensim

# !gunzip '/content/drive/My Drive/RokinData/models/GoogleNews-vectors-negative300-SLIM.bin.gz'
# Load Google's pre-trained Word2Vec model.

# binary=True: the file is read in the binary word2vec format
model = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/My Drive/RokinData/models/GoogleNews-vectors-negative300-SLIM.bin', binary=True)


In [None]:
# nearest neighbours of "linux" in the pre-trained vectors
print(model.most_similar("linux"))

In [None]:
# /content/drive/My Drive/RokinData/models/glove.6B.300d.txt

# convert the GloVe text file into a word2vec-format text file ...
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="/content/drive/My Drive/RokinData/models/glove.6B.300d.txt", word2vec_output_file="/content/drive/My Drive/RokinData/models/gensim_glove_vectors.txt")

# ... and load the converted file as KeyedVectors (text format, binary=False)
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("/content/drive/My Drive/RokinData/models/gensim_glove_vectors.txt", binary=False)

print(glove_model.most_similar("apple"))

## add new data to pre trained model

In [None]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

# pre-trained vectors (GloVe converted to word2vec text format)
model = KeyedVectors.load_word2vec_format("/content/drive/My Drive/RokinData/models/gensim_glove_vectors.txt", binary=False)

# train actually but here we use pre saved
my_model = Word2Vec.load("/content/drive/My Drive/RokinData/models/word2vec.model")

# ------> if training
# my_model = Word2Vec(size=300, min_count=1)
# my_model.build_vocab(sentences)
# total_examples = my_model.corpus_count

# remember the corpus size BEFORE the vocabulary update below, which is fed a
# single artificial "sentence" made of all pre-trained words
total_examples = my_model.corpus_count
my_model.build_vocab([list(model.vocab.keys())], update=True)
# take over the pre-trained vectors for the words both models share;
# lockf=1.0 lets those vectors be updated during training
my_model.intersect_word2vec_format("/content/drive/My Drive/RokinData/models/gensim_glove_vectors.txt", binary=False, lockf=1.0)

# `epochs` is the current name of the attribute; `iter` is its deprecated alias
my_model.train(sentences, total_examples=total_examples, epochs=my_model.epochs)

In [None]:
# nearest neighbours of "microsoft" after the additional training
my_model.wv.most_similar(positive=["microsoft"])