# Learn Embeddings
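This notebook trains Word2Vec embeddings (CBOW and skip-gram) and a FastText model on the time series data, and evaluates the Word2Vec embeddings via nearest-neighbour lookups and PCA plots.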

In [None]:
import os
import csv
#import spacy
import multiprocessing
import time
from gensim.models import Word2Vec
from gensim.models import FastText
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pickle
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Import the Time series data

In [None]:
# Read the "medium" time series Parquet file and convert the Arrow table
# into a pandas DataFrame.
timeseries_data = pq.read_table('Cohort/Time_Series/all_time_series_medium_timeseries_data_per_patient.parquet').to_pandas()


In [None]:
# Bare expression: the notebook renders the DataFrame as the cell output.
timeseries_data

In [None]:
# Turn each row's list of concept ids into a list of strings; these string
# lists are the "sentences" later fed to the embedding models.
timeseries_data_string = timeseries_data['unique_concept'].apply(
    lambda concepts: [str(concept) for concept in concepts]
)
timeseries_data_string


In [None]:
# Print the first few token lists to sanity-check the string conversion.
for concept_tokens in timeseries_data_string.head():
    print(concept_tokens)

In [None]:
# Directory the trained models are saved to and loaded from. File paths are
# built by plain string concatenation (model_dir + file name), so the
# trailing slash is required.
model_dir = "Cohort/Time_Series/Medium/"

# Cbow

In [None]:
# Hyper-parameters shared by all embedding models trained in this notebook.
emb_dimension = 20    # passed as `size`: length of each embedding vector
num_window = 5        # passed as `window`: context window size
min_word_count = 0    # passed as `min_count`: 0 keeps every token
num_cores = multiprocessing.cpu_count()  # passed as `workers`

In [None]:
# Train the CBOW (sg=0) Word2Vec model on the stringified concept sequences
# and save it under a file name that encodes the hyper-parameters.

# Make sure the target directory exists *before* the long-running training,
# so the final save cannot fail on a missing folder.
os.makedirs(model_dir, exist_ok=True)

# Kept as a module-level name: the skip-gram and FastText cells reuse it.
sentences = timeseries_data_string

# perf_counter() is monotonic, so the elapsed time is not distorted by
# system clock adjustments during training.
start = time.perf_counter()
model = Word2Vec(sentences=sentences, size=emb_dimension, window=num_window, min_count=min_word_count, workers=num_cores, sg=0)
end = time.perf_counter()
print('Processing time in sec: ', end - start)

model.save(model_dir + 'cbow_dim{}_win{}_mc{}.bin'.format(emb_dimension, num_window, min_word_count))

In [None]:
# Load the CBOW model from model_dir (Time_Series/Medium). The file name is
# built from the same hyper-parameters that are used when saving, so the
# loaded model always matches the current settings instead of relying on a
# hard-coded name that can go stale.
cbow_model0 = Word2Vec.load(model_dir + 'cbow_dim{}_win{}_mc{}.bin'.format(emb_dimension, num_window, min_word_count))

In [None]:
# Load the concept dictionary of the "medium" time series data set into a
# pandas DataFrame. Later cells use its `term_id` and `Term` columns.
dic=pq.read_table('Cohort/Time_Series/all_time_series_medium_dictionary.parquet').to_pandas()

In [None]:
# Bare expression: the notebook renders the dictionary as the cell output.
dic


In [None]:
# Look up the nearest neighbours of one concept in the CBOW model and make
# their ids mergeable with the dictionary.
# most_similar() yields (token, similarity) pairs; the tokens are the
# stringified concept ids the model was trained on.
near = pd.DataFrame(cbow_model0.wv.most_similar('231'), columns=["term_id", "similarity"])
# Convert whole columns at once rather than calling pd.to_numeric on every
# element through Series.apply; both sides of the later merge must share a
# numeric dtype.
near['term_id'] = pd.to_numeric(near['term_id'])
dic['term_id'] = pd.to_numeric(dic['term_id'])

In [None]:
# Attach the dictionary columns to the neighbour list via `term_id`
# and show the merged table.
df_merge_col = near.merge(dic, on='term_id')
df_merge_col


In [None]:
# Visualize the CBOW embedding with a 2-D PCA projection.

# Fix the vocabulary order once so vectors, projected points and labels all
# line up. Vectors are read through `.wv`; indexing the model object
# directly is deprecated.
words = list(cbow_model0.wv.vocab)
X = cbow_model0.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# Labels are looked up by term id: the dictionary's row order is not
# guaranteed to match the model's vocabulary order (nor its length), so
# pairing rows with points by position would mislabel them.
# Assumes vocabulary tokens equal str(term_id); tokens without a dictionary
# entry fall back to the raw token.
term_by_id = dict(zip(dic['term_id'].astype(str), dic['Term']))
terms = [term_by_id.get(word, word) for word in words]

# create a scatter plot of the projection
plt.figure(figsize=(40,20))
plt.scatter(result[:, 0], result[:, 1])
for (x_coord, y_coord), term in zip(result, terms):
    plt.annotate(term, xy=(x_coord, y_coord))
plt.show()

# Skipgram

In [None]:
# Train the skip-gram (sg=1) Word2Vec model on the same sentences and save
# it under a file name that encodes the hyper-parameters.

# Make sure the target directory exists *before* the long-running training,
# so the final save cannot fail on a missing folder.
os.makedirs(model_dir, exist_ok=True)

# perf_counter() is monotonic, so the elapsed time is not distorted by
# system clock adjustments during training.
start = time.perf_counter()
model = Word2Vec(sentences=sentences, size=emb_dimension, window=num_window, min_count=min_word_count, workers=num_cores, sg=1)
end = time.perf_counter()
print('Processing time in sec: ', end - start)

model.save(model_dir + 'skipgram_dim{}_win{}_mc{}.bin'.format(emb_dimension, num_window, min_word_count))

In [None]:
# Load the skip-gram model from model_dir. The file name is built from the
# same hyper-parameters that are used when saving; the previously hard-coded
# "dim50" name did not match the model saved with emb_dimension = 20.
skipgram_model0 = Word2Vec.load(model_dir + 'skipgram_dim{}_win{}_mc{}.bin'.format(emb_dimension, num_window, min_word_count))

In [None]:
# Load the concept dictionary of the "medium" time series data set, i.e. the
# same data set the models in model_dir are trained on.
# NOTE(review): this cell previously read the "woProcedures" dictionary,
# which does not belong to the "medium" data; confirm the medium dictionary
# is the intended one.
dic=pq.read_table('Cohort/Time_Series/all_time_series_medium_dictionary.parquet').to_pandas()

In [None]:
# Look up the nearest neighbours of one concept in the skip-gram model and
# make their ids mergeable with the dictionary.
# most_similar() yields (token, similarity) pairs; the tokens are the
# stringified concept ids the model was trained on.
near = pd.DataFrame(skipgram_model0.wv.most_similar('108'), columns=["term_id", "similarity"])
# Convert whole columns at once rather than calling pd.to_numeric on every
# element through Series.apply; both sides of the later merge must share a
# numeric dtype.
near['term_id'] = pd.to_numeric(near['term_id'])
dic['term_id'] = pd.to_numeric(dic['term_id'])

In [None]:
# Attach the dictionary columns to the neighbour list via `term_id`
# and show the merged table.
df_merge_col = near.merge(dic, on='term_id')
df_merge_col


In [None]:
# Visualize the skip-gram embedding with a 2-D PCA projection.

# Fix the vocabulary order once so vectors, projected points and labels all
# line up. Vectors are read through `.wv`; indexing the model object
# directly is deprecated.
words = list(skipgram_model0.wv.vocab)
X = skipgram_model0.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# Labels are looked up by term id: the dictionary's row order is not
# guaranteed to match the model's vocabulary order (nor its length), so
# pairing rows with points by position would mislabel them.
# Assumes vocabulary tokens equal str(term_id); tokens without a dictionary
# entry fall back to the raw token.
term_by_id = dict(zip(dic['term_id'].astype(str), dic['Term']))
terms = [term_by_id.get(word, word) for word in words]

# create a scatter plot of the projection
plt.figure(figsize=(20,10))
plt.scatter(result[:, 0], result[:, 1])
for (x_coord, y_coord), term in zip(result, terms):
    plt.annotate(term, xy=(x_coord, y_coord))
plt.show()

# Fasttext

In [None]:
# Train a FastText model on the same sentences and save it under a file name
# that encodes the hyper-parameters.

# Make sure the target directory exists *before* the long-running training,
# so the final save cannot fail on a missing folder.
os.makedirs(model_dir, exist_ok=True)

# perf_counter() is monotonic, so the elapsed time is not distorted by
# system clock adjustments during training.
start = time.perf_counter()
# 51,000,000 rows need around 27.5k seconds
model = FastText(sentences=sentences, size=emb_dimension, window=num_window, min_count=min_word_count, workers=num_cores)
end = time.perf_counter()
print('Processing time in sec: ', end - start)

model.save(model_dir + 'fastText_dim{}_win{}_mc{}.bin'.format(emb_dimension, num_window, min_word_count))