# Skip gram and CBOW


We will built the Skipgram and CBOW models from scratch, train them on a relatively small corpus, i.e, on BBC Data set.

## Lib

In [1]:

import numpy as np

import tensorflow as tf
import operator
from tensorflow import keras
import keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_distances

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from matplotlib import pylab
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Snape\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Snape\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read file

In [3]:
df = pd.read_csv('bbc-text.csv')
print(df)
articles = list(df['text'])

           category                                               text
0              tech  tv future in the hands of viewers with home th...
1          business  worldcom boss  left books alone  former worldc...
2             sport  tigers wary of farrell  gamble  leicester say ...
3             sport  yeading face newcastle in fa cup premiership s...
4     entertainment  ocean s twelve raids box office ocean s twelve...
...             ...                                                ...
1451       business  egypt to sell off state-owned bank the egyptia...
1452           tech  spam e-mails tempt net shoppers computer users...
1453           tech  broadband set to revolutionise tv bt is starti...
1454           tech  can yahoo dominate next decade  yahoo has reac...
1455       politics  lib dems target first-time buyers the liberal ...

[1456 rows x 2 columns]


In [4]:
len(articles[0])

2356

## Function

In [5]:
def loadFile(filePath):
    word_vectors = {}
    i = 0
    with open(filePath, "r", encoding="utf-8") as file:
        for line in file:
            i += 1
            if i == 1:
                continue  # Skip the header
            parts = line.strip().split()
            word = parts[0]
            vector = np.array([float(x) for x in parts[1:]])
            word_vectors[word] = vector
    return word_vectors

def nearestWords(target_word, word_vectors, top_n=10):
    if target_word not in word_vectors:
        return f"Word '{target_word}' not found in the word vectors."
    
    target_vector = word_vectors[target_word].reshape(1, -1)
    similarities = {}
    
    for word, vector in word_vectors.items():
        if word == target_word:
            continue
        similarity = cosine_similarity(target_vector, vector.reshape(1, -1))[0][0]
        similarities[word] = similarity
    
    # Sort by similarity in descending order and get the top_n words
    nearest_words = sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:top_n]
    
    return nearest_words


## Preprocessing


In [6]:
# Regular expressions for cleaning text
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')  # Remove characters in this set with a space
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')           # Remove any character that is not alphanumeric or a space
STOPWORDS = set(stopwords.words('english'))            # Set of English stopwords

# Function to prepare text data
def text_prepare(text):
    text = text.lower()                                # Convert text to lowercase
    text = re.sub(REPLACE_BY_SPACE_RE, ' ', text)      # Replace specified characters with a space
    text = re.sub(BAD_SYMBOLS_RE, '', text)            # Remove symbols not allowed
    token_words = word_tokenize(text)                  # Tokenize the text into words
    filtered_words = [word for word in token_words if word not in STOPWORDS]  # Remove stopwords
    text = ''
    for word in filtered_words:                       
        if word != filtered_words[len(filtered_words)-1]:  # If not the last word, add a space after the word
            text = text + word + ' '
        else:                                          # If it's the last word, don't add a space after it
            text = text + word
    return text  # Return the cleaned text

In [9]:
%%time

sentences = []

for i in articles[:]:
    sentences += i.split('.')


corpus = [sentence for sentence in sentences if sentence.count(" ") >= 5]

# Apply text_prepare to each sentence in corpus
corpus_cleaned = [text_prepare(sentence) for sentence in corpus]

# Remove punctuation in text and fit tokenizer on entire corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus_cleaned)

# Convert text to sequence of integer values
corpus_sequences = tokenizer.texts_to_sequences(corpus_cleaned)

n_samples = sum(len(s) for s in corpus_sequences) # Total number of words in the corpus
V = len(tokenizer.word_index) + 1 # Total number of unique words in the corpus

CPU times: total: 516 ms
Wall time: 5.48 s


In [10]:
len(sentences)

30205

In [11]:
len(corpus)

27552

In [12]:
corpus[0]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time'

In [13]:
n_samples, V

(324119, 27442)

In [14]:
# Example of how word to integer mapping looks like in the tokenizer
print(list((tokenizer.word_index.items()))[:5])

[('said', 1), ('mr', 2), ('would', 3), ('also', 4), ('people', 5)]


In [15]:
corpus_sequences[0]

[93,
 142,
 1075,
 1039,
 45,
 968,
 711,
 6803,
 1440,
 4851,
 110,
 154,
 3623,
 1383,
 1216,
 1484,
 38,
 5,
 930,
 93,
 5222,
 342,
 94,
 18,
 16]

In [16]:
print(len(corpus_cleaned))
print(len(corpus_sequences))

27552
27552


In [17]:
# Parameters
window_size = 2
window_size_corpus = 4

# Set numpy seed for reproducible results
np.random.seed(42)


## Skip gram

In [18]:
# Assuming corpus_sequences is obtained from tokenizer.texts_to_sequences(corpus_cleaned)

# Now, let's generate skipgram data using the tokenized sequences
def generate_data_skipgram(corpus_sequences, window_size, V):
    # Calculate the maximum length of each part of the window
    # Initialize lists to store input words and their context words
    all_in = []
    all_out = []
    
    # Loop through each sequence of tokens in the corpus
    for words in corpus_sequences:
        # Get the length of the sequence
        L = len(words)
        
        # Iterate over each word in the sequence
        for index, word in enumerate(words):
            # Calculate the start and end indices of the context window
            p = index - window_size
            n = index + window_size + 1
            
            # Iterate over each word in the context window
            for i in range(p, n):
                # Check if the word is not the target word and is within the sequence
                if i != index and 0 <= i < L:
                    # Add the target word to the input list
                    all_in.append(word)
                    # Add the one-hot representation of the context word to the output list
                    all_out.append(to_categorical(words[i], V))

    # Convert the lists to numpy arrays and return
    return (np.array(all_in), np.array(all_out))


In [19]:
%%time

# Create training data
# Now, let's generate skipgram data
X_skip, y_skip = generate_data_skipgram(corpus_sequences, window_size, V)
X_skip.shape, y_skip.shape


In [16]:
print(X_skip[0:13])
print(y_skip[0:13])

[ 147  147  298  298  298 2035 2035 2035 2035  701  701  701  701]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
%%time

# Create skipgram architecture

dim = 300
skipgram_models = []

# Initialize a Keras Sequential model
skipgram = Sequential()

# Add an Embedding layer
skipgram.add(Embedding(input_dim=V,
                        output_dim=dim,
                        input_length=1,
                        embeddings_initializer='glorot_uniform'))

# Add a Reshape layer, which reshapes the output of the embedding layer (1,dim) to (dim,)
skipgram.add(Reshape((dim, )))

# Add a final Dense layer with the same size as in [1]
skipgram.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

# Compile the model with a suitable loss function and select an optimizer.
# Optimizer Adagrad was used in paper
skipgram.compile(optimizer=keras.optimizers.Adam(),
                loss='categorical_crossentropy',
                metrics=['accuracy'])

skipgram.summary()
print("")
skipgram_models.append(skipgram)




CPU times: total: 297 ms
Wall time: 1.38 s


In [18]:
%%time
# Training the skipgram models
for skipgram in skipgram_models:
    skipgram.fit(X_skip, y_skip, batch_size=64, epochs=10, verbose=1)
    print("")

Epoch 1/10
[1m2264/2264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 103ms/step - accuracy: 0.0131 - loss: 8.7858
Epoch 2/10
[1m2264/2264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 65ms/step - accuracy: 0.0239 - loss: 7.9557
Epoch 3/10
[1m2264/2264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 43ms/step - accuracy: 0.0486 - loss: 7.0382
Epoch 4/10
[1m2264/2264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 71ms/step - accuracy: 0.0668 - loss: 5.9886
Epoch 5/10
[1m2264/2264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 77ms/step - accuracy: 0.0711 - loss: 5.1330
Epoch 6/10
[1m2264/2264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 27ms/step - accuracy: 0.0654 - loss: 4.5881
Epoch 7/10
[1m2264/2264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 46ms/step - accuracy: 0.0619 - loss: 4.2933
Epoch 8/10
[1m2264/2264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 82ms/step - accuracy: 0.0576 - loss: 4.1314
E

In [19]:

for skipgram in skipgram_models:
    # Save embeddings for vectors of length 50, 150 and 300 using skipgram model
    weights = skipgram.get_weights()

    # Get the embedding matrix
    embedding = weights[0]

    # Get word embeddings for each word in the vocabulary, write to file
    f = open(f"vectors_skipgram_{len(embedding[0])}.txt", "w")

    # Create columns for the words and the values in the matrix, makes it easier to read as dataframe
    columns = ["word"] + [f"value_{i+1}" for i in range(embedding.shape[1])]

    # Start writing to the file, start with the column names
    f.write(" ".join(columns))

    # Start a new line
    f.write("\n")

    for word, i in tokenizer.word_index.items():
        f.write(word)
        f.write(" ")
        f.write(" ".join(map(str, list(embedding[i,:]))))
        f.write("\n")
    f.close()

In [20]:
skipgram.get_weights()[0]

array([[ 0.01748095,  0.00148425, -0.01919544, ..., -0.00088694,
        -0.01220941, -0.0135742 ],
       [ 0.32527888, -0.12163258,  0.16216542, ..., -0.18126906,
         0.12671961, -0.10615522],
       [ 0.42963627,  0.21860029,  0.03769287, ...,  0.31626388,
        -0.19132   , -0.01692916],
       ...,
       [-0.13268201, -0.2621697 , -0.13941467, ...,  0.3050503 ,
        -0.06711472, -0.07819138],
       [ 0.2811083 ,  0.11811126,  0.06614776, ..., -0.1773975 ,
         0.20586096, -0.14225566],
       [ 0.20672405, -0.08174837,  0.00601677, ..., -0.03199767,
         0.42297745,  0.11940225]], dtype=float32)

In [21]:
len(skipgram.get_weights())

3

In [22]:
len(skipgram.get_weights()[0])

9499

In [23]:
len(skipgram.get_weights()[0][0])

300

In [24]:
skipgram.get_weights()[0][1]

array([ 0.32527888, -0.12163258,  0.16216542,  0.04127095, -0.11305285,
        0.19053659,  0.08194021,  0.0827537 , -0.09798235, -0.12230548,
       -0.10925728, -0.4509574 ,  0.10556727, -0.06375656,  0.03762622,
       -0.29934317, -0.11401577, -0.21341637,  0.15974396, -0.26582593,
       -0.08370542, -0.22188632,  0.09298911,  0.12538494, -0.11680766,
       -0.211973  , -0.05952676, -0.13551821, -0.23344664,  0.11647146,
        0.577134  ,  0.10741807, -0.1729161 , -0.2584268 , -0.05339617,
       -0.00989427, -0.12786208,  0.11358162,  0.36213025, -0.16353655,
       -0.06297044,  0.2722588 ,  0.02155973,  0.32286486, -0.24523337,
       -0.19401504, -0.03977433, -0.04972154, -0.3587838 ,  0.29570627,
       -0.16750993, -0.17616794, -0.09405033, -0.14426138,  0.43670478,
        0.18446957, -0.09198372, -0.14051612, -0.3323797 ,  0.02338192,
        0.21787195,  0.5718953 ,  0.18932514,  0.11112643, -0.11459134,
       -0.14072517, -0.36717772,  0.27305597,  0.24377222,  0.16

To get the word embedding:

In [25]:
index = tokenizer.word_index['king']

In [26]:
skipgram.get_weights()[0][index]

array([-0.2885331 ,  0.51821667, -0.17127907,  0.2747276 , -0.01620254,
        0.498448  ,  0.26440337, -0.47260427,  0.10990763,  0.04294791,
       -0.08642849, -0.11361936, -0.6033893 , -0.10924914,  0.1353534 ,
        0.07647365, -0.0427232 , -0.17580122,  0.09824131, -0.14217344,
        0.31400818, -0.05618232, -0.2700428 , -0.19324386,  0.17674293,
        0.01983271, -0.08673499,  0.3545664 ,  0.36137933,  0.1838298 ,
       -0.19554509, -0.17966843, -0.37718725, -0.23053116,  0.3574334 ,
        0.2850941 ,  0.15157783, -0.22383724,  0.23521915,  0.41887844,
       -0.0097496 , -0.09284983,  0.454733  ,  0.03409092,  0.58624476,
        0.06892834, -0.16144466,  0.40544748, -0.59082067, -0.0590856 ,
       -0.05896838,  0.14168249,  0.13438728,  0.03637302,  0.37622723,
       -0.06925197,  0.33077374, -0.5664501 , -0.15090407, -0.22181164,
        0.0279519 , -0.4537112 , -0.11181034, -0.39456737,  0.0290822 ,
       -0.37076977, -0.21548012,  0.3339463 ,  0.03860397,  0.21

In [51]:
# Load the word vectors
word_vectors_skipGram = loadFile("vectors_skipgram_300.txt")

# Find nearest words to the target word
target_word = "king"
nearest_words_skipGram = nearestWords(target_word, word_vectors_skipGram)

print(f"Nearest words to '{target_word}':")
for word, similarity in nearest_words_skipGram:
    print(f"{word}: {similarity:.4f}")

Nearest words to 'king':
fisher: 0.4934
supreme: 0.4713
confirms: 0.4438
teen: 0.4421
mervyn: 0.4368
adaptation: 0.4322
abdullah: 0.4096
zone: 0.4055
adventures: 0.3999
tote: 0.3862


## CBOW

In [28]:
# The function returns two arrays: all_in, which contains the context words, 
# and all_out, which contains the corresponding one-hot encoded target words.

def generate_data_cbow(corpus, window_size, V):
    all_in = []
    all_out = []

    # Iterate over all sentences
    for sentence in corpus:
        L = len(sentence)
        for index, word in enumerate(sentence):
            start = index - window_size
            end = index + window_size + 1

            # Empty list which will store the context words
            context_words = []
            for i in range(start, end):
                # Skip the 'same' word
                if i != index:
                    # Add a word as a context word if it is within the window size
                    if 0 <= i < L:
                        context_words.append(sentence[i])
                    else:
                        # Pad with zero if there are no words
                        context_words.append(0)
            # Append the list with context words
            all_in.append(context_words)

            # Add one-hot encoding of the target word
            all_out.append(to_categorical(word, V))

    return (np.array(all_in), np.array(all_out))

In [29]:
%%time
# Create the training data
X_cbow, y_cbow = generate_data_cbow(corpus_sequences, window_size, V)
X_cbow.shape, y_cbow.shape

CPU times: total: 656 ms
Wall time: 11.4 s


((41552, 4), (41552, 9499))

In [30]:
print(X_cbow[:10])
print(y_cbow[:10])


[[   0    0  298 2035]
 [   0  147 2035  701]
 [ 147  298  701   66]
 [ 298 2035   66  777]
 [2035  701  777 1430]
 [ 701   66 1430 4878]
 [  66  777 4878 2524]
 [ 777 1430 2524 4879]
 [1430 4878 4879  299]
 [4878 2524  299  322]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [31]:
%%time

# Create the CBOW architecture
cbow_models = []
dim = 300
cbow = Sequential()

# Add an Embedding layer
cbow.add(Embedding(input_dim=V,
                    output_dim=dim,
                    input_length=window_size*2, # Note that we now have 2L words for each input entry
                    embeddings_initializer='glorot_uniform'))

cbow.add(Lambda(lambda x: tf.reduce_mean(x, axis=1), output_shape=(dim, )))

cbow.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

cbow.compile(optimizer=keras.optimizers.Adam(),
                loss='categorical_crossentropy',
                metrics=['accuracy'])

cbow.summary()
print("")
cbow_models.append(cbow)




CPU times: total: 0 ns
Wall time: 264 ms


In [32]:
# K.__dict__

In [33]:
%%time

# Train CBOW model
for cbow in cbow_models:
    cbow.fit(X_cbow, y_cbow, batch_size=64, epochs=20, verbose=1)
    print("")


Epoch 1/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 29ms/step - accuracy: 0.0132 - loss: 8.8946
Epoch 2/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 71ms/step - accuracy: 0.0164 - loss: 8.1925
Epoch 3/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 94ms/step - accuracy: 0.0260 - loss: 7.9483
Epoch 4/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 86ms/step - accuracy: 0.0418 - loss: 7.6224
Epoch 5/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 28ms/step - accuracy: 0.0684 - loss: 7.1334
Epoch 6/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 27ms/step - accuracy: 0.0996 - loss: 6.4569
Epoch 7/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 27ms/step - accuracy: 0.1451 - loss: 5.6724
Epoch 8/20
[1m650/650[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 27ms/step - accuracy: 0.2075 - loss: 4.8524
Epoch 9/20
[1m650/650[

In [34]:
for cbow in cbow_models:

    weights = cbow.get_weights()

    # Get the embedding matrix
    embedding = weights[0]

    # Get word embeddings for each word in the vocabulary, write to file
    f = open(f'vectors_cbow_{len(embedding[0])}.txt', 'w')

    # Create columns for the words and the values in the matrix, makes it easier to read as dataframe
    columns = ["word"] + [f"value_{i+1}" for i in range(embedding.shape[1])]

    # Start writing to the file, start with the column names
    f.write(" ".join(columns))
    f.write("\n")

    for word, i in tokenizer.word_index.items():
        f.write(word)
        f.write(" ")
        f.write(" ".join(map(str, list(embedding[i,:]))))
        f.write("\n")
    f.close()

In [49]:
# Load the word vectors
word_vectors_cbow = loadFile("vectors_cbow_300.txt")

# Find nearest words to the target word
target_word = "king"
nearest_words_cbow = nearestWords(target_word, word_vectors_cbow)

print(f"Nearest words to '{target_word}':")
for word, similarity in nearest_words_cbow:
    print(f"{word}: {similarity:.4f}")

Nearest words to 'king':
fisher: 0.3907
oscar: 0.3745
nominee: 0.3631
supreme: 0.3285
mervyn: 0.3263
teen: 0.3255
confirms: 0.3195
oneman: 0.3066
tote: 0.2919
midbedfordshire: 0.2907


To get the word embedding:

In [37]:
len(skipgram_word_emd),len(cbow_word_emd)

(9498, 9498)

In [38]:
cosine_similarity([skipgram_word_emd['king']], [cbow_word_emd['king']])

array([[-0.04811318]])

In [39]:
cosine_similarity([skipgram_word_emd['queen']], [cbow_word_emd['queen']])

array([[0.0732749]])

In [40]:
cosine_similarity([skipgram_word_emd['king']], [skipgram_word_emd['queen']])

array([[0.15288118]])

In [41]:
cosine_similarity([cbow_word_emd['king']], [cbow_word_emd['queen']])

array([[0.04786081]])

## Glove


In [55]:
def load_glove_model(glove_file):
    print("Loading GloVe vectors...")
    with open(glove_file, 'r', encoding='utf-8') as f:
        word_embeddings = {}
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = coefs
    print(f"Total words in GloVe: {len(word_embeddings)}")
    return word_embeddings

def get_word_vector(word, embeddings):
    print(f"Total words in GloVe: {len(embeddings)}")
    return embeddings.get(word, None)


In [56]:
# Load GloVe vectors
glove_file = 'pre-train\glove.6B.50d.txt'
word_embeddings = load_glove_model(glove_file)

# Get vector for a specific word
word = 'example'
vector = get_word_vector(word, word_embeddings)
print(f"Vector for '{word}': {vector}")

Loading GloVe vectors...
Total words in GloVe: 400001
Total words in GloVe: 400001
Vector for 'example': [ 0.51564    0.56912   -0.19759    0.0080456  0.41697    0.59502
 -0.053312  -0.83222   -0.21715    0.31045    0.09352    0.35323
  0.28151   -0.35308    0.23496    0.04429    0.017109   0.0063749
 -0.01662   -0.69576    0.019819  -0.52746   -0.14011    0.21962
  0.13692   -1.2683    -0.89416   -0.1831     0.23343   -0.058254
  3.2481    -0.48794   -0.01207   -0.81645    0.21182   -0.17837
 -0.02874    0.099358  -0.14944    0.2601     0.18919    0.15022
  0.18278    0.50052   -0.025532   0.24671    0.10596    0.13612
  0.0090427  0.39962  ]


In [48]:
# Load the word vectors
word_vectors = loadFile("pre-train/glove.6B.50d.txt")

# Find nearest words to the target word
target_word = "king"
nearest_words = nearestWords(target_word, word_vectors)

print(f"Nearest words to '{target_word}':")
for word, similarity in nearest_words:
    print(f"{word}: {similarity:.4f}")


Nearest words to 'king':
prince: 0.8236
queen: 0.7839
ii: 0.7746
emperor: 0.7736
son: 0.7667
uncle: 0.7627
kingdom: 0.7542
throne: 0.7540
brother: 0.7492
ruler: 0.7434


## FastText

In [58]:
# Save corpus_cleaned to data.txt
with open('data.txt', 'w') as f:
    for sentence in corpus_cleaned:
        f.write(sentence + '\n')

In [70]:
import fasttext

# Skipgram model :
model_skipgram = fasttext.train_unsupervised('data.txt', model='skipgram')
model_cbow = fasttext.train_unsupervised('data.txt', model='cbow')

In [69]:
print(len(model_skipgram.words))   # list of words in dictionary
print(model_skipgram.words[:5])   # list of words in dictionary

2035
['</s>', 'said', 'mr', 'would', 'us']


In [71]:
print(len(model_cbow.words))   # list of words in dictionary
print(model_cbow.words[:5])   # list of words in dictionary

2035
['</s>', 'said', 'mr', 'would', 'us']


In [64]:
print(model_skipgram['king']) # get the vector of the word 'king'

[-0.12809202 -0.06241307 -0.5373109  -0.07898965 -0.17456539 -0.01499808
 -0.04510237 -0.12413687  0.11719195  0.21051577 -0.06993252  0.31800944
  0.40157363  0.01828186 -0.0802449   0.01835513  0.18282847 -0.03645166
  0.23834257 -0.17484272 -0.24756224  0.03784261 -0.06942379  0.08270446
  0.19014034 -0.0667494   0.0468164  -0.05474981 -0.12270254  0.01528442
  0.19886202 -0.01764523 -0.23059776 -0.06587476  0.01497575  0.25735182
  0.05171872 -0.10021929 -0.4172717  -0.09823438  0.24158943  0.1446235
  0.11621234 -0.16488473  0.13652135 -0.09762552 -0.18743338  0.02908867
 -0.02417079 -0.07592172 -0.10909104  0.3557273   0.04537655  0.12654407
  0.05257778 -0.29432368  0.07294812  0.07641889  0.01687525 -0.08029394
 -0.39478365  0.09077107  0.17239797  0.12128054 -0.14266208 -0.23614943
 -0.11491174  0.05627097  0.3311356  -0.00069061  0.03242769 -0.01866153
 -0.03778111  0.34889865 -0.06612207 -0.05382381  0.37602648  0.1790597
  0.06545603 -0.20696251  0.08220818  0.05382765  0.0

In [72]:
print(model_cbow['king']) # get the vector of the word 'king'

[-0.0723238  -0.16722362 -0.9487884  -0.0874353  -0.5591405  -0.22758369
  0.07761426 -0.5468157  -0.09940009  0.14080796 -0.22445373  0.60899127
  0.76282626 -0.25866792 -0.25519374  0.03502591  0.54408073 -0.16598192
  0.50998914 -0.23766646 -0.70327634 -0.13285232  0.12252723 -0.02771314
  0.45805734  0.06522045  0.28230363 -0.18451302 -0.1954119  -0.4568035
  0.6430413  -0.30580673 -0.05302715 -0.3639861   0.35608926  0.5884223
  0.24000968  0.06901143 -1.0804046  -0.15920348  0.1401544  -0.10084464
 -0.11501204 -0.23083207  0.34574363 -0.04043501 -0.31826505  0.08892837
 -0.06327876 -0.23940556  0.02021915  0.59051436  0.15839747  0.53370696
  0.19323735 -0.88662004  0.12882155  0.21580972 -0.16266775  0.05295403
 -0.69453114  0.144061    0.24484323 -0.01265344 -0.22165284 -0.74223036
 -0.18441418 -0.03459466  0.42404076 -0.32759324  0.60522413 -0.30698386
  0.10658323  0.64038175  0.06307451 -0.04499795  0.66419554  0.34348848
  0.4633894  -0.28266683  0.4564947  -0.12463069 -0.2

In [66]:
model_skipgram.get_nearest_neighbors('kings')

[(0.9998900294303894, 'evening'),
 (0.9998785853385925, 'planning'),
 (0.9998743534088135, 'blogging'),
 (0.9998684525489807, 'helping'),
 (0.9998652338981628, 'talking'),
 (0.9998651146888733, 'turning'),
 (0.9998628497123718, 'watching'),
 (0.9998627305030823, 'running'),
 (0.9998591542243958, 'downing'),
 (0.9998563528060913, 'encouraging')]

In [73]:
model_cbow.get_nearest_neighbors('kings')

[(0.999973714351654, 'king'),
 (0.9999666213989258, 'evening'),
 (0.9999646544456482, 'going'),
 (0.9999641180038452, 'running'),
 (0.9999632835388184, 'planning'),
 (0.9999628067016602, 'making'),
 (0.999962568283081, 'starting'),
 (0.9999619722366333, 'winning'),
 (0.9999617338180542, 'writing'),
 (0.9999616742134094, 'starring')]

## 

In [None]:
# Save corpus_cleaned to data.txt
with open('pre-train/glove.6B.50d.txt', 'w') as f:
    for sentence in corpus_cleaned:
        f.write(sentence + '\n')