In [1]:
import os, sys

import random
import numpy as np
import pandas as pd
import gc


In [2]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [2]:
root = os.path.dirname(os.getcwd())
sys.path.append(root)
df = pd.read_csv(root + os.sep + 'data'+ os.sep + 'BASE.csv')

In [3]:
from src.utils.models import Preprocessor

In [4]:
df

Unnamed: 0.1,Unnamed: 0,title,quote,ID,Year,Genre
0,0,10 things i hate about you,Who knocked up your sister?,tt0147800,1999,"Comedy,Drama,Romance"
1,1,10 things i hate about you,"I was watching you out there, before. I've nev...",tt0147800,1999,"Comedy,Drama,Romance"
2,2,10 things i hate about you,"You're 18, you don't know what you want. And y...",tt0147800,1999,"Comedy,Drama,Romance"
3,3,10 things i hate about you,"Ooh, see that, there. Who needs affection when...",tt0147800,1999,"Comedy,Drama,Romance"
4,4,10 things i hate about you,"Just 'cause you're beautiful, that doesn't mea...",tt0147800,1999,"Comedy,Drama,Romance"
...,...,...,...,...,...,...
5949,5949,the lord of the rings: the fellowship of the ring,"If by my life or death I can protect you, I wi...",tt0120737,2001,"Action,Adventure,Drama"
5950,5950,the lord of the rings: the return of the king,Certainty of death. Small chance of success. W...,tt0167260,2003,"Action,Adventure,Drama"
5951,5951,the lord of the rings: the return of the king,The journey doesn't end here. Death is just an...,tt0167260,2003,"Action,Adventure,Drama"
5952,5952,the lord of the rings: the return of the king,I see in your eyes the same fear that would ta...,tt0167260,2003,"Action,Adventure,Drama"


In [11]:
len(df[3954:])

2000

In [4]:
p = Preprocessor(df)

In [19]:
len(set(p.text))

52

In [18]:
p.preprocess(mode='gan')

Corpus length: 630844
Total chars: 52
Number of sequences: 210255


In [22]:
p.preprocess(option='word', mode='gan', min_word_frequency=2)

Corpus length: 630844
Total words: 118223
Unique words before ignoring: 11540
Ignoring words with frequency < 2
Unique words after ignoring: 5173
Number of sequences: 1659


In [23]:
p.X.shape

(1659, 40, 5173)

In [24]:
p.Y.shape

(1659, 40, 5173)

In [24]:
# Cargamos modelo
model = tf.keras.models.load_model(root + os.sep + 'models' + os.sep + 'Base_Quote_Generator.h5')

In [55]:
word_model = tf.keras.models.load_model(root + os.sep + 'models' + os.sep + 'Word_Base_Quote_Generator.h5')

In [45]:
cp.generate(model, quote_len=1, verbose=False)

'i'

In [60]:
nltk.download('stopwords')
STOP_WORDS = nltk.corpus.stopwords.words()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jgnsa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [78]:
len(STOP_WORDS)

6800

## 1. Character-Based Preprocessing

In [15]:
class CharacterPreprocessor:

    def __init__(self, df):
        self.data = df

    def get_corpus(self, quote_list):
        # Corpus
        text = ''
        for q in quote_list:
            text += ' ' + q
        print("Corpus length:", len(text))
        
        return text.lower()

    def preprocess(self, maxlen=40, step=3, column='quote'):

        self.maxlen = maxlen

        # Quote List
        input_list = list(self.data[column])

        # Corpus
        text = self.get_corpus(input_list)

        # Total Characters
        chars = sorted(list(set(text)))
        print("Total chars:", len(chars))

        # Dictionaries
        char_indices = dict((c, i) for i, c in enumerate(chars))
        indices_char = dict((i, c) for i, c in enumerate(chars))

        # Number of sequences
        sentences = []
        next_chars = []
        for i in range(0, len(text) - maxlen, step):
            sentences.append(text[i : i + maxlen])
            next_chars.append(text[i + maxlen])
        print("Number of sequences:", len(sentences))

        # Defining X and y
        x = np.zeros((len(sentences), maxlen, len(chars)))
        y = np.zeros((len(sentences), len(chars)))
        for i, sentence in enumerate(sentences):
            for t, char in enumerate(sentence):
                x[i, t, char_indices[char]] = 1
            y[i, char_indices[next_chars[i]]] = 1

        self.text = text
        self.chars = chars
        self.char_indices = char_indices
        self.indices_char = indices_char
        self.sentences = sentences
        self.next_chars = next_chars
        self.X = x
        self.Y = y

    def preprocess_type2(self, maxlen=40, step=3, column='quote'):

        self.maxlen = maxlen

        # Quote List
        input_list = list(self.data[column])

        # Corpus
        text = self.get_corpus(input_list)

        # Total Characters
        chars = sorted(list(set(text)))
        print("Total chars:", len(chars))

        # Dictionaries
        char_indices = dict((c, i) for i, c in enumerate(chars))
        indices_char = dict((i, c) for i, c in enumerate(chars))

        # Number of sequences
        sentences = []
        next_seq = []
        for i in range(0, len(text) - 2*maxlen, step):
            sentences.append(text[i : i + maxlen])
            next_seq.append(text[i + maxlen:i + 2*maxlen])
        print("Number of sequences:", len(sentences))

        # Defining X and y
        x = np.zeros((len(sentences), maxlen, len(chars)))
        y = np.zeros((len(sentences), maxlen, len(chars)))
        for i, sentence in enumerate(sentences):
            for t, char in enumerate(sentence):
                x[i, t, char_indices[char]] = 1
                y[i, t, char_indices[next_seq[i][t]]] = 1

        self.text = text
        self.chars = chars
        self.char_indices = char_indices
        self.indices_char = indices_char
        self.sentences = sentences
        self.next_chars = next_seq
        self.X = x
        self.Y = y

    def sample(self, preds, temperature=1.0):
        # helper function to sample an index from a probability array
        preds = np.asarray(preds).astype("float64")
        preds = np.log(preds) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        probas = np.random.multinomial(1, preds, 1)
        return np.argmax(probas)

    def generate(self, model, quote_len=40, sentence=False, temperature=1.0, verbose=False, fake=False):
        
        if verbose:
            print("...Temperature:", temperature)

        generated = ""
        if fake == True:
            sentence = random.choices(self.chars, k=self.maxlen)
            sentence = ''.join(sentence)
        elif sentence == False:
            start_index = random.randint(0, len(self.text) - self.maxlen - 1)
            sentence = self.text[start_index : start_index + self.maxlen]
        if verbose:
            print('...Generating with seed: "' + sentence + '"')

        for i in range(quote_len):
            x_pred = np.zeros((1, self.maxlen, len(self.chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, self.char_indices[char]] = 1.0 #One-Hot Array
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = self.sample(preds, temperature)
            next_char = self.indices_char[next_index]
            sentence = sentence[1:] + next_char
            generated += next_char

        if verbose:
            print("...Generated: ", generated)

        return generated

    def generate_fake_samples(self, model, n_samples, quote_len=40, temperature=1.0):

        x_random = []
        for n in range(n_samples):
            x_random.append(self.generate(model, quote_len=quote_len, temperature=temperature, fake=True))

        X_fake = np.zeros((len(x_random), self.maxlen, len(self.chars)))
        for i, sentence in enumerate(x_random):
            for t, char in enumerate(sentence):
                X_fake[i, t, self.char_indices[char]] = 1

        X_fake = X_true.reshape(X_fake.shape[0], X_fake.shape[1], X_fake.shape[2], 1)
        y_fake = np.zeros(n_samples)

        return X_fake, y_fake

    def generate_real_samples(self, n_samples):

        X_true = np.array(random.sample(list(self.X), n_samples))
        X_true = X_true.reshape(X_true.shape[0], X_true.shape[1], X_true.shape[2], 1)
        y_true = np.ones(n_samples)

        return X_true, y_true

    def generate_gan_samples(self, n_samples):

        sample_list = random.sample(range(len(self.X)), n_samples)

        X_gan = self.X[sample_list]
        y_gan = self.Y[sample_list]

        return X_gan, y_gan


In [16]:
cp = CharacterPreprocessor(df)
cp.preprocess_type2()

Corpus length: 630844
Total chars: 52
Number of sequences: 210255


In [11]:
cp.X.shape

(210268, 40, 52)

In [15]:
X_g, y_g = cp.generate_gan_samples(128)

In [16]:
print(X_g.shape)
y_g.shape

(128, 40, 52)


(128, 40, 52)

In [31]:
model = Sequential([
    layers.InputLayer(input_shape=(cp.maxlen, len(cp.chars))),
    layers.LSTM(128),
    layers.Dense(len(cp.chars), activation='softmax'),
])

cp.preprocess()
model.summary()

Corpus length: 630844
Total chars: 52
Number of sequences: 210268
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 128)               92672     
_________________________________________________________________
dense_4 (Dense)              (None, 52)                6708      
Total params: 99,380
Trainable params: 99,380
Non-trainable params: 0
_________________________________________________________________


In [18]:
model = Sequential([
    layers.InputLayer(input_shape=(cp.maxlen, len(cp.chars))),
    layers.LSTM(128, return_sequences=True),
    layers.Dense(len(cp.chars), activation='softmax'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 40, 128)           92672     
_________________________________________________________________
dense (Dense)                (None, 40, 52)            6708      
Total params: 99,380
Trainable params: 99,380
Non-trainable params: 0
_________________________________________________________________


In [19]:
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer)

In [20]:
gc.collect()

7889

In [47]:
cp.Y.shape

(210268, 40, 52)

In [21]:
epochs = 10
batch_size = 128

model.fit(cp.X, cp.Y, batch_size=128, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x19e8283b888>

In [22]:
model.save(root + os.sep + 'models' + os.sep + 'Type2_Char_Quote_Generator.h5')

In [26]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [24]:
generated = ""
start_index = random.randint(0, len(cp.text) - cp.maxlen - 1)
sentence = cp.text[start_index : start_index + cp.maxlen]
print('...Generating with seed: "' + sentence + '"')

x_pred = np.zeros((1, cp.maxlen, len(cp.chars)))
for t, char in enumerate(sentence):
    x_pred[0, t, cp.char_indices[char]] = 1.0 #One-Hot Array
preds = model.predict(x_pred, verbose=0)[0]
for pred in list(preds):
    next_index = sample(pred, temperature=1)
    next_char = cp.indices_char[next_index]
    generated += next_char
#next_index = cp.sample(preds, temperature=0.2)
#next_char = cp.indices_char[next_index]
#sentence = sentence[1:] + next_char
#generated += next_char

#print("...Generated: ", generated)

...Generating with seed: "t, let's get one thing straight, here. a"


In [90]:
result = []
for pred in list(preds):
    next_index = sample(pred, temperature=1)
    next_char = cp.indices_char[next_index]
    result.append(next_char)

result = ''.join(result)
result

'savsgcd ildrah eowrtseiho b;i s,sueih h '

In [17]:
result = []
for pred in list(cp.Y[0]):
    next_index = np.argmax(pred)
    next_char = cp.indices_char[next_index]
    result.append(next_char)

result = ''.join(result)
result

"ing you out there, before. i've never se"

In [8]:
cp.Y[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

In [9]:
cp.Y[0][1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

In [59]:
preds[0]

array([1.75574094e-01, 4.01515700e-03, 1.26093603e-03, 4.39963519e-27,
       1.96970836e-30, 3.39793024e-29, 1.08346827e-02, 1.17139556e-04,
       1.14109237e-02, 1.95014698e-03, 1.88446064e-02, 1.05222636e-10,
       3.12063958e-06, 5.26225143e-08, 8.95640284e-10, 2.78055761e-20,
       9.83004698e-17, 2.35092803e-17, 9.16618910e-22, 3.24517283e-22,
       1.31721133e-19, 4.18913763e-20, 1.02648273e-05, 1.06867283e-05,
       3.44924792e-03, 6.00378588e-02, 1.27716353e-02, 1.69684682e-02,
       3.01180761e-02, 8.82273018e-02, 1.30073987e-02, 1.59521420e-02,
       4.09088545e-02, 6.39058277e-02, 2.70903995e-03, 1.20395636e-02,
       2.90082805e-02, 2.13611498e-02, 5.19057401e-02, 6.27867877e-02,
       1.32876979e-02, 4.96344583e-05, 3.95205058e-02, 4.29641232e-02,
       6.88053966e-02, 3.01229022e-02, 9.36412532e-03, 2.11049579e-02,
       7.69408187e-04, 2.44409349e-02, 3.81136080e-04, 1.31913117e-28],
      dtype=float32)

In [49]:
cp.generate(model)



ValueError: object too deep for desired array

### 1.2 Generating Fake

In [6]:
from tensorflow import keras
from tensorflow.keras import layers

generator = keras.Sequential([
    keras.layers.InputLayer(input_shape=(cp.maxlen, len(cp.chars))),
    layers.LSTM(128, return_sequences=True),
    layers.LSTM(100),
    layers.Dropout(0.3),
    layers.Dense(100, activation='relu'),
    layers.Dense(len(cp.chars), activation="softmax")
])

In [16]:
len(cp.sentences)

210268

In [9]:
cp.sentences

40

In [10]:
input_list = [cp.generate(generator, quote_len=40, sentence=s, verbose=False) for s in cp.sentences[:20]]

In [11]:
len(input_list)

20

In [18]:
input_list[0]

'HaSuA8kBJjBu9NcbZ0WoQmoZKvSWHZiY.Ms G*qg'

In [21]:
sequence = random.choices(cp.chars, k=40)
''.join(sequence)

'$jvbr}r."acq2i&oiju.gc6; mi\'m"v3b"sjt8 !'

In [22]:
for i in range(40):
    x_pred = np.zeros((1, cp.maxlen, len(cp.chars)))
    for t, char in enumerate(sequence):
        x_pred[0, t, cp.char_indices[char]] = 1.0 #One-Hot Array
    

In [23]:
x_pred

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.]]])

In [7]:
X_fake, y_fake = cp.generate_fake_samples(generator, 20)

In [8]:
X_fake.shape

(20, 40, 52)

In [56]:
X_fake[0]

array(["%,n-57-b*'vnf0t,:jg2s%hme/xqnrum j-,n y!"], dtype='<U40')

In [43]:
np.array([1, 2, 3]).shape

(3,)

## 2. Word-Level Preprocessing

In [8]:
import pandas as pd
import numpy as np
import random
import string
import os
import re

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [2]:
root = os.path.dirname(os.getcwd())
df = pd.read_csv(root + os.sep + 'data'+ os.sep + 'BASE.csv')

In [37]:
word_model = tf.keras.models.load_model(root + os.sep + 'models' + os.sep + 'Word_Base_Quote_Generator.h5')

In [33]:
class WordPreprocessor:

    def __init__(self, df):
        self.data = df
        self.text = ''

    def get_tokens(self, quote_list):
        # Corpus
        text = ''
        for q in quote_list:
            text += ' ' + q
        print("Corpus length:", len(text))

        lowercase = text.lower()
        corpus = re.sub('<br />', ' ', lowercase)

        # replace '--' with a space ' '
        doc = corpus.replace('--', ' ')
        # split into tokens by white space
        tokens = doc.split()
        # remove punctuation from each token
        table = str.maketrans('', '', string.punctuation)
        tokens = [w.translate(table) for w in tokens]
        # remove '' strings
        pops = [[seq.pop(i) for i, w in enumerate(seq) if w == ''] for seq in tokens]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]

        self.text = corpus
        self.tokens = tokens

        # Mostramos datos
        print('Total Tokens: %d' % len(self.tokens))
        print('Unique Tokens: %d' % len(set(self.tokens)))

        return corpus

    def preprocess(self, maxlen=50, column='quote'):
        # Quote List
        quotes = list(self.data[column])

        # Limpiamos corpus
        self.get_tokens(quotes)

        # Organizamos en secuencias de tokens
        maxlen = maxlen + 1
        sequences = []
        for i in range(maxlen, len(self.tokens)):
            # select sequence of tokens
            seq = self.tokens[i-maxlen:i]
            # convert into a line
            line = ' '.join(seq)
            # store
            sequences.append(line)
        print('Total Sequences: %d' % len(sequences))

        # Encodeamos a enteros las secuencias
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(sequences)
        sequences = self.tokenizer.texts_to_sequences(sequences)

        # vocabulary size
        self.vocab_size = len(self.tokenizer.word_index) + 1

        # Dividimos en input(X) y output(y)
        self.sequences = np.array(sequences)
        X, y = self.sequences[:,:-1], self.sequences[:,-1]
        y = to_categorical(y, num_classes=self.vocab_size)
        self.seq_length = X.shape[1]

        self.X = X
        self.Y = y
        return X, y

    # generate a sequence from a language model
    def generate(self, model, n_words=50, seed_text=None):

        if seed_text == None:
            # select a seed text
            seed_text = str(self.sequences[random.randint(0,len(self.sequences))]) + '\n'

        result = []
        # generate a fixed number of words
        for _ in range(n_words):
            # encode the text as integer
            encoded = self.tokenizer.texts_to_sequences([seed_text])[0]
            # truncate sequences to a fixed length
            encoded = tf.keras.preprocessing.sequence.pad_sequences([encoded], maxlen=self.seq_length, truncating='pre')
            # predict probabilities for each word
            yhat = np.argmax(model.predict(encoded), axis=-1)
            # map predicted word index to word
            out_word = ''
            for word, index in self.tokenizer.word_index.items():
                if index == yhat:
                    out_word = word
                    break
            # append to input
            seed_text += ' ' + out_word
            result.append(out_word)
        return ' '.join(result)

In [34]:
wp = WordPreprocessor(df)

In [35]:
wp.preprocess()

Corpus length: 630844
Total Tokens: 118223
Unique Tokens: 11540
Total Sequences: 118172


(array([[  61, 5173,   45, ...,    9, 2031,   67],
        [5173,   45,   16, ..., 2031,   67,   10],
        [  45,   16,  553, ...,   67,   10,   48],
        ...,
        [  47,    2,   21, ...,   12,   29,    9],
        [   2,   21,   13, ...,   29,    9,   11],
        [  21,   13, 1719, ...,    9,   11,   25]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32))

In [38]:
wp.generate(word_model)

'and i dont know that i dont know it i dont know it i dont know it i dont know it i dont know it i dont know it i dont know it i dont know it i dont know it i dont know it i dont know it i'

## 3. Word Vector Plot

In [109]:
from gensim.models import Word2Vec

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.express as px

In [42]:
sentences = wp.text.split('.')

In [49]:
sequences = [sentence.split(' ') for sentence in sentences]

In [115]:
table = str.maketrans('', '', string.punctuation)
tokens = [[w.translate(table) for w in seq] for seq in sequences]
pops = [[seq.pop(i) for i, w in enumerate(seq) if w == ''] for seq in tokens]

In [93]:
tokens[:2]

[array([-0.32917568,  0.4133711 ,  0.19059797,  0.06240485,  0.13121586,
        -0.8111169 ,  0.40069413,  1.2156549 , -0.31494632, -0.39080057,
        -0.38883063, -0.9409281 , -0.0849929 ,  0.27793258,  0.04217121,
        -0.43816704, -0.02406088, -0.26127818, -0.4655731 , -1.2394025 ,
         0.5435719 ,  0.61098295,  0.41150102, -0.37213793, -0.23626378,
         0.05704324, -0.5501416 , -0.35497278, -0.59228754,  0.07711013,
         0.3629901 , -0.00774501,  0.26193625, -0.61758673, -0.54816204,
         0.7799533 ,  0.12924875, -0.37494263, -0.31273797, -0.9766742 ,
         0.04834285,  0.20193459, -0.38018793, -0.17973202,  0.6566101 ,
        -0.1896423 , -0.16328776, -0.38356903, -0.02512395,  0.0672423 ,
         0.3450995 , -0.5016849 , -0.04237203, -0.26588896, -0.17360955,
         0.06308721,  0.04897323, -0.30443493, -0.5145191 , -0.02699853,
         0.0806064 , -0.06927129,  0.08009045, -0.16875458, -0.3187564 ,
         0.45346326, -0.15299751,  0.28496414, -0.8

In [116]:
vectors = Word2Vec(tokens, vector_size=100, window=5, min_count=10, workers=4)

In [117]:
vectors.wv['who']

array([-0.16915053,  0.33932525, -0.06520323, -0.04685494, -0.06620958,
       -0.60725474,  0.10987974,  0.631523  , -0.27924857, -0.20766371,
       -0.20678729, -0.41190416,  0.08226023,  0.00072245,  0.04420877,
       -0.16993381,  0.12080246, -0.25645006, -0.06339958, -0.6124828 ,
        0.18666248,  0.21706651,  0.16561772, -0.20755355, -0.156732  ,
        0.02576791, -0.11864272, -0.22139853, -0.26185346,  0.03557435,
        0.29804537,  0.00563767,  0.06865207, -0.37825334, -0.2519085 ,
        0.41119003,  0.15226403, -0.11793939, -0.10111231, -0.5097096 ,
        0.15163425, -0.2277898 , -0.11892404,  0.00526371,  0.3136848 ,
       -0.16468586, -0.24000837, -0.17820786,  0.15605664,  0.10120876,
        0.15938532, -0.25181064, -0.00802472, -0.06709524, -0.27767137,
        0.21111108,  0.17956844, -0.07643602, -0.31522962,  0.02430836,
        0.2105694 , -0.0496785 ,  0.21618757, -0.01288242, -0.4010014 ,
        0.33447933,  0.11026956,  0.3255466 , -0.51812255,  0.31

In [118]:
#Creates and TSNE model
labels = []
vecs = []

for word in vectors.wv.index_to_key:
    vecs.append(vectors.wv[word])
    labels.append(word)


In [119]:

tsne_model = TSNE(n_components=3, n_iter=2500, random_state=42)
new_values = tsne_model.fit_transform(vecs)

In [120]:
pca_model = PCA(n_components=3, random_state=42)
pca_values = pca_model.fit_transform(vecs)

In [96]:
len(vectors.wv.index_to_key)

1084

In [121]:
x = []
y = []
z = []
for value in new_values:
    x.append(value[0])
    y.append(value[1])
    z.append(value[2])

In [122]:
x = []
y = []
z = []
for value in pca_values:
    x.append(value[0])
    y.append(value[1])
    z.append(value[2])

In [99]:
len(x)

1084

In [123]:
fig = px.scatter_3d(x=x, y=y, z=z, labels=labels)
fig.show()

In [124]:
vectors.wv.most_similar('ring')

[('down', 0.9965344667434692),
 ('into', 0.9965146780014038),
 ('same', 0.9964144229888916),
 ('whole', 0.9963569641113281),
 ('days', 0.9963396787643433),
 ('room', 0.996309220790863),
 ('all', 0.996243953704834),
 ('great', 0.9962398409843445),
 ('business', 0.9961745738983154),
 ('other', 0.9961439371109009)]