In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF
from scipy.sparse import lil_matrix
from tensorflow.keras.preprocessing.sequence import make_sampling_table, skipgrams
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import normalize

## Parte 1: Matriz Documento-Termino

In [2]:
train = fetch_20newsgroups(subset="train",remove=('headers', 'footers', 'quotes'))
train.data[0]

'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [3]:
tfidf_vec = TfidfVectorizer(max_df=0.90, min_df=2, max_features=10000, stop_words='english', ngram_range=(1, 2))

In [4]:
tfidf_matrix = tfidf_vec.fit_transform(train.data)
tfidf_matrix.shape

(11314, 10000)

In [5]:
count_vec = CountVectorizer(max_df=0.90, min_df=2, max_features=10000, stop_words='english', ngram_range=(1, 2))

In [6]:
count_matrix = count_vec.fit_transform(train.data)
count_matrix.shape

(11314, 10000)

In [7]:
cosine_similarity(tfidf_matrix)

array([[1.        , 0.01150709, 0.05920101, ..., 0.        , 0.        ,
        0.02811005],
       [0.01150709, 1.        , 0.03243869, ..., 0.11755287, 0.        ,
        0.02045994],
       [0.05920101, 0.03243869, 1.        , ..., 0.00466065, 0.01281438,
        0.00501411],
       ...,
       [0.        , 0.11755287, 0.00466065, ..., 1.        , 0.00852721,
        0.        ],
       [0.        , 0.        , 0.01281438, ..., 0.00852721, 1.        ,
        0.        ],
       [0.02811005, 0.02045994, 0.00501411, ..., 0.        , 0.        ,
        1.        ]])

In [8]:
cosine_similarity(count_matrix)

array([[1.        , 0.02037848, 0.09448591, ..., 0.        , 0.        ,
        0.02577696],
       [0.02037848, 1.        , 0.05519707, ..., 0.11626704, 0.        ,
        0.0451754 ],
       [0.09448591, 0.05519707, 1.        , ..., 0.01996587, 0.01301007,
        0.01163656],
       ...,
       [0.        , 0.11626704, 0.01996587, ..., 1.        , 0.0182696 ,
        0.        ],
       [0.        , 0.        , 0.01301007, ..., 0.0182696 , 1.        ,
        0.        ],
       [0.02577696, 0.0451754 , 0.01163656, ..., 0.        , 0.        ,
        1.        ]])

In [9]:
doc0_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix)
doc0_sim

array([[1.        , 0.01150709, 0.05920101, ..., 0.        , 0.        ,
        0.02811005]])

In [10]:
indices = np.where(doc0_sim > 0.2)
indices

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([    0,   596,   763,   771,  1082,  1224,  2086,  2519,  2554,
         3112,  3819,  5282,  5308,  5317,  5553,  6055,  6330,  6418,
         6510,  6581,  6627,  6997,  7807,  7861,  7993,  8013,  8266,
         8364,  8491,  8536,  8540,  8863,  8924,  9629,  9767, 10024,
        10209, 11077, 11208]))

In [11]:
train.data[1082]

"Anybody got any good/bad experience with selling their car through one of\nthose car hunters?  I'm selling a 1991 Dodge Stealth R/T and I was contacted\nby this company called the Markham group based out of Illinois.  \n\nThey said they have 7-10 buyers in my area interested in my car or they wouldn't\nbe talking to me.  They talked to me for a good 20 minutes asking everything\nabout my car and said they could sell it no problem.  They guaranteed that if\nthey didn't sell my car in 75 days, I would get my money back ($389) and since\nI charged it, I'm protected by federal law which states that if I'm not satisfied,\nI would get a refund (which is true).  They federal expressed all the paperwork \nto me which had a contract stating their policy about the 75 days and such."

In [12]:
indices = np.where(doc0_sim > 0.2)
indices

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([    0,   596,   763,   771,  1082,  1224,  2086,  2519,  2554,
         3112,  3819,  5282,  5308,  5317,  5553,  6055,  6330,  6418,
         6510,  6581,  6627,  6997,  7807,  7861,  7993,  8013,  8266,
         8364,  8491,  8536,  8540,  8863,  8924,  9629,  9767, 10024,
        10209, 11077, 11208]))

## Parte 2: Truncated SVD

In [13]:
red = TruncatedSVD(n_components=500)

In [14]:
tfidf_matrix_red = red.fit_transform(tfidf_matrix)
tfidf_matrix_red

array([[ 0.13222296,  0.02096677, -0.00830601, ...,  0.03745807,
         0.00357795,  0.01301982],
       [ 0.07552046,  0.06374432, -0.00132505, ..., -0.00857671,
        -0.00754528,  0.01769943],
       [ 0.23253291,  0.06565105,  0.00099817, ..., -0.00645781,
        -0.00147018,  0.00680242],
       ...,
       [ 0.05881021,  0.03161327,  0.00197326, ...,  0.04457938,
         0.01787445, -0.02612619],
       [ 0.07695433, -0.02393144, -0.00712309, ...,  0.02720137,
         0.02947992, -0.0183239 ],
       [ 0.06901427,  0.02159481, -0.00194965, ...,  0.00395454,
         0.01518472, -0.00903044]])

In [15]:
cosine_similarity(tfidf_matrix_red)

array([[ 1.        ,  0.03768581,  0.15346158, ...,  0.00338708,
        -0.00879139,  0.08255598],
       [ 0.03768581,  1.        ,  0.14980072, ...,  0.31093065,
         0.03880275,  0.07688204],
       [ 0.15346158,  0.14980072,  1.        , ...,  0.07396756,
         0.00832134,  0.01869046],
       ...,
       [ 0.00338708,  0.31093065,  0.07396756, ...,  1.        ,
         0.02885778, -0.01277499],
       [-0.00879139,  0.03880275,  0.00832134, ...,  0.02885778,
         1.        , -0.01420361],
       [ 0.08255598,  0.07688204,  0.01869046, ..., -0.01277499,
        -0.01420361,  1.        ]])

In [16]:
doc0_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix)
indices = np.where(doc0_sim > 0.2)
indices

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([    0,   596,   763,   771,  1082,  1224,  2086,  2519,  2554,
         3112,  3819,  5282,  5308,  5317,  5553,  6055,  6330,  6418,
         6510,  6581,  6627,  6997,  7807,  7861,  7993,  8013,  8266,
         8364,  8491,  8536,  8540,  8863,  8924,  9629,  9767, 10024,
        10209, 11077, 11208]))

## Parte 3: Matriz Termino-Documento

In [17]:
matTranspose = red.fit_transform(tfidf_matrix.T)

In [18]:
neigh = NearestNeighbors(n_neighbors=20, metric="cosine")
neigh.fit(matTranspose)

In [19]:
idx_1 = tfidf_vec.vocabulary_["engineer"]
_, neig = neigh.kneighbors(matTranspose[idx_1:idx_1+1])

In [20]:
for n in neig[0]:
    print(tfidf_vec.get_feature_names_out()[n])

engineer
computer science
computer
pasadena
retired
don work
jpl nasa
kgb
818
reverse
damned
engineers
electrical
jpl
lawrence
office
cs
electricity
applelink
hardware software


In [21]:
idx_1 = tfidf_vec.vocabulary_["chemical"]
_, neig = neigh.kneighbors(matTranspose[idx_1:idx_1+1])

In [22]:
for n in neig[0]:
    print(tfidf_vec.get_feature_names_out()[n])

chemical
weapons
nuclear weapons
weapon
biological
automatic weapons
semi automatic
chemicals
destruction
nuclear
reactions
mass
bomb
amounts
iranian
retired
2923
ban
explosive
automatic


In [31]:
idx_1 = tfidf_vec.vocabulary_["master"]
idx_2 = tfidf_vec.vocabulary_["engineer"]
idx_3 = tfidf_vec.vocabulary_["student"]

In [44]:
vector = matTranspose[idx_2]-matTranspose[idx_1]+matTranspose[idx_3]
_, neig = neigh.kneighbors([vector])
for n in neig[0]:
    print(tfidf_vec.get_feature_names_out()[n])

student
technician
engineer
dr
grad
dod
jd
ubc
ubc ca
ee
engineering
just say
santa barbara
ca
logically
graduate
lets
tale
barbara
mcgill


## Parte 4: Matriz Termino-Clase

In [33]:
no_topics = 20

nmf = NMF(n_components=no_topics, random_state=1, alpha_W=0.5,
          l1_ratio=1, init='nndsvd', beta_loss="frobenius", max_iter=1000).fit(tfidf_matrix)



In [34]:
tfidf_feature_names = tfidf_vec.get_feature_names_out()

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics(nmf, tfidf_feature_names, no_topics)

Topic 0:
don just like know people think does good use time god ve thanks new make way want say right need
Topic 1:
windows thanks card drive dos file mail advance use files software pc thanks advance program hi scsi video disk window looking
Topic 2:
geb pitt edu gordon banks cadre dsl geb cadre dsl shameful surrender edu shameful dsl pitt surrender soon intellect geb chastity n3jxp skepticism skepticism chastity banks n3jxp n3jxp chastity intellect cadre shameful pitt
Topic 3:
god jesus does windows thanks bible faith christ christians christian believe know advance thanks advance file files church dos hi does know
Topic 4:
key chip encryption government clipper keys use public file law escrow people program algorithm clipper chip security information window nsa secure
Topic 5:
drive scsi key chip disk drives hard ide controller card hard drive floppy bus encryption clipper god government bit use keys
Topic 6:
thanks does mail know thanks advance advance does know email drive address

In [35]:
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,random_state=0).fit(count_matrix)

count_feature_names = count_vec.get_feature_names_out()

display_topics(lda, count_feature_names, no_topics)

Topic 0:
10 25 12 11 16 15 13 14 17 20 18 24 000 55 21 19 26 27 23 22
Topic 1:
thanks edu mail know advance does looking help email hi post send appreciated anybody info thanks advance address information reply like
Topic 2:
president mr stephanopoulos mr stephanopoulos states new april united health 1993 national united states american washington press program congress jobs administration house
Topic 3:
disease patients medical pain doctor edu gordon lib pitt treatment banks blood soon surrender patient skepticism pitt edu gordon banks diseases intellect
Topic 4:
cd sound new radio sale offer auto unit channel price brand shipping tape audio asking radar amp best offer stereo tv
Topic 5:
com runs hit ground lost home neutral wire cubs dog ball run won pitching san dave braves bob milwaukee average
Topic 6:
drive use card disk scsi bit problem like does hard work need using just used mac memory speed drives video
Topic 7:
don just people like think know time good say make ve way going 

In [36]:
mbnmf = MiniBatchNMF(
    n_components=no_topics,
    random_state=1,
    batch_size=32,
    init='nndsvd',
    beta_loss="frobenius",
    alpha_W=0.00005,
    alpha_H=0.00005,
    l1_ratio=0.5,
).fit(tfidf_matrix)

display_topics(nmf, tfidf_feature_names, no_topics)

Topic 0:
don just like know people think does good use time god ve thanks new make way want say right need
Topic 1:
windows thanks card drive dos file mail advance use files software pc thanks advance program hi scsi video disk window looking
Topic 2:
geb pitt edu gordon banks cadre dsl geb cadre dsl shameful surrender edu shameful dsl pitt surrender soon intellect geb chastity n3jxp skepticism skepticism chastity banks n3jxp n3jxp chastity intellect cadre shameful pitt
Topic 3:
god jesus does windows thanks bible faith christ christians christian believe know advance thanks advance file files church dos hi does know
Topic 4:
key chip encryption government clipper keys use public file law escrow people program algorithm clipper chip security information window nsa secure
Topic 5:
drive scsi key chip disk drives hard ide controller card hard drive floppy bus encryption clipper god government bit use keys
Topic 6:
thanks does mail know thanks advance advance does know email drive address

In [37]:
TNG_topics_nmf = nmf.transform(tfidf_matrix)
TNG_topics_lda = lda.transform(count_matrix)

In [38]:
count_docs=[]
for i in range(20):
    count_docs.append({label: 0 for label in train.target_names})
for label,topic in zip(train.target,TNG_topics_nmf.argmax(axis=-1)):
    count_docs[topic][train.target_names[label]] +=1

In [39]:
count_docs[0]

{'alt.atheism': 480,
 'comp.graphics': 584,
 'comp.os.ms-windows.misc': 591,
 'comp.sys.ibm.pc.hardware': 590,
 'comp.sys.mac.hardware': 578,
 'comp.windows.x': 593,
 'misc.forsale': 585,
 'rec.autos': 594,
 'rec.motorcycles': 598,
 'rec.sport.baseball': 597,
 'rec.sport.hockey': 600,
 'sci.crypt': 595,
 'sci.electronics': 591,
 'sci.med': 594,
 'sci.space': 593,
 'soc.religion.christian': 599,
 'talk.politics.guns': 546,
 'talk.politics.mideast': 564,
 'talk.politics.misc': 465,
 'talk.religion.misc': 377}

In [40]:
neigh_nmf = NearestNeighbors(n_neighbors=20, metric="cosine")

In [41]:
neigh_nmf.fit(TNG_topics_nmf)

In [42]:
tokenizer = Tokenizer(num_words=15000)
tokenizer.fit_on_texts(train.data)

In [43]:
idx_1 = tokenizer.word_index["space"]
_, neig = neigh_nmf.kneighbors(TNG_topics_nmf[idx_1:idx_1+1])

for n in neig[0]:
    print(tokenizer.index_word[n])

scenes
pentium
denies
illusion
bone
spine
50mhz
acs
incomplete
headaches
airport
slavery
marijuana
placebo
polish
governmental
attacker
moderate
maintains
gate


In [46]:
idx_1 = tokenizer.word_index["master"]
idx_2 = tokenizer.word_index["engineer"]
idx_3 = tokenizer.word_index["student"]

In [48]:
vector = TNG_topics_nmf[idx_2]-TNG_topics_nmf[idx_1]+TNG_topics_nmf[idx_3]
_, neig = neigh_nmf.kneighbors([vector])
for n in neig[0]:
    print(tokenizer.index_word[n])

scenes
pentium
denies
illusion
bone
spine
50mhz
acs
incomplete
headaches
airport
slavery
marijuana
placebo
polish
governmental
attacker
moderate
maintains
gate


## Parte 5: Matriz Termino-Termino

In [85]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train.data)

In [86]:
seqs=tokenizer.texts_to_sequences(train.data)
seqs=np.hstack(seqs)
seqs=seqs.astype(int)
seqs

array([   7,   26, 1383, ...,  181,   41, 4767])

In [87]:
V = 5001    #ventana chica para evitar problemas de memoria ram al armar la matriz PPMI
counts_matrix=lil_matrix((V-1, V-1))
pairs, labels = skipgrams(sequence=list(seqs), vocabulary_size=V, window_size=5, negative_samples=0, sampling_table=make_sampling_table(V, sampling_factor=1), shuffle=False)
pairs_u, counts = np.unique(pairs,return_counts=True, axis=0)
for num,(pair,count) in enumerate(zip(pairs_u, counts)):
    if num%1000 ==0:
        print(f'\r{num}', end="")
    counts_matrix[pair[0],pair[1]] += count

2932000

In [88]:
print(counts_matrix[0:10,0:10])

  (1, 1)	60865.0
  (1, 3)	31845.0
  (1, 4)	45114.0
  (1, 5)	17608.0
  (1, 6)	27569.0
  (1, 7)	13284.0
  (1, 8)	23722.0
  (1, 9)	20657.0
  (2, 2)	502554.0
  (2, 4)	22.0
  (2, 5)	444.0
  (2, 6)	2.0
  (2, 7)	386.0
  (2, 8)	8.0
  (3, 1)	31943.0
  (3, 3)	12320.0
  (3, 4)	10594.0
  (3, 5)	12475.0
  (3, 6)	11979.0
  (3, 7)	8836.0
  (3, 8)	7277.0
  (3, 9)	8518.0
  (4, 1)	45243.0
  (4, 2)	22.0
  (4, 3)	10594.0
  (4, 4)	11370.0
  (4, 5)	13610.0
  (4, 6)	11659.0
  (4, 7)	5125.0
  (4, 8)	8568.0
  (4, 9)	7841.0
  (5, 1)	17664.0
  (5, 2)	444.0
  (5, 3)	12475.0
  (5, 4)	13610.0
  (5, 5)	10786.0
  (5, 6)	9762.0
  (5, 7)	7708.0
  (5, 8)	7806.0
  (5, 9)	9461.0
  (6, 1)	27644.0
  (6, 2)	2.0
  (6, 3)	11979.0
  (6, 4)	11659.0
  (6, 5)	9762.0
  (6, 6)	8214.0
  (6, 7)	5822.0
  (6, 8)	7542.0
  (6, 9)	6073.0
  (7, 1)	13324.0
  (7, 2)	386.0
  (7, 3)	8836.0
  (7, 4)	5125.0
  (7, 5)	7708.0
  (7, 6)	5822.0
  (7, 7)	6454.0
  (7, 8)	4118.0
  (7, 9)	3539.0
  (8, 1)	23792.0
  (8, 2)	8.0
  (8, 3)	7277.0
  (8, 4)	8568.0

In [89]:
red = TruncatedSVD(n_components=500)
TNG_cv_red = red.fit_transform(counts_matrix)
TNG_cv_red.shape

(5000, 500)

In [90]:
neigh = NearestNeighbors(n_neighbors=20, metric="cosine")
neigh.fit(counts_matrix)

In [91]:
idx_1 = tokenizer.word_index["friends"]
_, neig = neigh.kneighbors(counts_matrix[idx_1:idx_1+1])

for n in neig[0]:
    print(tokenizer.index_word[n])

friends
parents
personal
mother
own
others
them
experience
wife
some
opinion
life
opinions
faith
knowledge
a
every
all
home
brothers


In [92]:
neigh = NearestNeighbors(n_neighbors=20, metric="cosine")
neigh.fit(TNG_cv_red)

In [93]:
idx_1 = tokenizer.word_index["friends"]
_, neig = neigh.kneighbors(TNG_cv_red[idx_1:idx_1+1])

for n in neig[0]:
    print(tokenizer.index_word[n])

friends
parents
personal
mother
wife
brothers
opinions
brother
experience
others
philosophy
own
opinion
fellow
faith
views
beliefs
them
life
neighbor


## Parte 6: Matriz PMI

In [45]:


tf_matrix = counts_matrix.toarray()

df_vector = np.sum(counts_matrix > 0, axis=1)

N = counts_matrix.shape[0]

idf_vector = np.log(N / (df_vector + 1))

print(np.diag(idf_vector))
print(tf_matrix.shape)

NameError: ignored

In [95]:
tfidf_matrix = tf_matrix * np.diag(idf_vector)[0]

In [96]:
total_co_occurrences = np.sum(counts_matrix)
pij_matrix = counts_matrix / total_co_occurrences

In [97]:
pi_vector = np.sum(counts_matrix, axis=1) / total_co_occurrences
pj_vector = np.sum(counts_matrix, axis=0) / total_co_occurrences

In [98]:
a = np.outer(pi_vector, pj_vector)

for i in a:
    for j in range(len(i)):
        if i[j] == 0:
            i[j] += 1e-8

a

array([[1.00000000e-08, 1.00000000e-08, 1.00000000e-08, ...,
        1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
       [1.00000000e-08, 2.75524307e-03, 1.60301068e-03, ...,
        1.06164792e-06, 1.06164792e-06, 1.06164792e-06],
       [1.00000000e-08, 1.60769907e-03, 9.35365309e-04, ...,
        6.19477242e-07, 6.19477242e-07, 6.19477242e-07],
       ...,
       [1.00000000e-08, 1.06475297e-06, 6.19477242e-07, ...,
        4.10269710e-10, 4.10269710e-10, 4.10269710e-10],
       [1.00000000e-08, 1.06475297e-06, 6.19477242e-07, ...,
        4.10269710e-10, 4.10269710e-10, 4.10269710e-10],
       [1.00000000e-08, 1.06475297e-06, 6.19477242e-07, ...,
        4.10269710e-10, 4.10269710e-10, 4.10269710e-10]])

In [99]:
pmi = (pij_matrix * N) / a

for i in pmi:
    for j in range(len(i)):
        if i[j, 0] == 0:
            i[j] += 1e-8
pmi

matrix([[1.00000000e-08, 1.00000000e-08, 1.00000000e-08, ...,
         1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
        [1.00000000e-08, 5.45668121e+03, 1.00000000e-08, ...,
         6.98009952e+02, 4.65339968e+03, 2.32669984e+02],
        [1.00000000e-08, 1.00000000e-08, 1.32715715e+05, ...,
         1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
        ...,
        [1.00000000e-08, 6.95974410e+02, 1.00000000e-08, ...,
         2.40830458e+06, 1.00000000e-08, 1.00000000e-08],
        [1.00000000e-08, 4.63982940e+03, 1.00000000e-08, ...,
         1.00000000e-08, 1.00000000e-08, 1.00000000e-08],
        [1.00000000e-08, 2.31991470e+02, 1.00000000e-08, ...,
         1.00000000e-08, 1.00000000e-08, 1.32456752e+07]])

In [100]:
ppmi_matrix = np.maximum(0, np.log(pmi))

In [101]:
ppmi_matrix = normalize(np.asarray(ppmi_matrix), norm='l2', axis=1)

In [102]:
ppmi_matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.01460424, 0.        , ..., 0.01111406, 0.01433397,
        0.00924943],
       [0.        , 0.        , 0.0697558 , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.05412447, 0.        , ..., 0.12151113, 0.        ,
        0.        ],
       [0.        , 0.05140956, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.03802458, 0.        , ..., 0.        , 0.        ,
        0.11448619]])

In [103]:
red = TruncatedSVD(n_components=500)
ppmi_svd = red.fit_transform(ppmi_matrix)

In [104]:
neigh = NearestNeighbors(n_neighbors=20, metric="cosine")
neigh.fit(ppmi_svd)

In [106]:
idx_1 = tokenizer.word_index["friends"]
_, neig = neigh.kneighbors(ppmi_svd[idx_1:idx_1+1])

for n in neig[0]:
    print(tokenizer.index_word[n])

friends
told
family
asked
coming
happened
tell
her
heard
ever
already
come
remember
ask
perhaps
jews
let
few
anyway
myself
