In [None]:
import numpy as np
from gensim.test.utils import common_texts
from gensim.models import word2vec, FastText
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## 0. Bag Of Words

not using libraries:

In [None]:
def bag_of_words(sentences, vocab=None):
    """
    creates a bag of words embedding matrix:

    sentences is a list of sentences to analyze
    
    vocab is a list of the vocabulary words to use for the analysis
    
    If None, all words within sentences should be used
    
    Returns: embeddings, features
        embeddings is a numpy.ndarray of shape (s, f) containing the embeddings
            s is the number of sentences in sentences
            f is the number of features analyzed
        features is a list of the features used for embeddings
    """
    uniq = set()
    words = []
    count = []
    removes = ["!", ",", ".", "'"]
    for i, sentence in enumerate(sentences):
      words.append([])
      for word in sentence.split(" "):
        if word[-2:] == "'s":
          word = word[:-2]
        if any(x in word for x in removes):
          for character in removes:
	          word = word.replace(character, "")
        if (not vocab) or (vocab and word.lower() in vocab):
            words[i].append(word.lower())
            uniq.add(word.lower())
    uniq = (sorted(uniq))
    # print(words)
    # print(uniq)
    for i, sent in enumerate(words):
      count.append([])
      for wrd in uniq:
        count[i].append(sent.count(wrd))
    # print(count)

    return np.array(count), uniq


using libraries:

In [None]:
def bag_of_words(sentences, vocab=None):
    """creates a bag of words embedding matrix"""
    vectorizer = CountVectorizer()
    newArray = vectorizer.fit_transform(sentences)
    embeddings = newArray.toarray()
    features = vectorizer.get_feature_names_out()

    return embeddings, features

In [None]:
sentences = ["Holberton school is Awesome!",
             "Machine learning is awesome",
             "NLP is the future!",
             "The children are our future",
             "Our children's children are our grandchildren",
             "The cake was not very good",
             "No one said that the cake was not very good",
             "Life is beautiful"]
vocab = ["holberton", "cake"]
E, F = bag_of_words(sentences, vocab)
print(E)
print(F)


##############

# [[0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
#  [0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
#  [0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
#  [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0]
#  [1 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0]
#  [0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1]
#  [0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 1]
#  [0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0]]
# ['are', 'awesome', 'beautiful', 'cake', 'children', 'future', 'good', 'grandchildren', 'holberton', 'is', 'learning', 'life', 'machine', 'nlp', 'no', 'not', 'one', 'our', 'said', 'school', 'that', 'the', 'very', 'was']

[[0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]]
['cake', 'holberton']


## 1. TF-IDF

not using libraries

In [None]:
# t — term (word)
# d — document (set of words)
# N — count of corpus
# corpus — the total document set


# tf-idf(t, d) = tf(t, d) * log(N/(df + 1))

# Log(number of sentences / number of sentences word occurs in)

def tf_idf(sentences, vocab=None):
    """
    sentences is a list of sentences to analyze

    vocab is a list of the vocabulary words to use for the analysis

    If None, all words within sentences should be used
    
    Returns: embeddings, features
        embeddings is a numpy.ndarray of shape (s, f) containing the embeddings
            s is the number of sentences in sentences
            f is the number of features analyzed
        features is a list of the features used for embeddings
    """

    words = []
    if vocab:
      embeddings = vocab
    else:
      embeddings = set()
    removes = ["!", ",", ".", "'"]
    for i, sentence in enumerate(sentences):
      words.append([])
      for word in sentence.split(" "):
        if word[-2:] == "'s":
          word = word[:-2]
        if any(x in word for x in removes):
          for character in removes:
            word = word.replace(character, "")
        if vocab is None:
          embeddings.add(word.lower())
        words[i].append(word.lower())
    if vocab is None:
      embeddings = (sorted(embeddings))
    print(embeddings)
    print(words)
    print(tf(embeddings, words))
    print(idf(embeddings, words))

def tf(t, d):
  """tf(t,d) = count of t in d / number of words in d"""
  term_freq = []
  for i, sentence in enumerate(d):
    term_freq.append([])
    for word in [x for x in sentence if x in set(sentence)]:
      if word in set(t).intersection(sentence):
        term_freq[i].append(sentence.count(word) / len(set(t).intersection(sentence)))
      else:
        term_freq[i].append(0)
  return term_freq

def idf(t, d):
    """
    df(t) = occurrence of t in N documents
    idf(t) = N/df
    idf(t) = log(N/(df + 1))
    """
    count = {}
    for sentence in d:
      for word in set(sentence):
        if count.get(word) is None:
          count[word] = 1
        else:
          count[word] += 1

    inv_doc_freq = []
    for i, sentence in enumerate(d):
      inv_doc_freq.append([])
      for word in sentence:
        if word in t:
          inv_doc_freq[i].append(np.log10(len(sentence) / count[word] + 1))
        else:
          inv_doc_freq[i].append(0)
    return inv_doc_freq




using libraries

In [None]:
def tf_idf(sentences, vocab=None):
    """creates a TF-IDF embedding"""
    tfidfVectorizer = TfidfVectorizer(vocabulary=vocab)
    newArray = tfidfVectorizer.fit_transform(sentences)
    embeddings = newArray.toarray()
    features = tfidfVectorizer.get_feature_names_out()
    return embeddings, features

In [None]:
sentences = ["Holberton school is Awesome!",
             "Machine learning is awesome",
             "NLP is the future!",
             "The children are our future",
             "Our children's children are our grandchildren",
             "The cake was not very good",
             "No one said that the cake was not very good",
             "Life is beautiful"]

vocab = ["awesome", "learning", "children", "cake", "good", "none", "machine"]
E, F = tf_idf(sentences, vocab)
print(E)
print(F)


# [[1.         0.         0.         0.         0.         0.         0.        ]
#  [0.5098139  0.60831315 0.         0.         0.         0.         0.60831315]
#  [0.         0.         0.         0.         0.         0.         0.        ]
#  [0.         0.         1.         0.         0.         0.         0.        ]
#  [0.         0.         1.         0.         0.         0.         0.        ]
#  [0.         0.         0.         0.70710678 0.70710678 0.         0.        ]
#  [0.         0.         0.         0.70710678 0.70710678 0.         0.        ]
#. [0.         0.         0.         0.         0.         0.         0.        ]]

# ['awesome', 'learning', 'children', 'cake', 'good', 'none', 'machine']

['awesome', 'learning', 'children', 'cake', 'good', 'none', 'machine']
[['holberton', 'school', 'is', 'awesome'], ['machine', 'learning', 'is', 'awesome'], ['nlp', 'is', 'the', 'future'], ['the', 'children', 'are', 'our', 'future'], ['our', 'children', 'children', 'are', 'our', 'grandchildren'], ['the', 'cake', 'was', 'not', 'very', 'good'], ['no', 'one', 'said', 'that', 'the', 'cake', 'was', 'not', 'very', 'good'], ['life', 'is', 'beautiful']]
[[0, 0, 0, 1.0], [0.3333333333333333, 0.3333333333333333, 0, 0.3333333333333333], [0, 0, 0, 0], [0, 1.0, 0, 0, 0], [0, 2.0, 2.0, 0, 0, 0], [0, 0.5, 0, 0, 0, 0.5], [0, 0, 0, 0, 0, 0.5, 0, 0, 0, 0.5], [0, 0, 0]]
[[0, 0, 0, 0.47712125471966244], [0.6989700043360189, 0.6989700043360189, 0, 0.47712125471966244], [0, 0, 0, 0], [0, 0.5440680443502757, 0, 0, 0], [0, 0.6020599913279624, 0.6020599913279624, 0, 0, 0], [0, 0.6020599913279624, 0, 0, 0, 0.6020599913279624], [0, 0, 0, 0, 0, 0.7781512503836436, 0, 0, 0, 0.7781512503836436], [0, 0, 0]]


TypeError: ignored

## 2. Train Word2Vec

In [None]:
def word2vec_model(sentences, size=100, min_count=5, window=5, negative=5, cbow=True, iterations=5, seed=0, workers=1):
    """
    creates and trains a gensim word2vec model:

    sentences is a list of sentences to be trained on
    size is the dimensionality of the embedding layer
    min_count is the minimum number of occurrences of a word for use in training
    window is the maximum distance between the current and predicted word within a sentence
    negative is the size of negative sampling
    cbow is a boolean to determine the training type; True is for CBOW; False is for Skip-gram
    iterations is the number of iterations to train over
    seed is the seed for the random number generator
    workers is the number of worker threads to train the model
    
    Returns: the trained model
    """
    return Word2Vec(sentences=sentences, size=size, min_count=min_count, window=window, negative=negative, sg=(not cbow), iter=iterations, seed=seed, workers=workers)
    

print(common_texts)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


In [None]:
print(common_texts[:2])
w2v = word2vec_model(common_texts, min_count=1)
print(w2v.wv["computer"])

"""

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]

[-3.0043968e-03  1.5343886e-03  4.0832465e-03  3.7239199e-03
  4.9583608e-04  4.8461729e-03 -1.0620747e-03  8.2803884e-04
  9.7367732e-04 -6.7797926e-05 -1.5526683e-03  1.8058836e-03
 -4.3851901e-03  4.7258494e-04  2.8616134e-03 -2.2246949e-03
  2.7494587e-03 -3.5267104e-03  3.0259083e-03  2.7240592e-03
  2.6110576e-03 -4.5409841e-03  4.9135066e-03  8.2884904e-04
  2.7018311e-03  1.5654180e-03 -1.5859824e-03  9.3057036e-04
  3.7275942e-03 -3.6502020e-03  2.8285771e-03 -4.2384453e-03
  3.2712172e-03 -1.9101484e-03 -1.8624340e-03 -5.6956144e-04
 -1.5617535e-03 -2.3851227e-03 -1.4313431e-05 -4.3398165e-03
  3.9115595e-03 -3.0616210e-03  1.7589398e-03 -3.4103722e-03
  4.7280011e-03  1.9380470e-03 -3.3873315e-03  8.4065803e-04
  2.6089977e-03  1.7012059e-03 -2.7421617e-03 -2.2240754e-03
 -5.3690566e-04  2.9577864e-03  2.3726511e-03  3.2704175e-03
  2.0853498e-03 -1.1927494e-03 -2.1565862e-03 -9.0970926e-04
 -2.8641665e-04 -3.4961947e-03  1.1104723e-03  1.2320089e-03
 -5.9017556e-04 -3.0594901e-03  3.6974431e-03 -1.8557351e-03
 -3.8218759e-03  9.2711346e-04 -4.3113795e-03 -4.4118706e-03
  4.7748778e-03 -4.5557776e-03 -2.2665847e-03 -8.2379003e-04
 -7.9581753e-04 -1.3048936e-03  1.9261248e-03  3.1299898e-03
 -1.9034051e-03 -2.0335305e-03 -2.6451424e-03  1.7377195e-03
  6.7217485e-04 -2.4134698e-03  4.3735080e-03 -3.2599240e-03
 -2.2431149e-03  4.4288361e-03  1.4923669e-04 -2.2144278e-03
 -8.9370424e-04 -2.7281314e-04 -1.7176758e-03  1.2485087e-03
  1.3230384e-03  1.7001784e-04  3.5425189e-03 -1.7469387e-04]

"""



[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
[ 2.6840649e-03  6.4447825e-04  3.4108965e-03  2.8856262e-03
  1.4741103e-03  2.5217915e-03  4.4861427e-03  1.5291966e-03
  3.8648255e-03 -1.2185492e-03 -3.0003262e-03 -4.5899902e-03
 -4.3055534e-04 -3.1804068e-03 -1.2025647e-03 -3.8910558e-04
 -4.4687049e-04  1.6551025e-03  9.9483936e-04  1.7923175e-03
 -2.4460943e-03  3.6044952e-03 -3.2551922e-03 -1.7311508e-03
  2.4985897e-03 -3.7432967e-03 -1.7055349e-03  4.5772786e-03
  1.9005014e-03 -1.8235850e-04 -2.7836591e-03 -4.4547254e-03
  4.5784693e-03 -3.9131506e-03 -2.5448117e-03 -7.4262702e-04
 -1.0316937e-03  1.5356285e-03  2.6558826e-03  4.6658679e-03
  3.9901026e-03  3.8648692e-03  1.0993927e-03 -3.2756033e-03
 -2.3666609e-03 -1.3976176e-03 -9.3585759e-04  1.2577592e-03
  4.8840763e-03 -4.0973085e-03 -1.9579716e-03 -4.7535361e-03
  3.9634466e-05  4.7453018e-03  1.3649703e-03  3.3736965e-03
  3.8982457e-03 -9.2950824e-05 -2.3905351e-03 -

"\n\n[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]\n\n[-3.0043968e-03  1.5343886e-03  4.0832465e-03  3.7239199e-03\n  4.9583608e-04  4.8461729e-03 -1.0620747e-03  8.2803884e-04\n  9.7367732e-04 -6.7797926e-05 -1.5526683e-03  1.8058836e-03\n -4.3851901e-03  4.7258494e-04  2.8616134e-03 -2.2246949e-03\n  2.7494587e-03 -3.5267104e-03  3.0259083e-03  2.7240592e-03\n  2.6110576e-03 -4.5409841e-03  4.9135066e-03  8.2884904e-04\n  2.7018311e-03  1.5654180e-03 -1.5859824e-03  9.3057036e-04\n  3.7275942e-03 -3.6502020e-03  2.8285771e-03 -4.2384453e-03\n  3.2712172e-03 -1.9101484e-03 -1.8624340e-03 -5.6956144e-04\n -1.5617535e-03 -2.3851227e-03 -1.4313431e-05 -4.3398165e-03\n  3.9115595e-03 -3.0616210e-03  1.7589398e-03 -3.4103722e-03\n  4.7280011e-03  1.9380470e-03 -3.3873315e-03  8.4065803e-04\n  2.6089977e-03  1.7012059e-03 -2.7421617e-03 -2.2240754e-03\n -5.3690566e-04  2.9577864e-03  2.3726511e-03  3.2704175e-03\n  2.0853498e-03 -1.192749

In [None]:
print(dir(w2v))

['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_check_input_data_sanity', '_check_training_sanity', '_clear_post_train', '_do_train_epoch', '_do_train_job', '_get_job_params', '_get_thread_working_mem', '_job_producer', '_load_specials', '_log_epoch_end', '_log_epoch_progress', '_log_progress', '_log_train_end', '_minimize_model', '_raw_word_count', '_save_specials', '_set_train_params', '_smart_save', '_train_epoch', '_train_epoch_corpusfile', '_update_job_params', '_worker_loop', '_worker_loop_corpusfile', 'accuracy', 'alpha', 'batch_words', 'build_vocab', 'build_vocab_from_freq', 'callbacks', 'cbow_mean', 'clear_sims', 'compute_loss', 'cor

In [None]:
print(dir(w2v.vocabulary))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_load_specials', '_save_specials', '_scan_vocab', '_smart_save', 'add_null_word', 'create_binary_tree', 'cum_table', 'effective_min_count', 'load', 'make_cum_table', 'max_final_vocab', 'max_vocab_size', 'min_count', 'ns_exponent', 'null_word', 'prepare_vocab', 'raw_vocab', 'sample', 'save', 'scan_vocab', 'sort_vocab', 'sorted_vocab']


In [None]:
print(w2v.wv.vocab.keys())

dict_keys(['human', 'interface', 'computer', 'survey', 'user', 'system', 'response', 'time', 'eps', 'trees', 'graph', 'minors'])


In [None]:
print(w2v.wv["human"].size)

100


## 3. Extract Word2Vec

In [None]:
def gensim_to_keras(model):
    """
    converts a gensim word2vec model to a keras Embedding layer:

    model is a trained gensim word2vec models
    Returns: the trainable keras Embedding
    """
    return model.wv.get_keras_embedding()


In [None]:
print(common_texts[:2])
w2v = word2vec_model(common_texts, min_count=1)
print(gensim_to_keras(w2v))


# [['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
# Using TensorFlow backend.
# <keras.layers.embeddings.Embedding object at 0x7f72e2c1bd30>



[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
<keras.layers.embeddings.Embedding object at 0x7f8f23228c50>


## 4. FastText

In [None]:
def fasttext_model(sentences, size=100, min_count=5, negative=5, window=5, cbow=True, iterations=5, seed=0, workers=1):
    """
    creates and trains a genism fastText model:

    sentences is a list of sentences to be trained on
    size is the dimensionality of the embedding layer
    min_count is the minimum number of occurrences of a word for use in training
    window is the maximum distance between the current and predicted word within a sentence
    negative is the size of negative sampling
    cbow is a boolean to determine the training type; True is for CBOW; False is for Skip-gram
    iterations is the number of iterations to train over
    seed is the seed for the random number generator
    workers is the number of worker threads to train the model
    
    Returns: the trained model
    """
    return FastText(sentences=sentences, size=size, min_count=min_count, negative=negative, window=window, sg=(not cbow), iter=iterations, seed=seed, workers=workers)

In [None]:
print(common_texts[:2])
ft = fasttext_model(common_texts, min_count=1)
print(ft.wv["computer"])


# [['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
# [-2.3464665e-03 -1.4542247e-04 -3.9549544e-05 -1.5817649e-03
#  -2.1579072e-03  4.5148263e-04  9.9494774e-04  3.2517681e-05
#   1.7035202e-04  6.8571279e-04 -2.0803163e-04  5.3083687e-04
#   1.2990861e-03  3.5418154e-04  2.1087916e-03  1.1022155e-03
#   6.2364555e-04  1.8612258e-05  1.8982493e-05  1.3051173e-03
#  -6.0260214e-04  1.6334689e-03 -1.0172457e-06  1.4247939e-04
#   1.1081318e-04  1.8327738e-03 -3.3656979e-04 -3.7365756e-04
#   8.0635358e-04 -1.2945861e-04 -1.1031038e-04  3.4695750e-04
#  -2.1932719e-04  1.4800908e-03  7.7851227e-04  8.6328381e-04
#  -9.7545242e-04  6.0775197e-05  7.1560958e-04  3.6474539e-04
#   3.3428212e-05 -1.0499550e-03 -1.2412234e-03 -1.8492664e-04
#  -4.8664736e-04  1.9178988e-04 -6.3863385e-04  3.3325219e-04
#  -1.5724128e-03  1.0003068e-03  1.7905374e-04  7.8452297e-04
#   1.2625050e-04  8.1183662e-04 -4.9907330e-04  1.0475471e-04
#   1.4351985e-03  4.9145994e-05 -1.4620423e-03  3.1466845e-03
#   2.0059240e-05  1.6659468e-03 -4.3319576e-04  1.3077060e-03
#  -2.0228853e-03  5.7626975e-04 -1.4056480e-03 -4.2292831e-04
#   6.4076332e-04 -8.5614284e-04  1.9028617e-04  6.0735084e-04
#   2.6121829e-04 -1.0566596e-03  1.0602509e-03  1.2843860e-03
#   7.9715136e-04  2.8305652e-04  1.9187009e-04 -1.0519206e-03
#  -8.2213630e-04 -2.1762338e-04 -1.7580058e-04  1.2764390e-04
#  -1.5695200e-03  1.3364316e-03 -1.5765150e-03  1.4802803e-03
#   1.5476452e-03  2.1928034e-04 -9.3281898e-04  3.2964293e-04
#  -1.0146293e-03 -1.3567278e-03  1.8070930e-03 -4.2649341e-04
#  -1.9074128e-03  7.1639987e-04 -1.3686880e-03  3.7073060e-03]



[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
[-5.37086918e-04 -1.24788750e-03  4.26387560e-04 -1.64840661e-03
 -1.26233627e-03 -9.70027177e-04  1.58336700e-03 -3.69816145e-04
  3.54576798e-04  6.68553868e-04 -1.40044536e-03 -2.95279955e-04
 -1.74647244e-03  4.83299926e-04 -7.00303528e-04 -1.38734875e-03
 -9.34833719e-04  8.75205078e-05 -1.94834045e-03 -6.47016277e-04
 -4.64912155e-04 -4.82959120e-04 -7.10097316e-04  1.67512067e-03
  1.32140249e-03 -2.90164608e-04 -3.57706216e-04  2.55389675e-03
 -1.87956169e-03 -5.06602984e-04  9.76737123e-04 -1.34637428e-03
 -8.12131213e-04  1.45547325e-03  6.51960319e-04  8.00709473e-04
 -3.34904289e-05  8.94694065e-04  2.31606164e-03 -1.36979762e-03
 -1.48915546e-03 -2.37159710e-03 -2.29890775e-05 -5.33601909e-04
 -2.73135165e-03  2.07623071e-03  6.72957569e-04  1.70797308e-03
 -3.49627645e-03 -1.79815688e-03  4.18260635e-04 -1.50577875e-03
 -6.14957185e-04 -3.71963513e-04  7.68071914e-05 -1.4812

## 5. ELMo

In [None]:
import tensorflow_hub as hub
import tensorflow as tf

In [None]:
tf.compat.v1.disable_eager_execution()
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

RuntimeError: ignored

In [None]:
# just a random sentence
x = ["Roasted ants are a popular snack in Columbia"]

# Extract ELMo features 
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

embeddings.shape


## Word2Vec Playground

In [None]:
from gensim.models import word2vec
from gensim.parsing import preprocessing

In [None]:
stopwords = """
a about after all also always am an and any are at be been being
but by came can cant come could did didn't do does doesn't doing
don't else for from get give goes going had happen has have having
how i if ill i'm in into is isn't it its i've just keep let like made
make many may me mean more most much no not now of only or our really
say see some something take tell than that the their them then they
thing this to try up us use used uses very want was way we what when
where which who why will with without wont you your youre uh gonna so like yknow theres""".replace("'", "").replace("\n", "").split(" ")
print(stopwords)

['a', 'about', 'after', 'all', 'also', 'always', 'am', 'an', 'and', 'any', 'are', 'at', 'be', 'been', 'being', 'but', 'by', 'came', 'can', 'cant', 'come', 'could', 'did', 'didnt', 'do', 'does', 'doesnt', 'doing', 'dont', 'else', 'for', 'from', 'get', 'give', 'goes', 'going', 'had', 'happen', 'has', 'have', 'having', 'how', 'i', 'if', 'ill', 'im', 'in', 'into', 'is', 'isnt', 'it', 'its', 'ive', 'just', 'keep', 'let', 'like', 'made', 'make', 'many', 'may', 'me', 'mean', 'more', 'most', 'much', 'no', 'not', 'now', 'of', 'only', 'or', 'our', 'really', 'say', 'see', 'some', 'something', 'take', 'tell', 'than', 'that', 'the', 'their', 'them', 'then', 'they', 'thing', 'this', 'to', 'try', 'up', 'us', 'use', 'used', 'uses', 'very', 'want', 'was', 'way', 'we', 'what', 'when', 'where', 'which', 'who', 'why', 'will', 'with', 'without', 'wont', 'you', 'your', 'youre']


In [None]:
filepath = "/content/drive/MyDrive/Colab Notebooks/Machine Learning/data/TAZ Ethersea text sample.txt"
punc = """!"#$%&'()*+,-/:;<=>?@[\]^_`{|}~"""
with open(filepath, "r") as f:
    text = (preprocessing.stem(f.read()))
    text = preprocessing.remove_stopwords(text)
    text = text.translate(str.maketrans('', '', punc))
    text = text.split(".")
    # for sent in text:
    #  sent = preprocessing.strip_multiple_whitespaces(preprocessing.strip_punctuation(sent))
    #text = preprocessing.strip
    #text = text.split(".")
    for i, sentence in enumerate(text):
      text[i] = sentence.split(" ")
      if i == 0:
        text[i][0] = text[i][0].translate(str.maketrans('', '', '\ufeff'))
print(text)

model = word2vec.Word2Vec(sentences=text)


"""
sentences: Any 
corpus_file: Any 
size: int = 100, 
alpha: float = 0.025, 
window: int = 5, 
min_count: int = 5, 
max_vocab_size: Any 
sample: float = 0.001, 
seed: int = 1, 
workers: int = 3, 
min_alpha: float = 0.0001, 
sg: int = 0, 
hs: int = 0, 
negative: int = 5, 
ns_exponent: float = 0.75, 
cbow_mean: int = 1, 
hashfxn: (__obj: object, /) -> int = hash, 
iter: int = 5, 
null_word: int = 0, 
trim_rule: Any, 
sorted_vocab: int = 1, 
batch_words: Any | int = MAX_WORDS_IN_BATCH, 
compute_loss: bool = False, 
callbacks: Any = (), 
max_final_vocab: Any
"""



[['a', 'brief', 'histori', 'wast', 'world', 'primer', 'brother', 'seldom'], ['', 'wit', 'firsthand', 'deep', 'rest', 'slumber', 'sermon', 'induc', 'congregation'], ['', 'so', 'attempt', 'thi', 'succinct'], ['', 'doze', 'dure', 'thi', 'lesson', 'well', 'wouldnt', 'worri', 'much'], ['', 'time', 'hear', 'this', 'world', 'im', 'describ', 'long', 'sinc', 'scour', 'map'], ['', 'let', 'begin', 'vestiges'], ['', 'accur', 'describ', 'gods', 'theyd', 'certainli', 'prefer', 'such'], ['', 'thei', 'cross', 'world', 'eon', 'ago', 'ancient', 'ancestor', 'consid', 'patrons'], ['', 'merciless', 'oppressors'], ['', 'civil', 'sprout', 'beings’', 'feet', 'immediately—', 'perhap', 'unsurprisingly—', 'thei', 'began', 'vy', 'superiority'], ['', 'differences', 'though', 'vestig', 'claim', 'world', 'agre', 'thing', 'oath', 'forsworn', 'power', 'magic', 'share', 'mortalkind'], ['th', 'vestig', 'benevol', 'furious', 'covet', 'ador', 'live', 'creatur', 'that', 'well'], [''], [''], ['', 'broke', 'oath'], ['', 'sha

'\nsentences: Any \ncorpus_file: Any \nsize: int = 100, \nalpha: float = 0.025, \nwindow: int = 5, \nmin_count: int = 5, \nmax_vocab_size: Any \nsample: float = 0.001, \nseed: int = 1, \nworkers: int = 3, \nmin_alpha: float = 0.0001, \nsg: int = 0, \nhs: int = 0, \nnegative: int = 5, \nns_exponent: float = 0.75, \ncbow_mean: int = 1, \nhashfxn: (__obj: object, /) -> int = hash, \niter: int = 5, \nnull_word: int = 0, \ntrim_rule: Any, \nsorted_vocab: int = 1, \nbatch_words: Any | int = MAX_WORDS_IN_BATCH, \ncompute_loss: bool = False, \ncallbacks: Any = (), \nmax_final_vocab: Any\n'

In [None]:
print(model.wv.vocab.keys())

dict_keys(['a', 'history', 'world', '', 'deep', 'so', 'well', 'wouldnt', 'time', 'this', 'im', 'long', 'map', 'let', 'vestiges', 'called', 'though', 'thing', 'magic', 'the', 'living', 'that', 'people', 'new', 'hominine', 'building', 'land', 'like', 'way', 'coastline', 'einarr', 'plateau', 'old', 'delmer', 'great', 'and', 'southern', 'archipelago', 'fun', 'stuff', 'point', 'it', 'quiet', 'weve', 'see', 'goes', 'kind', 'dangerous', 'energy', 'river', 'things', 'sea', 'us', 'waves', 'kingdoms', 'voice', 'you', 'um', 'it’s', 'travis', 'right', 'yeah', 'juice', 'dad', 'bit', 'different', 'sort', 'build', 'season', 'its', 'got', 'uh', 'guys', 'cause', 'thats', 'youre', 'here', 'were', 'gonna', 'play', 'game', 'playing', 'year', 'think', 'feel', 'bad', 'essentially', 'idea', 'youve', 'place', 'city', 'built', 'do', 'yknow', 'mean', 'maybe', 'basically', 'settlement', 'going', 'in', 'drawing', 'maps', 'draw', 'them', 'dont', 'know', 'making', 'heard', 'is', 'ive', 'underwater', 'a—', 'magical'

In [None]:
print(model.wv['magic'])
print(model.wv.most_similar('magic'))

[ 0.11070689 -0.04326563 -0.07025176 -0.02197586 -0.16566993 -0.15370531
  0.04380529  0.15827388 -0.1414026  -0.02773348 -0.01599527  0.1678507
 -0.00504224  0.01444732  0.00658129 -0.02415843 -0.05130475  0.07954128
  0.07823192 -0.03838196 -0.07973897  0.13395567  0.03927572  0.1232245
 -0.07007881  0.09781245 -0.07983482  0.04816478  0.07353617 -0.20476517
  0.02294433 -0.01362617 -0.00920658  0.03366044 -0.09999227 -0.16492425
 -0.06400285  0.10299286  0.06651793 -0.05999421  0.16078754 -0.14516774
 -0.18376403  0.08654997 -0.0686333   0.2369648   0.12015744  0.03571154
 -0.01063819 -0.02845631 -0.01453001  0.03623966 -0.01656967  0.16813534
 -0.08554276 -0.06004462 -0.14328283 -0.05528447 -0.07053018  0.01357325
 -0.09743027  0.12529448 -0.09605609  0.08073322  0.03529699 -0.13439211
  0.02007903 -0.20720272 -0.15754841  0.13191082 -0.20082024  0.11221711
  0.01020696 -0.12648289  0.08728204 -0.18385592  0.08640877 -0.10183995
  0.00490768  0.26000983 -0.15254776  0.14123559 -0.1

In [None]:
print(preprocessing.STOPWORDS)

frozenset({'front', 'every', 'inc', 'so', 'otherwise', 'something', 'until', 'hundred', 'hereafter', 'just', 'doesn', 'must', 'whereafter', 'many', 'yours', 'noone', 'less', 'himself', 'onto', 'thru', 'our', 'without', 'whoever', 'and', 'interest', 'above', 'hence', 'most', 'regarding', 'latterly', 'do', 'besides', 'full', 'did', 'others', 'whereas', 'amount', 'someone', 'became', 'myself', 'ten', 'other', 'indeed', 'everything', 'least', 'could', 'eg', 'beforehand', 'name', 'behind', 'beside', 'how', 'any', 'anyhow', 'somewhere', 'often', 'due', 'more', 'didn', 'him', 'per', 'there', 'over', 'is', 'itself', 'yourselves', 'really', 'against', 'thick', 'alone', 'few', 'else', 'beyond', 'wherever', 'mine', 'together', 'find', 'hereby', 'get', 'where', 'during', 'move', 'becomes', 'upon', 'whereupon', 'these', 'sometime', 'empty', 'mill', 'wherein', 'part', 'used', 'cant', 'using', 'into', 'nobody', 'in', 'your', 'serious', 'anything', 'whence', 'four', 'cry', 'should', 'are', 'its', 'why

## FastText Playground

In [None]:
stopwords = """
a about after all also always am an and any are at be been being
but by came can cant come could did didn't do does doesn't doing
don't else for from get give goes going had happen has have having
how i if ill i'm in into is isn't it its i've just keep let like made
make many may me mean more most much no not now of only or our really
say see some something take tell than that the their them then they
thing this to try up us use used uses very want was way we what when
where which who why will with without wont you your youre uh gonna so like yknow theres""".replace("'", "").replace("\n", "").split(" ")
print(stopwords)

filepath = "/content/drive/MyDrive/Colab Notebooks/Machine Learning/data/TAZ Ethersea text sample.txt"
punc = """!"#$%&'()*+,-/:;<=>?@[\]^_`{|}~"""
with open(filepath, "r") as f:
    text = (preprocessing.stem(f.read()))
    text = preprocessing.remove_stopwords(text)
    text = text.translate(str.maketrans('', '', punc))
    text = text.split(".")
    # for sent in text:
    #  sent = preprocessing.strip_multiple_whitespaces(preprocessing.strip_punctuation(sent))
    #text = preprocessing.strip
    #text = text.split(".")
    for i, sentence in enumerate(text):
      text[i] = sentence.split(" ")
      if i == 0:
        text[i][0] = text[i][0].translate(str.maketrans('', '', '\ufeff'))
print(text)

model = FastText(sentences=text)


"""
sentences=None 
corpus_file=None
sg=0
hs=0
size=100
alpha=0.025
window=5
min_count=5
max_vocab_size=None
word_ngrams=1
sample=0.001
seed=1
workers=3
min_alpha=0.0001
negative=5
ns_exponent=0.75
cbow_mean=1
hashfxn=hash
iter=5
null_word=0
min_n=3
max_n=6
sorted_vocab=1
bucket=2000000
trim_rule=None
batch_words=MAX_WORDS_IN_BATCH
callbacks=()
"""

['a', 'about', 'after', 'all', 'also', 'always', 'am', 'an', 'and', 'any', 'are', 'at', 'be', 'been', 'beingbut', 'by', 'came', 'can', 'cant', 'come', 'could', 'did', 'didnt', 'do', 'does', 'doesnt', 'doingdont', 'else', 'for', 'from', 'get', 'give', 'goes', 'going', 'had', 'happen', 'has', 'have', 'havinghow', 'i', 'if', 'ill', 'im', 'in', 'into', 'is', 'isnt', 'it', 'its', 'ive', 'just', 'keep', 'let', 'like', 'mademake', 'many', 'may', 'me', 'mean', 'more', 'most', 'much', 'no', 'not', 'now', 'of', 'only', 'or', 'our', 'reallysay', 'see', 'some', 'something', 'take', 'tell', 'than', 'that', 'the', 'their', 'them', 'then', 'theything', 'this', 'to', 'try', 'up', 'us', 'use', 'used', 'uses', 'very', 'want', 'was', 'way', 'we', 'what', 'whenwhere', 'which', 'who', 'why', 'will', 'with', 'without', 'wont', 'you', 'your', 'youre', 'uh', 'gonna', 'so', 'like', 'yknow', 'theres']
[['a', 'brief', 'histori', 'wast', 'world', 'primer', 'brother', 'seldom'], ['', 'wit', 'firsthand', 'deep', 'r



'\nsentences=None \ncorpus_file=None\nsg=0\nhs=0\nsize=100\nalpha=0.025\nwindow=5\nmin_count=5\nmax_vocab_size=None\nword_ngrams=1\nsample=0.001\nseed=1\nworkers=3\nmin_alpha=0.0001\nnegative=5\nns_exponent=0.75\ncbow_mean=1\nhashfxn=hash\niter=5\nnull_word=0\nmin_n=3\nmax_n=6\nsorted_vocab=1\nbucket=2000000\ntrim_rule=None\nbatch_words=MAX_WORDS_IN_BATCH\ncallbacks=()\n'

In [None]:
for meth in dir(model):
  if meth[0] != "_":
    print(meth)

accuracy
alpha
batch_words
bucket
build_vocab
build_vocab_from_freq
callbacks
cbow_mean
clear_sims
compute_loss
corpus_count
corpus_total_words
cum_table
doesnt_match
epochs
estimate_memory
evaluate_word_pairs
hashfxn
hs
init_sims
iter
layer1_size
load
load_binary_data
load_fasttext_format
max_n
min_alpha
min_alpha_yet_reached
min_count
min_n
model_trimmed_post_training
most_similar
most_similar_cosmul
n_similarity
negative
ns_exponent
num_ngram_vectors
random
running_training_loss
sample
save
sg
similar_by_vector
similar_by_word
similarity
struct_unpack
syn0_lockf
syn0_ngrams_lockf
syn0_vocab_lockf
syn1
syn1neg
total_train_time
train
train_count
trainables
vector_size
vocabulary
window
wmdistance
word_ngrams
workers
wv


In [None]:
print(model.wv.vocab.keys())
print(model.wv["magic"])
print(model.most_similar("magic"))

dict_keys(['a', 'world', '', 'deep', 'so', 'thi', 'dure', 'well', 'wouldnt', 'time', 'this', 'im', 'long', 'map', 'let', 'thei', 'though', 'vestig', 'thing', 'power', 'magic', 'share', 'live', 'that', 'hi', 'peopl', 'new', 'kingdom', 'hominine', 'build', 'land', 'like', 'way', 'war', 'wa', 'hominin', 'einarr', 'plateau', 'old', 'wai', 'delmer', 'great', 'and', 'southern', 'archipelago', 'work', 'fun', 'stuff', 'point', 'it', 'quiet', 'wev', 'see', 'ha', 'kind', 'everi', 'river', 'sea', 'ocean', 'turn', 'us', 'wave', 'everyth', 'mean', 'you', 'um', 'it’', 'sai', 'travis', 'becaus', 'hold', 'right', 'yeah', 'dad', 'bit', 'someth', 'differ', 'episod', 'sort', 'season', 'got', 'uh', 'gui', 'caus', 'your', 'here', 'wer', 'gonna', 'plai', 'game', 'year', 'think', 'feel', 'bad', 'essenti', 'idea', 'youv', 'place', 'citi', 'built', 'do', 'yknow', 'mayb', 'basic', 'ar', 'settlement', 'in', 'draw', 'maps', 'them', 'dont', 'know', 'mani', 'heard', 'the', 'is', 'iv', 'underwat', 'city', 'a—', 'did

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
for meth in dir(model.wv):
  if meth[0] != "_":
    print(meth)

accuracy
add
bucket
buckets_word
closer_than
cosine_similarities
distance
distances
doesnt_match
evaluate_word_analogies
evaluate_word_pairs
get_vector
hash2index
index2entity
index2word
init_sims
load
log_accuracy
log_evaluate_word_pairs
max_n
min_n
most_similar
most_similar_cosmul
most_similar_to_given
n_similarity
num_ngram_vectors
rank
save
save_word2vec_format
similar_by_vector
similar_by_word
similarity
similarity_matrix
syn0
syn0_ngrams
syn0_ngrams_norm
syn0_vocab
syn0_vocab_norm
syn0norm
vector_size
vectors
vectors_ngrams
vectors_ngrams_norm
vectors_norm
vectors_vocab
vectors_vocab_norm
vocab
wmdistance
word_vec
words_closer_than
wv


In [None]:
print(model.wv.evaluate_word_pairs())

TypeError: ignored