In [None]:
# Words that are filtered out while cleaning tokens and while ranking the
# most frequent words.
# REPLACE WITH YOUR OWN STOP WORDS
STOP_WORDS = [
    "also", "not", "all", "am", "an", "and", "another", "any", "are", "as",
    "at", "be", "been", "being", "but", "by", "came", "can", "come", "did",
    "do", "for", "get", "got", "has", "had", "he", "have", "her", "here",
    "him", "himself", "his", "how", "if", "in", "into", "is", "it", "like",
    "me", "my", "of", "on", "or", "other", "our", "out", "over", "see",
    "still", "such", "take", "than", "that", "the", "their", "them", "then", "there",
    "these", "they", "this", "those", "through", "to", "too", "up", "was", "way",
    "we", "well", "while", "with", "would", "you", "your", "a", "i", "will",
    "com", "may", "every", "using", "just", "need", "want", "years", "great", "good",
    "next", "know", "found", "add", "even", "use", "one", "something", "choice", "some",
    "more", "away", "really", "put", "instead", "start",
]

# Passed to the Phrases models as both min_count and (for bigrams) threshold.
MIN_WORD_COUNT = 5

# Number of training epochs for the Word2Vec model.
W2V_EPOCHS = 50

In [None]:
import re # REGEX
import os
import nltk
import string
import multiprocessing
from random import randrange
from collections import defaultdict
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases

nltk.download('punkt', quiet=True)

True

In [None]:
# CPU count reported by the OS; passed to Word2Vec as `workers` further down.
cores = multiprocessing.cpu_count()
cores  # bare expression so the notebook displays the value

2

In [None]:
# Collect every .txt file under ./data (recursively) and concatenate their
# contents into a single string, each file preceded by a newline.
files = []

for root, _dirs, filenames in os.walk("data"):
  for filename in filenames:
    # Test the real extension: a substring check would also accept names
    # such as "notes.txt.bak".
    if filename.endswith('.txt'):
      files.append(os.path.join(root, filename))

# os.walk yields entries in file-system order; sort so the corpus is built
# in the same order on every run and every machine.
files.sort()

documents = []
for path in files:
  # Explicit encoding so decoding does not depend on the platform locale.
  with open(path, "r", encoding="utf-8") as handle:
    documents.append("\n" + handle.read())

# One join instead of repeated string concatenation inside the loop.
corpus = "".join(documents)

In [None]:
def remove_not_valid_chars(sent, include_spaces=True):
  """Lowercase `sent` and strip every character outside a-z, 0-9 and '-'.

  Args:
    sent: Text to clean.
    include_spaces: When True, spaces are treated as valid and each run of
      invalid characters is replaced by a single space, so word boundaries
      are preserved. When False, invalid characters (spaces included) are
      removed outright, so the result never contains a space.

  Returns:
    The cleaned, lowercased string.
  """
  sent_lowercase = sent.lower()
  if include_spaces:
    return re.sub(r'[^a-z0-9\- ]+', ' ', sent_lowercase)
  # Replace with the empty string: substituting a space here would put spaces
  # inside single tokens (e.g. "n't" -> "n t", "'s" -> " s").
  return re.sub(r'[^a-z0-9\-]+', '', sent_lowercase)

def most_frequent_words(phraser, sents, num, min_word_len=0):
    """Return the `num` most frequent tokens produced by `phraser[sents]`.

    Args:
        phraser: Object indexed with `sents` to obtain the tokenised sentences.
        sents: Sentences handed to `phraser`.
        num: Maximum number of tokens to return.
        min_word_len: Minimum number of underscore-separated parts a token
            must have to be counted.

    Returns:
        A list of tokens ordered from most to least frequent. Tokens found
        in STOP_WORDS are never counted.
    """
    counts = defaultdict(int)

    all_tokens = (token for phrased_sent in phraser[sents] for token in phrased_sent)
    for token in all_tokens:
        if token in STOP_WORDS:
            continue
        if len(token.split("_")) < min_word_len:
            continue
        counts[token] += 1

    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:num]

In [None]:
# Pull a short random window out of the corpus to eyeball the cleaning step.
random_sample_size = 100
# Clamp the upper bound to at least 1 so a corpus shorter than the window
# gives start 0 instead of randrange raising ValueError on an empty range.
# The +1 makes the last valid start offset reachable as well.
random_sample_start = randrange(0, max(1, len(corpus) - random_sample_size + 1))
random_sample = corpus[random_sample_start:random_sample_start + random_sample_size]

# Newlines become spaces before cleaning so the sample stays on one line.
sentence_test = remove_not_valid_chars(random_sample.replace("\n", " "))

random_sample, sentence_test

('in and push it into place. Easy.\n\nStep 5\n\nBefore placing the motherboard into the case, make sure th',
 'in and push it into place  easy   step 5  before placing the motherboard into the case  make sure th')

In [None]:
# Split the corpus into sentences, then each sentence into cleaned tokens.
sentences_list = nltk.sent_tokenize(corpus)

# Build the lookup sets once; the original rebuilt the punctuation list for
# every token and scanned the stop-word list linearly.
punctuation_marks = set(string.punctuation)
stop_words = set(STOP_WORDS)

sentences = []

for sent in sentences_list:
  clean_words = []

  for word in nltk.word_tokenize(sent):
    w = remove_not_valid_chars(word, False)
    # Keep tokens longer than one character that are not purely numeric,
    # not a punctuation mark and not a stop word.
    if len(w) > 1 and not w.isdigit() and w not in punctuation_marks and w not in stop_words:
      clean_words.append(w)

  # Sentences with fewer than three surviving tokens are discarded.
  if len(clean_words) > 2:
    sentences.append(clean_words)

len(sentences)

9089

In [None]:
# Two-stage phrase detection: learn bigrams first, then run a second pass
# over the bigram-merged sentences so three-word phrases can be formed.
bigram = Phrases(sentences, min_count=MIN_WORD_COUNT, threshold=MIN_WORD_COUNT, common_terms=STOP_WORDS)
bigram_model = Phraser(bigram)
# Transform the training input with the exported Phraser rather than the
# Phrases model itself; gensim documents Phraser as the faster way to apply
# an already-trained phrase model.
trigram = Phrases(bigram_model[sentences], min_count=MIN_WORD_COUNT/2, threshold=10, common_terms=STOP_WORDS)
trigram_model = Phraser(trigram)
# Despite its name, `phraser` is the phrase-merged corpus (sentences passed
# through both models), not a model object.
phraser = trigram_model[bigram_model[sentences]]



In [None]:
# Apply both stages in training order: the trigram model was fit on
# bigram-merged sentences, so feeding it raw tokens can never yield a
# three-word phrase. split() without arguments also avoids the empty tokens
# that split(" ") produces for consecutive spaces.
trigram_model[bigram_model[sentence_test.split()]]

['in',
 'and',
 'push',
 'it',
 'into',
 'place',
 'easy',
 'step',
 '5',
 'before',
 'placing',
 'the',
 'motherboard',
 'into',
 'the',
 'case',
 'make_sure',
 'th']

In [None]:
# most_frequent_words indexes its first argument with its second, so pass the
# bigram-merged sentences: the trigram model was trained on that form and
# cannot build three-word phrases from raw tokens.
most_frequent_words(trigram_model, bigram_model[sentences], 25, 1)

[' s',
 'build',
 'pc',
 'motherboard',
 'case',
 'n t',
 'so',
 'computer',
 'from',
 'cpu',
 'components',
 'what',
 'best',
 ' ll',
 'which',
 'gaming_pc',
 'should',
 'gaming',
 'ram',
 'parts',
 'power_supply',
 'when',
 'new',
 'laptop',
 ' re']

In [None]:
# Materialise the phrase-merged corpus once so the phrase models are not
# re-applied on every vocabulary scan and training epoch.
training_sentences = list(phraser)

# NOTE(review): `size` is the gensim 3.x keyword, kept to match the rest of
# this notebook's gensim usage.
w2v_model = Word2Vec(
    size=100,
    min_count=MIN_WORD_COUNT,  # same value (5) as before, now tied to the config constant
    workers=cores
)

w2v_model.build_vocab(training_sentences)

# Train on the same corpus the vocabulary was built from. Training on the raw
# `sentences` would leave every merged phrase token (e.g. "power_supply")
# in the vocabulary without ever being seen during training.
w2v_model.train(training_sentences, total_examples=w2v_model.corpus_count, epochs=W2V_EPOCHS)

(4795197, 6190550)

In [None]:
# NOTE(review): the query word is an empty string. The cleaning step only
# keeps tokens longer than one character, so "" cannot be in the vocabulary;
# this looks like a placeholder — replace it with a real token before running.
w2v_model.wv.most_similar("", topn=50)

[('explained', 0.7875093221664429),
 ('walk', 0.7133488655090332),
 ('appreciated', 0.6909220218658447),
 ('infographic', 0.6788451671600342),
 ('walkthrough', 0.6617275476455688),
 ('detailed', 0.6586351990699768),
 ('beginners', 0.6563247442245483),
 ('assembly', 0.635240912437439),
 ('myself', 0.6286596655845642),
 ('tutorial', 0.6234084367752075),
 ('all-in-one', 0.61463463306427),
 ('comprehensive', 0.6136826872825623),
 ('basics', 0.5867540836334229),
 ('in-depth', 0.5814931392669678),
 ('submit', 0.5760713815689087),
 ('advertisement', 0.572301983833313),
 ('helping', 0.5702395439147949),
 ('team', 0.5690051317214966),
 ('followed', 0.567746639251709),
 ('topics', 0.5654361248016357),
 ('congratulations', 0.5641900300979614),
 ('photos', 0.5575026273727417),
 ('detail', 0.55533367395401),
 ('pictures', 0.5552259683609009),
 ('views', 0.5538419485092163),
 ('mat', 0.5518805384635925),
 ('ideas', 0.548680305480957),
 ('finish', 0.5484687089920044),
 ('confidence', 0.54679131507873