In [None]:
import re
import os
import nltk
import string
import multiprocessing
from random import randrange
from collections import defaultdict
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases

In [None]:
# Fetch the 'punkt' resource for NLTK (tokenizer data); quiet=True suppresses
# the download log.
nltk.download('punkt', quiet=True)

True

In [None]:
# Number of CPUs reported by multiprocessing on this machine; shown as the cell output.
cores = multiprocessing.cpu_count()
cores

2

In [None]:
# Each character of string.punctuation as its own list element; used to reject
# punctuation-only tokens during sentence cleaning.
PUNCT = list(string.punctuation)
# Hand-picked words that are dropped during sentence cleaning, skipped when
# ranking frequent phrases, and passed to Phrases as `common_terms`.
STOP_WORDS = set(["also", "not", "all", "am", "an", "and", "another", "any", "are", "as", "at", "be", "been", "being", "but", "by", "came", "can", "come", "did", "do", "for", "get", "got", "has", "had", "he", "have", "her", "here", "him", "himself", "his", "how", "if", "in", "into", "is", "it", "like", "me", "my", "of", "on", "or", "other", "our", "out", "over", "see", "still", "such", "take", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "those", "through", "to", "too", "up", "was", "way", "we", "well", "while", "with", "would", "you", "your", "a", "i", "will", "com", "may", "every", "using", "just", "need", "want", "years", "great", "good", "privacy", "next", "know", "found", "add", "even", "use", "one", "something", "choice", "some", "more", "away", "really", "put", "instead", "start"])
# Passed as `min_count` to both Phrases models (and also reused as the bigram
# `threshold`).
MIN_WORD_COUNT = 5
# Number of epochs passed to Word2Vec.train.
W2V_EPOCHS = 50

In [None]:
def remove_not_valid_chars(word, include_space=True):
  """Lowercase ``word`` and delete every character that is not allowed.

  Allowed characters are ASCII letters, digits and the hyphen; the space
  character is additionally allowed when ``include_space`` is true.

  Args:
    word: Text to clean (a single token or a whole sentence).
    include_space: Keep spaces when True (default), strip them when False.

  Returns:
    The lowercased text with all disallowed characters removed.
  """
  # Pick the character class once, then do a single substitution.
  pattern = r'[^a-zA-Z0-9- ]+' if include_space else r'[^a-zA-Z0-9-]+'
  return re.sub(pattern, '', word.lower())

In [None]:
def most_frequent_words(phraser, sents, num, min_word_len=0):
    """Return the ``num`` most frequent tokens produced by ``phraser[sents]``.

    Args:
        phraser: Object that yields token lists when indexed with ``sents``
            (here a trained phrase model).
        sents: Sentences to run through ``phraser``.
        num: Maximum number of tokens to return.
        min_word_len: Minimum number of "_"-separated parts a token must have
            to be counted (e.g. 2 keeps only joined phrases).

    Returns:
        A list of tokens ordered from most to least frequent; tokens found in
        STOP_WORDS are never counted.
    """
    counts = defaultdict(int)

    for phrased_sent in phraser[sents]:
        for token in phrased_sent:
            if token in STOP_WORDS:
                continue
            # A joined phrase looks like "a_b", so the part count is its length in words.
            if len(token.split("_")) < min_word_len:
                continue
            counts[token] += 1

    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:num]

In [None]:
# Collect every *.txt file under the "corpus" directory tree. Testing the
# extension (not a substring) keeps names such as "notes.txt.bak" out, and
# sorting makes the concatenation order independent of filesystem order.
files = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk("corpus")
    for name in names
    if name.endswith('.txt')
)

# Read each file once and join at the end instead of growing a string in a
# loop. The encoding is explicit so the result does not depend on the locale.
texts = []
for path in files:
    with open(path, "r", encoding="utf-8") as handle:
        texts.append(handle.read())

# Every document is preceded by a newline, i.e. "\n<file1>\n<file2>...".
corpus = "".join("\n" + text for text in texts)

In [None]:
# Take a random fixed-size window of the raw corpus and show it next to its
# cleaned form, as a quick visual check of remove_not_valid_chars.
random_sample_size = 100
# The last valid window start is len(corpus) - size. Clamping at 0 lets a
# corpus shorter than the window yield the whole text instead of raising
# ValueError from randrange on an empty range.
random_sample_max_start = max(0, len(corpus) - random_sample_size)
random_sample_start = randrange(0, random_sample_max_start + 1)
random_sample = corpus[random_sample_start:random_sample_start + random_sample_size]
sentence_test = remove_not_valid_chars(random_sample.replace("\n", " "))
random_sample, sentence_test

('pping Business with AliExpress + BONUS: 25 niche ideas for 2021 Get Your Guide Make your start\n100% ',
 'pping business with aliexpress  bonus 25 niche ideas for 2021 get your guide make your start 100 ')

In [None]:
# Split the concatenated corpus text into sentence strings with NLTK.
sentences_list = nltk.sent_tokenize(corpus)

In [None]:
# Turn each raw sentence into a list of cleaned tokens, keeping only
# sentences that still have more than two tokens after filtering.
sentences = []

for raw_sentence in sentences_list:
    kept_tokens = []

    for token in nltk.word_tokenize(raw_sentence):
        cleaned = remove_not_valid_chars(token, False)
        # Drop empty/one-character tokens and pure numbers.
        if len(cleaned) <= 1 or cleaned.isdigit():
            continue
        # Drop punctuation and stop words.
        if cleaned in PUNCT or cleaned in STOP_WORDS:
            continue
        kept_tokens.append(cleaned)

    if len(kept_tokens) > 2:
        sentences.append(kept_tokens)

In [None]:
# Number of cleaned sentences that survived filtering.
len(sentences)

73385

In [None]:
# First pass: learn two-word phrases from the cleaned sentences.
# NOTE(review): MIN_WORD_COUNT is reused as the score `threshold` here, while
# the second pass uses a literal 10 — confirm both values are intended.
bigram = Phrases(sentences, min_count=MIN_WORD_COUNT, threshold=MIN_WORD_COUNT, common_terms=STOP_WORDS)
bigram_model = Phraser(bigram)
# Second pass: learn phrases over the bigram-transformed sentences, so an
# existing "a_b" token can be joined with a neighbour into a longer phrase.
trigram = Phrases(bigram[sentences], min_count=MIN_WORD_COUNT, threshold=10, common_terms=STOP_WORDS)
trigram_model = Phraser(trigram)
# Despite its name, `phraser` is the phrased corpus (sentences passed through
# both models), not a model; it is what build_vocab consumes later.
phraser = trigram_model[bigram_model[sentences]]



In [None]:
# Apply the trigram model to the whitespace-split random sample as a spot check.
# NOTE(review): the sample is not passed through bigram_model first and still
# contains stop words/digits, unlike the sentences the models were trained on.
trigram_model[sentence_test.split()]

['pping',
 'business',
 'with',
 'aliexpress',
 'bonus',
 '25',
 'niche',
 'ideas',
 'for',
 '2021',
 'get',
 'your',
 'guide',
 'make',
 'your',
 'start',
 '100']

In [None]:
# Top 50 tokens made of at least two "_"-joined parts, i.e. detected phrases.
# NOTE(review): trigram_model is applied to the raw `sentences` here, whereas
# `phraser` applies bigram_model first — confirm which is intended.
most_frequent_words(trigram_model, sentences, 50, 2)

['social_media',
 'blog_post',
 'blog_posts',
 'make_sure',
 'content_marketing',
 'does_nt',
 'search_engines',
 'search_engine',
 'google_analytics',
 'website_traffic',
 'drive_traffic',
 'ca_nt',
 'people_who',
 'make_money',
 'promote_blog',
 'email_marketing',
 'target_audience',
 'most_popular',
 'page_views',
 'google_search',
 'email_list',
 'guest_post',
 'guest_blogging',
 'when_comes',
 'learn_about',
 'traffic_sources',
 'keyword_research',
 'web_traffic',
 'so_much',
 'search_results',
 'digital_marketing',
 'guest_posting',
 'organic_traffic',
 'thanks_sharing',
 'write_about',
 'grow_blog',
 'landing_page',
 'facebook_twitter',
 'think_about',
 'email_address',
 'many_people',
 'blogging_platform',
 'increase_traffic',
 'per_month',
 'right_now',
 're_going',
 'search_traffic',
 'most_important',
 'says_january',
 'content_strategy']

In [None]:
# Untrained Word2Vec model: 300-dimensional vectors, tokens seen fewer than
# 25 times are ignored when the vocabulary is built. The worker count uses
# the CPU count detected earlier (`cores`) instead of a hard-coded 2, so the
# notebook uses whatever the host provides.
w2v_model = Word2Vec(
    size=300,
    min_count=25,
    workers=cores,
)

In [None]:
# Build the vocabulary from the phrased corpus so joined phrases such as
# "search_engines" become vocabulary entries.
w2v_model.build_vocab(phraser)

In [None]:
# Vocabulary size after the min_count cut-off.
len(w2v_model.wv.vocab)

3613

In [None]:
# Train on the same phrased corpus the vocabulary was built from. Training on
# the raw `sentences` would never present joined tokens such as
# "search_engines" to the model, leaving their vectors at their random
# initial values; `corpus_count` was also computed from `phraser`.
w2v_model.train(phraser, total_examples=w2v_model.corpus_count, epochs=W2V_EPOCHS)

(30450055, 38997400)

In [None]:
# Inspect the 25 tokens the model ranks as most similar to the phrase token
# 'search_engines'.
w2v_model.wv.most_similar('search_engines', topn=25)

[('guest', 0.21181611716747284),
 ('companies', 0.19697968661785126),
 ('after_reading', 0.18953336775302887),
 ('email_subscribers', 0.18488603830337524),
 ('organic_search', 0.1837301254272461),
 ('website_browser_time_comment', 0.175649493932724),
 ('box', 0.1734764128923416),
 ('random', 0.16592027246952057),
 ('self', 0.16452457010746002),
 ('alongside', 0.16433827579021454),
 ('normal', 0.16246196627616882),
 ('day', 0.16152776777744293),
 ('single_day', 0.15829706192016602),
 ('magic', 0.15793749690055847),
 ('recently', 0.15665170550346375),
 ('reply_ana_hoffman_says', 0.15605425834655762),
 ('talked_about', 0.1552238017320633),
 ('six_months', 0.15298306941986084),
 ('allows', 0.15270386636257172),
 ('provide_value', 0.15167124569416046),
 ('best_motorcycle', 0.15102478861808777),
 ('everything', 0.1507202833890915),
 ('craft', 0.1466902792453766),
 ('comments', 0.14348751306533813),
 ('evernote', 0.1425704061985016)]

In [None]:
# Inspect the 25 tokens the model ranks as most similar to 'content'.
w2v_model.wv.most_similar('content', topn=25)

[('posts', 0.47531020641326904),
 ('post', 0.4056031107902527),
 ('article', 0.36833250522613525),
 ('articles', 0.3570299744606018),
 ('contents', 0.30925291776657104),
 ('infographics', 0.3048214018344879),
 ('piece', 0.30464625358581543),
 ('audience', 0.3031178414821625),
 ('blog', 0.2964682877063751),
 ('high-quality', 0.2806640863418579),
 ('headlines', 0.27288877964019775),
 ('evergreen', 0.26462656259536743),
 ('calendar', 0.2541588246822357),
 ('original', 0.25325798988342285),
 ('compelling', 0.25311416387557983),
 ('strategy', 0.2527120113372803),
 ('consistently', 0.2518334984779358),
 ('social', 0.2482859343290329),
 ('headline', 0.24391379952430725),
 ('readers', 0.24307097494602203),
 ('e-book', 0.24167749285697937),
 ('material', 0.24143272638320923),
 ('what', 0.23813742399215698),
 ('attracts', 0.2373666763305664),
 ('constantly', 0.23721443116664886)]

In [None]:
# Inspect the 100 tokens the model ranks as most similar to 'blog'.
w2v_model.wv.most_similar('blog', topn=100)

[('blogs', 0.3651251494884491),
 ('website', 0.3535810112953186),
 ('site', 0.3122475743293762),
 ('blogging', 0.3064546585083008),
 ('readership', 0.3043372929096222),
 ('content', 0.29646825790405273),
 ('blogger', 0.28374987840652466),
 ('post', 0.2831146717071533),
 ('posts', 0.27177125215530396),
 ('article', 0.2566870450973511),
 ('roundup', 0.25163590908050537),
 ('older', 0.2420521229505539),
 ('so', 0.23325422406196594),
 ('guest', 0.233115553855896),
 ('email', 0.22862011194229126),
 ('articles', 0.22830119729042053),
 ('bloggers', 0.22787974774837494),
 ('consistently', 0.22782768309116364),
 ('epic', 0.22678285837173462),
 ('readers', 0.2267823964357376),
 ('new', 0.2255745530128479),
 ('proven', 0.2173464000225067),
 ('guide', 0.21732200682163239),
 ('after', 0.2095438539981842),
 ('worthy', 0.20862987637519836),
 ('tips', 0.20599307119846344),
 ('list', 0.20420223474502563),
 ('killer', 0.20370282232761383),
 ('subscribers', 0.2029207944869995),
 ('viral', 0.2012309432029