In [1]:
#import necessary modules, libraries
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import Word2Vec, LdaMulticore

from gensim.corpora import Dictionary

import numpy as np 
import pandas as pd
import glob
import pyLDAvis
from pyLDAvis import gensim 

from textblob import TextBlob

In [2]:
#Read data into DataFrame: concatenate scraped data from all years for jackbones.com
# Sort the matches: glob.glob returns paths in arbitrary (OS-dependent) order,
# which would otherwise make the row order differ between runs or machines.
csv_paths = sorted(glob.glob('jake scrape/*.csv'))
if not csv_paths:
    # pd.concat([]) would only report "No objects to concatenate"; name the real cause.
    raise FileNotFoundError("No CSV files matched 'jake scrape/*.csv'")
df = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths])

In [3]:
#Get list of all sentences in summary content
# Drop missing summaries first: NaN cells are floats and would make str.join raise TypeError.
content = ' '.join(df['summary'].dropna().astype(str))
sentences = TextBlob(content).sentences
# Collapse internal whitespace so a sentence that contains a line break still
# occupies exactly one line in the output file (the file is read back line by line).
sent_list = [' '.join(str(s).split()) for s in sentences]

#write all sentences to .txt, one sentence per line
# The with-block closes the file on exit, so no explicit close() is needed.
with open('jakebones.txt', 'w', encoding='utf-8') as f:
    for line in sent_list:
        f.write(line + '\n')


In [4]:
def preprocessing(sentence):
    """Tokenize one sentence and drop stopwords.

    ``simple_preprocess`` produces the candidate tokens (per the original
    docstring: lowercase, skipping tokens that are too short or too long);
    every token found in ``STOPWORDS`` is then discarded.

    Returns the remaining tokens as a list, in their original order.
    """
    kept_tokens = []
    for token in simple_preprocess(sentence):
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return kept_tokens

def read_sentences(filename):
    """Lazily yield the preprocessed tokens of each line of ``filename``.

    The file is opened in binary mode and each raw line (one sentence per
    line) is passed through ``preprocessing`` one at a time.
    """
    with open(filename, 'rb') as handle:
        yield from map(preprocessing, handle)
        

In [None]:
#Get list of preprocessed sentences for jackbones.com (all years)
%time sentences = list(read_sentences('jakebones.txt'))

Model

In [None]:
#Train word2vec model (window, min_count: adjust if2needed)
%time model = Word2Vec(sentences, window=1, min_count=3)

In [None]:
# Words the trained model ranks as most similar to "good"
model.wv.most_similar(positive=['good'])

In [None]:
# Word-vector arithmetic: amazing + good - bad
model.wv.most_similar(negative=['bad'], positive=['amazing', 'good'])

In [None]:
# Words the trained model ranks as most similar to "baby"
model.wv.most_similar(positive=['baby'])

Topic modeling - LDA


In [None]:
#Get randomly permuted sentences
# Shuffle indices instead of the token lists themselves: `sentences` is a list of
# token lists whose lengths generally differ, and np.random.permutation would first
# try to convert it to an array (ValueError for ragged input on NumPy >= 1.24,
# a deprecation warning on older releases).
# Call np.random.seed(...) beforehand if a reproducible sample is needed.
shuffled_idx = np.random.permutation(len(sentences))
# The result is a plain list of token lists, in shuffled order.
sentences_light = [sentences[i] for i in shuffled_idx]

In [13]:
# Keep only the first 2000 of the randomly permuted sentences
sentences_light = sentences_light[0:2000]

In [None]:
#Get dictionary of normalized words and their ids 
%time dictionary = Dictionary(sentences_light)

In [None]:
#Convert doc into bag-of-words (BoW) for each sentence in sentences_light
%time bow_corpus = [dictionary.doc2bow(sent) for sent in sentences_light]

In [None]:
#train lda_model
%time lda_model = LdaMulticore(bow_corpus, id2word=dictionary, num_topics=100, passes=20, workers=8)

In [None]:
# Print every topic of the trained lda_model together with its index
topic_template = 'Topic: {}\nWords: {}'
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print(topic_template.format(topic_id, topic_terms))

In [None]:
# Build the pyLDAvis data for the trained topics and render it inline
# NOTE(review): newer pyLDAvis releases may expose this module as
# pyLDAvis.gensim_models - confirm against the installed version
lda_vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dictionary,
)
pyLDAvis.display(lda_vis)