In [1]:
from pprint import pprint

import numpy as np
import pandas as pd

from gensim.corpora import Dictionary
from gensim.models import LdaModel

from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the post, comment and patch tables from the local data/ directory.
# NOTE: unpickling can execute arbitrary code -- only read pickle files that
# come from a trusted source.
posts_df = pd.read_pickle('data/posts')
comments_df = pd.read_pickle('data/comments')
patches_df = pd.read_csv('data/patches.csv')

### Latent Dirichlet Allocation

#### Playground 

Off-the-shelf LDA, adapted from the gensim "LDA training tips" notebook: https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html

In [37]:
# NOTE(review): `pubs_all` is only built in the "Making Documents Longer"
# section further down, so that section has to be executed before this cell.
# Build the vocabulary, then drop tokens that occur in fewer than 20 documents
# or in more than 50% of them.
dictionary = Dictionary(documents=pubs_all)
dictionary.filter_extremes(no_above=0.5, no_below=20)

In [38]:
# Bag-of-words representation of every document in `pubs_all`.
corpus = list(map(dictionary.doc2bow, pubs_all))

In [39]:
# Report the vocabulary size and the corpus size.
n_unique_tokens, n_documents = len(dictionary), len(corpus)
print('Number of unique tokens: %d' % n_unique_tokens)
print('Number of documents: %d' % n_documents)

Number of unique tokens: 6024
Number of documents: 62964


In [40]:
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = 1  

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

CPU times: user 17min 4s, sys: 11.1 s, total: 17min 15s
Wall time: 18min 17s


In [42]:
top_topics = model.top_topics(corpus)

# Average topic coherence: the mean of the per-topic coherence scores. Divide by
# the number of topics actually returned rather than by the global `num_topics`,
# which may be stale if the model was trained with a different setting.
avg_topic_coherence = sum(coherence for _terms, coherence in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -2.8009.
[([(0.06556945, 'game'),
   (0.026569752, 'like'),
   (0.02018021, 'people'),
   (0.019785108, 'epic'),
   (0.016988035, 'would'),
   (0.01672669, 'fortnite'),
   (0.014101612, 'know'),
   (0.0132098235, 'make'),
   (0.012737307, 'think'),
   (0.012082423, 'really'),
   (0.011860207, 'play'),
   (0.011352798, 'player'),
   (0.011336969, 'want'),
   (0.011265897, 'time'),
   (0.010313704, 'feel'),
   (0.009345666, 'even'),
   (0.009324148, 'good'),
   (0.009105417, 'much'),
   (0.009022037, 'playing'),
   (0.008926455, 'please')],
  -1.4832803749188408),
 ([(0.021356048, 'building'),
   (0.016989546, 'time'),
   (0.016485233, 'build'),
   (0.015139148, 'someone'),
   (0.011494506, 'around'),
   (0.011236549, 'fight'),
   (0.009733567, 'kill'),
   (0.00914682, 'first'),
   (0.008902653, 'place'),
   (0.008782564, 'second'),
   (0.00855592, 'wall'),
   (0.008311509, 'take'),
   (0.0073244763, 'circle'),
   (0.0067527173, 'shot'),
   (0.006670309, 'right')

#### Making Documents Longer

In [3]:
def add_lists(row, ngrams=False):
    """Concatenate a post's title tokens and body tokens into one column.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must contain 'title_clean' and 'selftext_clean', or 'title_ngrams'
        and 'selftext_ngrams' when `ngrams` is truthy.
    ngrams : bool, default False
        Selects the n-gram columns instead of the cleaned-token columns.

    Returns
    -------
    The same `row` object, with 'combo_post' (or 'combo_post_ngrams') set to
    the title value followed by the selftext value.
    """
    source_suffix = '_ngrams' if ngrams else '_clean'
    target = 'combo_post_ngrams' if ngrams else 'combo_post'

    row[target] = row['title' + source_suffix] + row['selftext' + source_suffix]
    return row

In [4]:
# Add the 'combo_post' column (title tokens followed by body tokens) to every post.
posts_df = posts_df.apply(add_lists, axis=1)

In [5]:
# Same as above for the n-gram columns: adds 'combo_post_ngrams' to every post.
posts_df = posts_df.apply(add_lists, axis=1, ngrams=True)

We will now append all the comments that belong to a post to that post's body. One problem with this approach is that the last word of one comment and the first word of the next comment will now be treated as part of the same context. Is this fair?

In [6]:
# Strip the type prefix (everything up to and including the first underscore)
# from each parent id so it can be compared with the bare post ids.
# NOTE: this discards the parent type that the prefix encoded.
# Splitting at most once and keeping the last piece makes the cell idempotent:
# an id that was already stripped passes through unchanged instead of raising
# IndexError when the cell is run a second time.
comments_df['parent_id'] = comments_df['parent_id'].str.split('_', n=1).str[-1]

In [7]:
def comments2discussion(posts, comments, ngrams=False):
    """Build one token list per post: the post's tokens followed by its comments' tokens.

    Parameters
    ----------
    posts : DataFrame
        Needs an 'id' column and a 'combo_post' column ('combo_post_ngrams'
        when `ngrams` is truthy) holding a list of tokens per post.
    comments : DataFrame
        Needs a 'parent_id' column and a 'body_clean' column ('body_ngrams'
        when `ngrams` is truthy) holding a list of tokens per comment.
    ngrams : bool, default False
        Selects the n-gram columns instead of the cleaned-token columns.

    Returns
    -------
    list of list
        One entry per row of `posts`, in row order. Each entry is a new list,
        so neither `posts` nor `comments` is modified and calling the function
        repeatedly always yields the same result. A post whose token cell is
        not a list contributes no tokens of its own.
    """
    post_body = 'combo_post_ngrams' if ngrams else 'combo_post'
    com_body = 'body_ngrams' if ngrams else 'body_clean'

    # Index the comments by parent id once, instead of scanning the whole
    # comments frame for every post.
    children_by_parent = {}
    for parent_id, tokens in zip(comments['parent_id'], comments[com_body]):
        children_by_parent.setdefault(parent_id, []).append(tokens)

    # If an id occurs more than once, the first matching row supplies the text.
    tokens_by_post = {}
    for post_id, tokens in zip(posts['id'], posts[post_body]):
        tokens_by_post.setdefault(post_id, tokens)

    discussion = []
    for post_id in posts['id']:
        own_tokens = tokens_by_post.get(post_id)
        # Copy the post's tokens: extending the list stored in the DataFrame
        # would silently grow the source data on every call.
        body = list(own_tokens) if isinstance(own_tokens, list) else []

        for child in children_by_parent.get(post_id, ()):
            body.extend(child)

        discussion.append(body)

    return discussion

In [8]:
# Discussions (post plus its comments) for the FortniteCompetitive subreddit,
# once with n-gram tokens and once with the plain cleaned tokens.
comp_df = posts_df.loc[posts_df['subreddit'] == 'FortniteCompetitive']
comp_all_ngrams = comments2discussion(comp_df, comments_df, ngrams=True)
comp_all = comments2discussion(comp_df, comments_df, ngrams=False)

In [9]:
# Discussions (post plus its comments) for the FortNiteBR subreddit,
# once with n-gram tokens and once with the plain cleaned tokens.
pubs_df = posts_df.loc[posts_df['subreddit'] == 'FortNiteBR']
pubs_all_ngrams = comments2discussion(pubs_df, comments_df, ngrams=True)
pubs_all = comments2discussion(pubs_df, comments_df, ngrams=False)