In [1]:
from pprint import pprint
import praw
from praw.models import MoreComments
import requests
import json
import numpy as np
import pandas as pd
from collections import Counter
from math import ceil
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams

import nltk.data
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.tokenize import word_tokenize

from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

from math import floor

In [2]:
client_id = '71ZX5Cupn2Ohpg'
client_secret = 'nzCz5_WlQM4LbJxX-t_3m-tPgZw'

reddit = praw.Reddit(user_agent='Comment Extraction',client_id=client_id, client_secret=client_secret)
subreddit = reddit.subreddit('AmITheAsshole')

judgement_categories = ['YTA', 'NTA', 'ESH', 'NAH', 'INFO']

In [3]:
class Submission_Corpus:
    def __init__(self, submission, bfs_depth=2, judgement_categories=None, judgement_weight='upvotes'):
        self.comment_meta = []
        self.judgement_categories = judgement_categories
        self.judgement_weight = judgement_weight
        self.judgements = {}
        for category in self.judgement_categories:
            self.judgements[category] = []

        self.submission = submission
        self.original_post = submission.selftext
        self.comment_forrest = self.submission.comments
        self.comment_bfs(bfs_depth=bfs_depth)       
        
    def comment_bfs(self, bfs_depth=0):
        # Initialize queue to hold comment sub trees during BFS.
        bfs_queue = []
        
        # Populate queue with first level of comments.
        for comment in self.comment_forrest:
            bfs_queue.append(comment)
            
        current_level_size = len(bfs_queue)
        next_level_size = 0
        level_count = 0
        
        while (len(bfs_queue) > 0) and (level_count < bfs_depth):
            comment = bfs_queue.pop(0)
            current_level_size -= 1
            next_level_size += 1
            
            comment_features = None
            try:
                comment_features = self.extract_comment(comment)
            except:
                pass

            if comment_features is not None:
                self.comment_meta.append(comment_features)
                for reply in comment.replies:
                    bfs_queue.append(reply)
            
            if current_level_size == 0:
                current_level_size = next_level_size
                level_count += 1
            
    def extract_comment(self, comment, judgement_extraction_method='prefix', judgement_weighting='upvotes'):
        judgement = self.extract_judgement(comment.body, extraction_method=judgement_extraction_method)
        score = comment.score if judgement_weighting=='upvotes' else 1
        self.judgements[judgement].append(score)
        body = self.tokenize_comment(comment.body)
        comment_features = {
            'id' : comment.id,
            'author': comment.author,
            'body': body,
            'score' : score,
            'judgement': judgement
        }
        return(comment_features)
    
    def extract_judgement(self, txt, extraction_method='prefix'):
        if extraction_method == 'prefix':
            for category in self.judgement_categories:
                if txt[:len(category)] == category:
                    return(category)
    
    def summarize_judgement(self):
        total_judgements = sum([sum(count) for count in self.judgements.values()])
        judgement_summary = [(category, sum(count)/total_judgements) for category, count in self.judgements.items()]
        return(judgement_summary)
    
    def get_judgement_summary(self):
        return(self.judgement_summary)
    
    def tokenize_sentence(self, sent):
        try:
            tokenized_sent = word_tokenize(sent)
        except:
            tokenized_sent = []
        tokenized_sent.insert(0, '<s>')
        tokenized_sent.append('</s>')
        return(tokenized_sent)
    
    def tokenize_comment(self, comment_txt):
        sentences = sent_detector.tokenize(comment_txt.strip())
        tokenized_sentences = [self.tokenize_sentence(sent) for sent in sentences]
        tokenized_comment = ['<c>']
        for sent in tokenized_sentences:
            tokenized_comment += sent
        tokenized_comment += ['</c>']
        return(tokenized_comment)
        
    def get_commentCorpus(self):
        comments_by_category = {}
        for category in self.judgement_categories:
            comments_by_category[category] = []
        
        
        for comment in self.comment_meta:
            if self.judgement_weight == 'upvotes':
                for i in range(max(5,floor(comment['score']/50))):
                    comments_by_category[comment['judgement']].append(comment['body'])
                    
            else:
                comments_by_category[comment['judgement']].append(comment['body'])            
                
        return(comments_by_category)

In [41]:
SC = Submission_Corpus(reddit.submission(id='dv9ogm'), bfs_depth=1, judgement_categories=judgement_categories, judgement_weight='none')

In [42]:
SC.summarize_judgement()

[('YTA', 0.9961745892386805),
 ('NTA', 0.0),
 ('ESH', 0.0),
 ('NAH', 0.0),
 ('INFO', 0.0038254107613194532)]

In [43]:
cc = SC.get_commentCorpus()

In [7]:
print(cc['YTA'][0])

['<c>', '<s>', 'YTA', 'For', 'making', 'a', 'validation', 'post', '</s>', '</c>']


In [8]:
for i in ngrams(cc['NTA'][0],3):
    print(i)

('<c>', '<s>', 'NTA')
('<s>', 'NTA', '.')
('NTA', '.', '</s>')
('.', '</s>', '<s>')
('</s>', '<s>', 'You')
('<s>', 'You', 'were')
('You', 'were', 'uncomfortable')
('were', 'uncomfortable', 'with')
('uncomfortable', 'with', 'some')
('with', 'some', 'behaviors')
('some', 'behaviors', ',')
('behaviors', ',', 'you')
(',', 'you', 'addressed')
('you', 'addressed', 'it')
('addressed', 'it', 'directly')
('it', 'directly', '.')
('directly', '.', '</s>')
('.', '</s>', '<s>')
('</s>', '<s>', 'If')
('<s>', 'If', 'he')
('If', 'he', 'wants')
('he', 'wants', 'to')
('wants', 'to', 'respect')
('to', 'respect', 'your')
('respect', 'your', 'boundaries')
('your', 'boundaries', 'your')
('boundaries', 'your', 'friendship')
('your', 'friendship', 'can')
('friendship', 'can', 'move')
('can', 'move', 'on')
('move', 'on', ',')
('on', ',', 'if')
(',', 'if', 'not')
('if', 'not', 'the')
('not', 'the', 'responsibility')
('the', 'responsibility', 'is')
('responsibility', 'is', 'on')
('is', 'on', 'him')
('on', 'him',

In [9]:
def make_ngram_corpus(corpus, N=3):
    ngram_tups_list = list(ngrams(corpus, N))
    ngram_corpus = []
    for gi in ngram_tups_list:
        if not gi[0:N-1] in [x[0] for x in ngram_corpus]:
            next_word_dict = {}
            
            for gj in ngram_tups_list:
                if gj[0:N-1] == gi[0:N-1]:
                    if gj[N-1] in next_word_dict.keys():
                        next_word_dict[gj[N-1]] += 1
                    else:
                        next_word_dict[gj[N-1]] = 1
            gi_count = sum(next_word_dict.values())
            next_word_prob_tups = tuple([(key, value/gi_count) for key, value in next_word_dict.items()])
            ngram_corpus.append((gi[0:N-1], gi_count, next_word_prob_tups))
    return(ngram_corpus)
    

In [10]:
flat_comment_corpus = []
for comment in cc['NTA']:
    for token in comment:
        flat_comment_corpus.append(token)
        
ngram_corpus = make_ngram_corpus(flat_comment_corpus,3)

In [11]:
def ngram_generate_comment(corpus, N):
    comment_starts = []
    for gi  in corpus:
        if gi[0][0] == '<c>':
            comment_starts.append(gi)
    num_starts = sum([gi[1] for gi in comment_starts])
    probs=[gi[1]/num_starts for gi in comment_starts]
    comment_start = comment_starts[np.random.choice(list(range(len(comment_starts))), p=probs)]
    comment = [w for w in comment_start[0]]
    while comment[-1] != '</c>':
        prev_gram = tuple(comment[-(N-1):])
        nnext_choices = None
        for gi in corpus:
            if gi[0] == prev_gram:
                next_choices = gi[2]
        if next_choices is not None:
            next_probs = [x[1] for x in next_choices]
            next_word = next_choices[np.random.choice(list(range(len(next_choices))),p=next_probs)][0]
            comment.append(next_word)
        else:
            print('N-Gram not found in corpus')
    return(comment)

In [12]:
for i in range(10):
    comment = ' '.join(ngram_generate_comment(ngram_corpus, 3))
    print(comment)

<c> <s> NTA . </s> </c>
<c> <s> NTA . </s> <s> If you were a girl ? '' </s> <s> Of course not . </s> <s> This is very overbearing and controlling behaviour but this doesn ’ t necessarily mean it ’ s gay . </s> <s> Straight men , straight women and lesbian women could do this . </s> <s> If he wants to respect how you feel . </s> </c>
<c> <s> NTA- Just came out . </s> <s> Some people are just very affectionate or jealous . </s> <s> Nevertheless , if any of my straight friends of being pussy-whipped or other offensive phrases like that to express their disappointment at the fact that they are touching you and is ignoring your requests to reduce behaviour that ’ s because he ’ s because he 's gay . </s> <s> Edit : spelling </s> </c>
<c> <s> NTA . </s> </c>
<c> <s> NTA . </s> <s> Edit : spelling </s> </c>
<c> <s> NTA . </s> <s> Edit : spelling </s> </c>
<c> <s> NTA . </s> <s> This is very overbearing and controlling behaviour but this doesn ’ t necessarily mean it ’ s gay . </s> </c>
<c> <s

In [44]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, GRU, Dense, Activation
from gensim.models import Word2Vec

In [45]:
def build_rnn_corpus(cc, wv, N=20):
    all_comments = []
    all_words = []
    category_inds = np.arange(len(judgement_categories))
    np.random.shuffle(category_inds)

    for category_ind in category_inds:
        comments = cc[judgement_categories[category_ind]]
        comment_inds = np.arange(len(comments))
        np.random.shuffle(comment_inds)
        for comment_ind in comment_inds:
            all_comments.append(comments[comment_ind])
            for token in comments[comment_ind]:
                all_words.append(token)
                
    all_ngrams = ngrams(all_words,N)
    X_train = []
    y_train = []
    for ngram in all_ngrams:
        X_train.append([wv[token] for token in ngram[:-1]])
        y_train.append(wv[ngram[-1]])

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    
    return(X_train, y_train)

In [46]:
all_comments = []
all_words = []
category_inds = np.arange(len(judgement_categories))
np.random.shuffle(category_inds)

for category_ind in category_inds:
    comments = cc[judgement_categories[category_ind]]
    comment_inds = np.arange(len(comments))
    np.random.shuffle(comment_inds)
    for comment_ind in comment_inds:
        all_comments.append(comments[comment_ind])
        for token in comments[comment_ind]:
            all_words.append(token)

word2vec = Word2Vec(all_comments, min_count=1)            

In [47]:
#all_ngrams = ngrams(all_words,5)
#X_train = []
#y_train = []
#for ngram in all_ngrams:
#    X_train.append([word2vec.wv[token] for token in ngram[:-1]])
#    y_train.append(word2vec.wv[ngram[-1]])
#    
#X_train = np.array(X_train)
#y_train = np.array(y_train)
#X_train = np.array([word2vec.wv[token] for token in all_words])
#y_train

In [48]:
#X_train, y_train = build_rnn_corpus(cc, word2vec.wv)

In [49]:
model = Sequential()
model.add(GRU(units=32))
model.add(Dense(100))
model.compile(optimizer='adam', loss='mean_squared_error')

In [50]:
X_train, y_train = build_rnn_corpus(cc, word2vec.wv)
model.fit(X_train[:,:,:], y_train[:], shuffle=False, batch_size=16, epochs=1)
    

Epoch 1/1


<keras.callbacks.History at 0x7f8e6baaf1d0>

In [51]:
y_pred = model.predict(X_train[:0,:,:])

In [52]:
s = [word2vec.similar_by_vector(X_train[0,i,:], topn=1)[0][0] for i in range(X_train.shape[1])]
for pred in y_pred:
    s.append(word2vec.similar_by_vector(pred, topn=1)[0][0])
print(s)


['<c>', '<s>', 'YTA', '.', '</s>', '<s>', 'You', 'apparently', 'knew', 'that', 'she', 'is', '*really*', 'against', 'having', 'kids', ',', 'and', 'instead', 'to', 'of', '<s>', '<s>', '.', '.', '.', '’', 'you', 'to', 'to', 'to', '<s>', '<s>', '<s>', '<s>', '<s>', '<s>', '<s>', 'I', '<s>', '<s>', '<s>', '<s>', '<s>', '.', '.', '</s>', 'to', 'you', 'to', '<s>', '<s>', '<s>', '<s>', '<s>', '<s>', '.', '<s>', 'you', 'you', 'you', 'for', 'to', '<s>', 'to', '’', 'to', '.', 'to', ',', 'for', ',', 'you', 'you', '<s>', '’', 'to', 'you', '.', '.', ',', 'you', '<s>', '<s>', 'to', 'you', '’', '’', 'you', 'for', '.', '<s>', '.', '<s>', 'to', 'to', '.', '<s>', '<s>', '<s>', '<s>', '<s>', '.', '.', 'to', 'to', '</s>', '</s>', '</s>', 'to', '</s>', 'to', 'to', '<s>', '</s>', '<s>', '<s>', '.', 'to', 'you', ',', '<s>', 'to', '<s>', 'to', 'to', 'to', 'you', 'to', '.', '.', '.', '’', 'to', '<s>', 'you', '<s>', 'to', 'to', '.', '.', '.', '<s>', '<s>', '<s>', '.', 'you']


  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
class Subreddit_Corpus:
    def __init__(self, subreddit=None, retrieval_limit=None, bfs_depth=3, judgement_categories=None):
        self.bfs_depth = bfs_depth
        self.judgement_categories = judgement_categories
        self.subreddit = subreddit
        self.corpus = []
        self.submissions = []
        
        for submission in self.subreddit.new(limit=retrieval_limit):
            self.submissions.append(Submission_Corpus(submission, bfs_depth=bfs_depth, judgement_categories=judgement_categories))
            self.submissions[-1]
    
    def extract_submission(self, submission):
        sub = Submission_Corpus(submission, bfs_depth=self.bfs_depth, judgement_categories=self.judgement_categories)

In [11]:

SS = Subreddit_Corpus(subreddit=subreddit, client_id=client_id, client_secret=client_secret, retrieval_limit=5, judgement_categories=judgement_categories)