In [2]:
from pprint import pprint
import praw
from praw.models import MoreComments
import requests
import json
import numpy as np
import pandas as pd
from collections import Counter
from math import ceil
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams

import nltk.data
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.tokenize import word_tokenize

from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

from math import floor
import copy

In [3]:
client_id = '71ZX5Cupn2Ohpg'
client_secret = 'nzCz5_WlQM4LbJxX-t_3m-tPgZw'

reddit = praw.Reddit(user_agent='Comment Extraction',client_id=client_id, client_secret=client_secret)
subreddit = reddit.subreddit('AmITheAsshole')

judgement_categories = ['YTA', 'NTA', 'ESH', 'NAH', 'INFO']

In [4]:
class Submission_Corpus:
    def __init__(self, submission, bfs_depth=2, judgement_categories=None, judgement_weight='upvotes'):
        self.comment_meta = []
        self.judgement_categories = judgement_categories
        self.judgement_weight = judgement_weight
        self.judgements = {}
        for category in self.judgement_categories:
            self.judgements[category] = []

        self.submission = submission
        self.original_post = submission.selftext
        self.comment_forrest = self.submission.comments
        self.comment_bfs(bfs_depth=bfs_depth)       
        
    def comment_bfs(self, bfs_depth=0):
        # Initialize queue to hold comment sub trees during BFS.
        bfs_queue = []
        
        # Populate queue with first level of comments.
        for comment in self.comment_forrest:
            bfs_queue.append(comment)
            
        current_level_size = len(bfs_queue)
        next_level_size = 0
        level_count = 0
        
        while (len(bfs_queue) > 0) and (level_count < bfs_depth):
            comment = bfs_queue.pop(0)
            current_level_size -= 1
            next_level_size += 1
            
            comment_features = None
            try:
                comment_features = self.extract_comment(comment)
            except:
                pass

            if comment_features is not None:
                self.comment_meta.append(comment_features)
                for reply in comment.replies:
                    bfs_queue.append(reply)
            
            if current_level_size == 0:
                current_level_size = next_level_size
                level_count += 1
            
    def extract_comment(self, comment, judgement_extraction_method='prefix', judgement_weighting='upvotes'):
        judgement = self.extract_judgement(comment.body, extraction_method=judgement_extraction_method)
        score = comment.score if judgement_weighting=='upvotes' else 1
        self.judgements[judgement].append(score)
        body = self.tokenize_comment(comment.body)
        comment_features = {
            'id' : comment.id,
            'author': comment.author,
            'body': body,
            'score' : score,
            'judgement': judgement
        }
        return(comment_features)
    
    def extract_judgement(self, txt, extraction_method='prefix'):
        if extraction_method == 'prefix':
            for category in self.judgement_categories:
                if txt[:len(category)] == category:
                    return(category)
    
    def summarize_judgement(self):
        total_judgements = sum([sum(count) for count in self.judgements.values()])
        judgement_summary = [(category, sum(count)/total_judgements) for category, count in self.judgements.items()]
        return(judgement_summary)
    
    def get_judgement_summary(self):
        return(self.judgement_summary)
    
    def tokenize_sentence(self, sent):
        try:
            tokenized_sent = word_tokenize(sent)
        except:
            tokenized_sent = []
        tokenized_sent.insert(0, '<s>')
        tokenized_sent.append('</s>')
        return(tokenized_sent)
    
    def tokenize_comment(self, comment_txt):
        sentences = sent_detector.tokenize(comment_txt.strip())
        tokenized_sentences = [self.tokenize_sentence(sent) for sent in sentences]
        tokenized_comment = ['<c>']
        for sent in tokenized_sentences:
            tokenized_comment += sent
        tokenized_comment += ['</c>']
        return(tokenized_comment)
        
    def get_commentCorpus(self):
        comments_by_category = {}
        for category in self.judgement_categories:
            comments_by_category[category] = []
        
        
        for comment in self.comment_meta:
            if self.judgement_weight == 'upvotes':
                for i in range(max(5,floor(comment['score']/50))):
                    comments_by_category[comment['judgement']].append(comment['body'])
                    
            else:
                comments_by_category[comment['judgement']].append(comment['body'])            
                
        return(comments_by_category)
    
    #def get_weightedCorpus(self):
        

In [5]:
SC = Submission_Corpus(reddit.submission(id='dv9ogm'), bfs_depth=1, judgement_categories=judgement_categories, judgement_weight='none')

In [6]:
SC.summarize_judgement()

[('YTA', 0.996140231962645),
 ('NTA', 0.0),
 ('ESH', 0.0),
 ('NAH', 0.0),
 ('INFO', 0.0038597680373550236)]

In [7]:
cc = SC.get_commentCorpus()

In [30]:
print(cc['YTA'][0])

[[-5.64015657e-03 -4.50015505e-04  1.53206079e-03 ...  3.43086594e-03
   3.60768614e-03  4.01649484e-03]
 [-3.19504808e-03 -8.14515166e-03 -5.08469006e-04 ...  7.40202551e-04
  -1.55743340e-03  2.72040116e-03]
 [-1.49010809e-03  1.00687763e-03 -3.07938620e-03 ... -8.15559761e-05
   2.66470917e-04  2.20775069e-03]
 ...
 [ 7.72689018e-05 -5.96411992e-03 -6.78065140e-03 ...  5.32421935e-03
  -1.03778215e-02  2.74631870e-03]
 [-2.68363557e-03 -9.71419737e-03 -1.70469098e-03 ... -1.47615839e-03
  -2.59719999e-03 -1.61064672e-03]
 [-1.91855163e-03 -6.73846516e-05 -5.53495833e-04 ...  4.35119146e-04
  -9.64972132e-04 -2.16196384e-03]]


In [8]:
for i in ngrams(cc['NTA'][0],3):
    print(i)

('<c>', '<s>', 'NTA')
('<s>', 'NTA', '.')
('NTA', '.', '</s>')
('.', '</s>', '<s>')
('</s>', '<s>', 'You')
('<s>', 'You', 'were')
('You', 'were', 'uncomfortable')
('were', 'uncomfortable', 'with')
('uncomfortable', 'with', 'some')
('with', 'some', 'behaviors')
('some', 'behaviors', ',')
('behaviors', ',', 'you')
(',', 'you', 'addressed')
('you', 'addressed', 'it')
('addressed', 'it', 'directly')
('it', 'directly', '.')
('directly', '.', '</s>')
('.', '</s>', '<s>')
('</s>', '<s>', 'If')
('<s>', 'If', 'he')
('If', 'he', 'wants')
('he', 'wants', 'to')
('wants', 'to', 'respect')
('to', 'respect', 'your')
('respect', 'your', 'boundaries')
('your', 'boundaries', 'your')
('boundaries', 'your', 'friendship')
('your', 'friendship', 'can')
('friendship', 'can', 'move')
('can', 'move', 'on')
('move', 'on', ',')
('on', ',', 'if')
(',', 'if', 'not')
('if', 'not', 'the')
('not', 'the', 'responsibility')
('the', 'responsibility', 'is')
('responsibility', 'is', 'on')
('is', 'on', 'him')
('on', 'him',

In [8]:
def make_ngram_corpus(corpus, N=3):
    ngram_tups_list = list(ngrams(corpus, N))
    ngram_corpus = []
    for gi in ngram_tups_list:
        if not gi[0:N-1] in [x[0] for x in ngram_corpus]:
            next_word_dict = {}
            
            for gj in ngram_tups_list:
                if gj[0:N-1] == gi[0:N-1]:
                    if gj[N-1] in next_word_dict.keys():
                        next_word_dict[gj[N-1]] += 1
                    else:
                        next_word_dict[gj[N-1]] = 1
            gi_count = sum(next_word_dict.values())
            next_word_prob_tups = tuple([(key, value/gi_count) for key, value in next_word_dict.items()])
            ngram_corpus.append((gi[0:N-1], gi_count, next_word_prob_tups))
    return(ngram_corpus)
    

In [89]:
flat_comment_corpus = []
for comment in cc['NTA']:
    for token in comment:
        flat_comment_corpus.append(token)
        
ngram_corpus = make_ngram_corpus(flat_comment_corpus,3)

In [9]:
def ngram_generate_comment(corpus, N):
    comment_starts = []
    for gi  in corpus:
        if gi[0][0] == '<c>':
            comment_starts.append(gi)
    num_starts = sum([gi[1] for gi in comment_starts])
    probs=[gi[1]/num_starts for gi in comment_starts]
    comment_start = comment_starts[np.random.choice(list(range(len(comment_starts))), p=probs)]
    comment = [w for w in comment_start[0]]
    while comment[-1] != '</c>':
        prev_gram = tuple(comment[-(N-1):])
        nnext_choices = None
        for gi in corpus:
            if gi[0] == prev_gram:
                next_choices = gi[2]
        if next_choices is not None:
            next_probs = [x[1] for x in next_choices]
            next_word = next_choices[np.random.choice(list(range(len(next_choices))),p=next_probs)][0]
            comment.append(next_word)
        else:
            print('N-Gram not found in corpus')
    return(comment)

In [91]:
for i in range(10):
    comment = ' '.join(ngram_generate_comment(ngram_corpus, 3))
    print(comment)

ValueError: 'a' cannot be empty unless no samples are taken

In [8]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, GRU, Dense, Activation
from gensim.models import Word2Vec

Using TensorFlow backend.


In [10]:
def build_rnn_corpus(cc, wv, N=10):
    all_comments = []
    all_words = []
    category_inds = np.arange(len(judgement_categories))
    np.random.shuffle(category_inds)

    for category_ind in category_inds:
        comments = cc[judgement_categories[category_ind]]
        comment_inds = np.arange(len(comments))
        np.random.shuffle(comment_inds)
        for comment_ind in comment_inds:
            all_comments.append(comments[comment_ind])
            for token in comments[comment_ind]:
                all_words.append(token)
                
    all_ngrams = ngrams(all_words,N)
    X_train = []
    y_train = []
    for ngram in all_ngrams:
        X_train.append([wv[token] for token in ngram[:-1]])
        y_train.append(wv[ngram[-1]])

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    
    return(X_train, y_train)

In [11]:
all_comments = []
all_words = []
category_inds = np.arange(len(judgement_categories))
np.random.shuffle(category_inds)

for category_ind in category_inds:
    comments = cc[judgement_categories[category_ind]]
    comment_inds = np.arange(len(comments))
    np.random.shuffle(comment_inds)
    for comment_ind in comment_inds:
        all_comments.append(comments[comment_ind])
        for token in comments[comment_ind]:
            all_words.append(token)

word2vec = Word2Vec(all_comments, min_count=1)            

NameError: name 'Word2Vec' is not defined

In [95]:
#all_ngrams = ngrams(all_words,5)
#X_train = []
#y_train = []
#for ngram in all_ngrams:
#    X_train.append([word2vec.wv[token] for token in ngram[:-1]])
#    y_train.append(word2vec.wv[ngram[-1]])
#    
#X_train = np.array(X_train)
#y_train = np.array(y_train)
#X_train = np.array([word2vec.wv[token] for token in all_words])
#y_train

In [96]:
#X_train, y_train = build_rnn_corpus(cc, word2vec.wv)

In [101]:
model = Sequential()
model.add(GRU(units=32, return_sequences=True))
model.add(Dense(100))
model.compile(optimizer='adam', loss='mean_squared_error')

In [102]:
X_train, y_train = build_rnn_corpus(cc, word2vec.wv)
model.fit(X_train[:,:,:], y_train[:,:], shuffle=False, batch_size=16, epochs=1)
    

ValueError: Error when checking target: expected dense_14 to have 3 dimensions, but got array with shape (3649, 100)

In [99]:
y_pred = model.predict(X_train[:0,:,:])

In [100]:
s = [word2vec.wv.similar_by_vector(X_train[0,i,:], topn=1)[0][0] for i in range(X_train.shape[1])]
print(y_pred)
for pred in y_pred:
    print(word2vec.wv.similar_by_vector(pred, topn=1)[0][0])
    s.append(word2vec.wv.similar_by_vector(pred, topn=1)[0][0])
print(s)


[]
['<c>', '<s>', 'INFO', '>', 'My', 'husbands', 'parents', 'are', 'sympathetic']


In [9]:
import tensorflow as tf
import numpy as np
import keras.backend as K
from tqdm import tqdm
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, Dense, GRU, TimeDistributed, AveragePooling1D, Flatten
    
class Comment_GRU(object):
    def __init__(self, judgements, comment_corpus, N=4, batch_size=16, epochs=10):    
        self.judgements = judgements
        self.comment_corpus = comment_corpus
        self.w2v = self.build_word2vec()
        self.vectorize_corpus()
        self.N = N
        self.vocab_size = len(self.w2v.wv.vocab)
        self.embed_size = 100
        self.seed = None
        self.batch_size = batch_size
        self.epochs = epochs
        
        self.model = Sequential()
        self.model.add(GRU(units=32, batch_input_shape=(self.batch_size, self.N-1, 100), stateful=True, return_sequences=True))
        self.model.add(TimeDistributed(Dense(1)))
        #self.model.add(AveragePooling1D())
        self.model.add(Flatten())
        self.model.add(Dense(100))
        self.model.compile(optimizer='adam', loss='mean_squared_error')
        
        for epoch in range(self.epochs):
            print('Epoch: {}'.format(epoch+1))
            Xc,yc = self.build_dataset() 
            X = np.concatenate(list(filter(lambda vals: True if vals.shape[0]>0 else False, Xc.values())),axis=0)
            y = np.concatenate(list(filter(lambda vals: True if vals.shape[0]>0 else False, yc.values())),axis=0)
            for i in range(int(X.shape[0]/self.batch_size)):
                self.model.train_on_batch(X[self.batch_size*i:self.batch_size*(i+1),:,:], y[self.batch_size*i:self.batch_size*(i+1),:])
    
    def apply_batch_windowing(self, c):
        num_subsequences = (c.shape[0]-self.N)+1
        start_row = 0 if num_subsequences > self.batch_size else self.batch_size - num_subsequences
        windowed_c = np.zeros((self.batch_size, self.N-1, 100))
        for i in range(start_row,self.batch_size):
            windowed_c[i,:,:] = c[i:i+self.N-1,:]
        
        return(windowed_c)
    
    def commentMatrix_to_commentSentence(self, comment_mat):
        comment_wordVecs = [comment_mat[0,i,:] for i in range(comment_mat.shape[1]-1)]
        for subsequence_ind in range(comment_mat.shape[0]):
            comment_wordVecs.append(comment_mat[subsequence_ind,-1,:])
        comment_words = [self.w2v.wv.similar_by_vector(wordVec, topn=1)[0][0] for wordVec in comment_wordVecs]
        print(comment_words)
            
    
    def generate_comment(self, judgement, max_len):
        #num_subsequences = (self.comment_corpus[judgement][0].shape[0]-self.N)+1
        #start_row = 0 if num_subsequences > self.batch_size else self.batch_size - num_subsequences
        #seed = np.zeros((self.batch_size, self.N-1, 100))
        #for i in range(start_row,self.batch_size):
        #    seed[i,:,:] = self.comment_corpus[judgement][0][:self.N-1,:]
        
        seed = self.apply_batch_windowing(self.comment_corpus[judgement][0])
        #seed.reshape
        #seed = self.comment_corpus[judgement][0][:self.N-1,:].reshape((1,self.N-1,100))
        #seed = np.array([self.comment_corpus[judgement][0][0,:]]).reshape((1,1,100))
        comment = None
        for i in range(max_len):
            self.commentMatrix_to_commentSentence(seed)
            #print(seed.shape)
            next_subsequence = self.model.predict(seed)
            #print(next_word.shape)
            if comment is None:
                comment = next_subsequence[1:,:].reshape((1,self.N-1,100))
            else:
                comment = np.concatenate((comment,next_subsequence[1:,:].reshape((1,self.N-1,100))), axis=0)
                #print(comment.shape)
            seed = np.concatenate((seed[1:,:,:].reshape((self.batch_size-1,self.N-1,100)),comment[-1,:,:].reshape((1,self.N-1,100))), axis=0).reshape(self.batch_size, self.N-1, 100)
            
        self.commentMatrix_to_commentSentence(comment)
            
    # TO DO: Set p to comment upvote score to change sampling distribution with respect to comment popularity.
    #def build_batch(self, p=None):
        
        
    def build_dataset(self):
        # Use np.random.permutation to create a new ordering of comments for each judgement category. 
        # self.comment_corpus.values() gives the comments for each category, i.e. each element of the list 
        # returned by .values() is the list of comments for a particular category, and using len gives the 
        # number of comments for each category. The function np.arrange creates the default ordering 
        # (0,1,...len(vals)-1), and np.random.permutation gives a new permuted ordering.
        shuffled_comment_inds = list(map(lambda vals: np.random.permutation(np.arange(len(vals))), self.comment_corpus.values()))
        
        # Zip the new comment orderings back to the judgement category labels.
        category_comment_dict = dict(zip(self.comment_corpus.keys(), shuffled_comment_inds))
        
        X = {}
        y = {}
        for category_name, category_comment_inds in category_comment_dict.items():
            X[category_name] = []
            y[category_name] = []
            for comment_ind in category_comment_inds:
                num_subsequences = self.comment_corpus[category_name][comment_ind].shape[0]-self.N
                for i in range(num_subsequences):
                    X[category_name].append(self.comment_corpus[category_name][comment_ind][i:i+self.N-1,:])
                    y[category_name].append(self.comment_corpus[category_name][comment_ind][i+self.N,:])
            X[category_name] = np.array(X[category_name])
            y[category_name] = np.array(y[category_name])
        return(X,y)
    
    def build_word2vec(self):
        all_comments = []
        all_words = []
        category_inds = np.arange(len(self.judgements))
        for category_ind in category_inds:
            for comment_ind in np.arange(len(self.comment_corpus[self.judgements[category_ind]])):
                all_comments.append(self.comment_corpus[self.judgements[category_ind]][comment_ind])
                #for token in self.comment_corpus[self.judgements[category_ind]][comment_ind]:
                #    all_words.append(token)

        return(Word2Vec(all_comments, min_count=1))
        
    def vectorize_corpus(self):
        for category_ind in np.arange(len(self.judgements)):
            comment_inds = np.arange(len(self.comment_corpus[self.judgements[category_ind]]))
            
            for comment_ind in comment_inds:
                token_inds = np.arange(len(self.comment_corpus[self.judgements[category_ind]][comment_ind]))
                
                for token_ind in token_inds:
                    self.comment_corpus[self.judgements[category_ind]][comment_ind][token_ind] = self.w2v.wv[self.comment_corpus[self.judgements[category_ind]][comment_ind][token_ind]]
                
                self.comment_corpus[self.judgements[category_ind]][comment_ind] = np.array(self.comment_corpus[self.judgements[category_ind]][comment_ind])


In [14]:
cc_copy = copy.deepcopy(cc)
c_gru = Comment_GRU(judgement_categories, cc_copy, N=32, batch_size=32, epochs=100 )

Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20
Epoch: 21
Epoch: 22
Epoch: 23
Epoch: 24
Epoch: 25
Epoch: 26
Epoch: 27
Epoch: 28
Epoch: 29
Epoch: 30
Epoch: 31
Epoch: 32
Epoch: 33
Epoch: 34
Epoch: 35
Epoch: 36
Epoch: 37
Epoch: 38
Epoch: 39
Epoch: 40
Epoch: 41
Epoch: 42
Epoch: 43
Epoch: 44
Epoch: 45
Epoch: 46
Epoch: 47
Epoch: 48
Epoch: 49
Epoch: 50
Epoch: 51
Epoch: 52
Epoch: 53
Epoch: 54
Epoch: 55
Epoch: 56
Epoch: 57
Epoch: 58
Epoch: 59
Epoch: 60
Epoch: 61
Epoch: 62
Epoch: 63
Epoch: 64
Epoch: 65
Epoch: 66
Epoch: 67
Epoch: 68
Epoch: 69
Epoch: 70
Epoch: 71
Epoch: 72
Epoch: 73
Epoch: 74
Epoch: 75
Epoch: 76
Epoch: 77
Epoch: 78
Epoch: 79
Epoch: 80
Epoch: 81
Epoch: 82
Epoch: 83
Epoch: 84
Epoch: 85
Epoch: 86
Epoch: 87
Epoch: 88
Epoch: 89
Epoch: 90
Epoch: 91
Epoch: 92
Epoch: 93
Epoch: 94
Epoch: 95
Epoch: 96
Epoch: 97
Epoch: 98
Epoch: 99
Epoch: 100


In [15]:
c_gru.generate_comment('YTA', 20)

['<c>', '<s>', 'YTA', '.', '</s>', '<s>', 'I', '’', 'm', 'trying', 'to', 'say', 'that', 'as', 'kindly', 'as', 'possible', 'but', 'why', 'would', 'you', 'think', 'someone', 'who', 'is', 'adamantly', 'against', 'having', 'children', 'would', 'want', 'to', 'carry', 'yours', '?', '</s>', '<s>', 'If', 'your', 'husband', 'is', 'that', 'close', 'to', 'his', 'sister', ',', 'I', 'would', 'have', 'thought', 'that', 'he', 'would', 'know', 'better', 'than', 'to', 'make', 'such', 'a', 'tone']
['<s>', 'YTA', '.', '</s>', '<s>', 'I', '’', 'm', 'trying', 'to', 'say', 'that', 'as', 'kindly', 'as', 'possible', 'but', 'why', 'would', 'you', 'think', 'someone', 'who', 'is', 'adamantly', 'against', 'having', 'children', 'would', 'want', 'to', 'carry', 'yours', '?', '</s>', '<s>', 'If', 'your', 'husband', 'is', 'that', 'close', 'to', 'his', 'sister', ',', 'I', 'would', 'have', 'thought', 'that', 'he', 'would', 'know', 'better', 'than', 'to', 'make', 'such', 'a', 'tone', '</s>']
['YTA', '.', '</s>', '<s>', '

In [76]:
class Subreddit_Corpus:
    def __init__(self, subreddit=None, retrieval_limit=None, bfs_depth=3, judgement_categories=None):
        self.bfs_depth = bfs_depth
        self.judgement_categories = judgement_categories
        self.subreddit = subreddit
        self.corpus = []
        self.submissions = []
        
        for submission in self.subreddit.new(limit=retrieval_limit):
            self.submissions.append(Submission_Corpus(submission, bfs_depth=bfs_depth, judgement_categories=judgement_categories))
            self.submissions[-1]
    
    def extract_submission(self, submission):
        sub = Submission_Corpus(submission, bfs_depth=self.bfs_depth, judgement_categories=self.judgement_categories)

In [11]:

SS = Subreddit_Corpus(subreddit=subreddit, client_id=client_id, client_secret=client_secret, retrieval_limit=5, judgement_categories=judgement_categories)