In [1]:
from pprint import pprint
import praw
from praw.models import MoreComments
import requests
import json
import numpy as np
import pandas as pd
from collections import Counter
from math import ceil
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams

import nltk.data
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.tokenize import word_tokenize

from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

from math import floor

In [2]:
client_id = '71ZX5Cupn2Ohpg'
client_secret = 'nzCz5_WlQM4LbJxX-t_3m-tPgZw'

reddit = praw.Reddit(user_agent='Comment Extraction',client_id=client_id, client_secret=client_secret)
subreddit = reddit.subreddit('AmITheAsshole')

judgement_categories = ['YTA', 'NTA', 'ESH', 'NAH', 'INFO']

In [47]:
class Submission_Corpus:
    def __init__(self, submission, bfs_depth=2, judgement_categories=None, judgement_weight='upvotes'):
        self.comment_meta = []
        self.judgement_categories = judgement_categories
        self.judgements = {}
        for category in self.judgement_categories:
            self.judgements[category] = []

        self.submission = submission
        self.original_post = submission.selftext
        self.comment_forrest = self.submission.comments
        self.comment_bfs(bfs_depth=bfs_depth)       
        
    def comment_bfs(self, bfs_depth=0):
        # Initialize queue to hold comment sub trees during BFS.
        bfs_queue = []
        
        # Populate queue with first level of comments.
        for comment in self.comment_forrest:
            bfs_queue.append(comment)
            
        current_level_size = len(bfs_queue)
        next_level_size = 0
        level_count = 0
        
        while (len(bfs_queue) > 0) and (level_count < bfs_depth):
            comment = bfs_queue.pop(0)
            current_level_size -= 1
            next_level_size += 1
            
            comment_features = None
            try:
                comment_features = self.extract_comment(comment)
            except:
                pass

            if comment_features is not None:
                self.comment_meta.append(comment_features)
                for reply in comment.replies:
                    bfs_queue.append(reply)
            
            if current_level_size == 0:
                current_level_size = next_level_size
                level_count += 1
            
    def extract_comment(self, comment, judgement_extraction_method='prefix', judgement_weighting='upvotes'):
        judgement = self.extract_judgement(comment.body, extraction_method=judgement_extraction_method)
        score = comment.score if judgement_weighting=='upvotes' else 1
        self.judgements[judgement].append(score)
        body = self.tokenize_comment(comment.body)
        comment_features = {
            'id' : comment.id,
            'author': comment.author,
            'body': body,
            'score' : score,
            'judgement': judgement
        }
        return(comment_features)
    
    def extract_judgement(self, txt, extraction_method='prefix'):
        if extraction_method == 'prefix':
            for category in self.judgement_categories:
                if txt[:len(category)] == category:
                    return(category)
    
    def summarize_judgement(self):
        total_judgements = sum([sum(count) for count in self.judgements.values()])
        judgement_summary = [(category, sum(count)/total_judgements) for category, count in self.judgements.items()]
        return(judgement_summary)
    
    def get_judgement_summary(self):
        return(self.judgement_summary)
    
    def tokenize_sentence(self, sent):
        try:
            tokenized_sent = word_tokenize(sent)
        except:
            tokenized_sent = []
        tokenized_sent.insert(0, '<s>')
        tokenized_sent.append('</s>')
        return(tokenized_sent)
    
    def tokenize_comment(self, comment_txt):
        sentences = sent_detector.tokenize(comment_txt.strip())
        tokenized_sentences = [self.tokenize_sentence(sent) for sent in sentences]
        tokenized_comment = ['<c>']
        for sent in tokenized_sentences:
            tokenized_comment += sent
        tokenized_comment += ['</c>']
        return(tokenized_comment)
        
    def get_commentCorpus(self):
        comments_by_category = {}
        for category in self.judgement_categories:
            comments_by_category[category] = []
        
        for comment in self.comment_meta:
            for i in range(max(5,floor(comment['score']/50))):
                comments_by_category[comment['judgement']].append(comment['body'])
                
        return(comments_by_category)

In [48]:
SC = Submission_Corpus(reddit.submission(id='d8gkv5'), bfs_depth=4, judgement_categories=judgement_categories)

In [49]:
SC.summarize_judgement()

[('YTA', 0.9108309281482607),
 ('NTA', 0.002126689958985265),
 ('ESH', 0.0),
 ('NAH', 0.08673856904147045),
 ('INFO', 0.0003038128512836093)]

In [50]:
cc = SC.get_commentCorpus()

In [25]:
def ngram_dict(corpus):
    ngrams = list(ngrams(corpus, 3))
    

In [26]:
# create_bigramMap(corpus) returns a dictionary of dictionaries counting the occurences of bigrams from the 
# sentences of corpus. 

# THIS IS NOT A MEMORY EFFICIENT SOLUTION. For the purposes of this assignment, I have chosen to use python 
# dictionaries, which provide fast lookup at the cost of more memory (they are essentially hash maps).

# In practice, I would write a class (and corresponding functions) to would build a prefix tree. A prefix-tree
# would have O(lgn) lookup (compared to a hash table with O(1) lookup), but would use far less memory, 
# especially if the dimensionality of the n-grams were to be expanded.

# - The main dictionary has a '< count >' key which maps to the total number of bigrams in 
#   the corpus; every other key in the main dictionary maps to a dictionary of bigrams starting with the given 
#   key.

# - Each sub-dictionary has a '< count >' key which maps to the count of bigrams starting with the given 
#   main key; every other key in the sub-dictionary maps to the count of bigrams starting with the main key, 
#   and endind in the corresponding sub key. 

def create_bigramMap(corpus):
    bigramMap = {'<unk>': {'< count >': 1},}
    sentences = corpus
    num_sentences = len(sentences)
    
    # Iterate over sentences in corpus.
    for sentence in sentences:
        num_words = len(sentence)
        
        # Iterate over words in sentence
        for index, word in enumerate(sentence):
            
            # The a sentence terminator is counted as a word, but it should not be used as the first word of a 
            # bigram. Check that the current word index is not the end of the sentence.
            if(index < num_words-1):
                second_word = sentence[index+1]
                
                # If the current word already exists in the key space of the main dictionary ...
                if word in bigramMap.keys():
                    
                    # ... and the second word exists in the key space of the sub-dictionary corresponding to 
                    # the first word, then increment the count of bigrams containing the first and second word,
                    # as well as the count of all bigrams containing the first word.
                    if second_word in bigramMap[word].keys():
                        bigramMap[word][second_word]+= 1
                        bigramMap[word]['< count >'] += 1
                        
                    # ... but the second word does not exist in the key space of the dictionary corresponding to
                    # the first word, then add the second word to the key space of the first word's dictionary,
                    # a set it's count to 2 (this is for add-1 smoothing)
                    else:
                        bigramMap[word][second_word] = 2
                        bigramMap[word]['< count >'] += 2 
                        
                # If the first word does not exist in the key space of the main dictionary, add the first word
                # to the key space of the main dictionary, and map it to a new dictionary with entries for count, 
                # unk, and the second word. (Once again, second word is initialized with a count of 2 to apply 
                # smoothing)
                else:
                    bigramMap[word] = {'< count >': 3, 
                                       second_word: 2, #add-1 smoothing
                                       '<unk>': 1
                                      }
    # Calculate the total number of bigrams in bigramMap.
    bigramMap['< count >'] = sum([val['< count >'] for key, val in bigramMap.items()])
    
    return(bigramMap)

In [41]:
corpus = cc['YTA']
bm = create_bigramMap(corpus)

In [28]:
def biProb(bigram):
    try:
        first_count = bigramMap[bigram[0]]['< count >']
    except:
        return(1/bigramMap['< count >'])
        
    try:
        second_count = bigramMap[bigram[1]]['< count >']
    except:
        return(1/first_count)
    
    return(second_count/first_count)

In [29]:
def bi_sentenceProb(words):
    len_words = len(words)
    prod = 1
    for index, word in enumerate(words):
        if index < len_words-1:
            prod *= biProb((word, words[index+1]))
    return(prod)

In [30]:
def sample_word(wordMap, mode=0):
    uniProb = []
    uniWord = []
    total = wordMap['< count >']
    for key, val in wordMap.items():
        if key != '< count >':
            uniWord.append(key)
            
            if mode == 0:
                uniProb.append(val/total)
            else:
                uniProb.append(val['< count >']/total)
    
    return(np.random.choice(uniWord, p=uniProb))

In [52]:
def bigram_generateSentence(bigramMap):
    first_word = '<c>'
    sentence = []
    while first_word != '</c>':
        sentence.append(first_word)
        if first_word != '<unk>':
            first_word = sample_word(bigramMap[first_word])
        else:
            first_word = sample_word(bigramMap[first_word], mode=1)
            
    sentence.append('</c>')
    print(sentence)

In [53]:
for i in range(10):
    bigram_generateSentence(bm)

ValueError: 'a' cannot be empty unless no samples are taken

In [10]:
class Subreddit_Corpus:
    def __init__(self, subreddit=None, retrieval_limit=None, bfs_depth=3, judgement_categories=None):
        self.bfs_depth = bfs_depth
        self.judgement_categories = judgement_categories
        self.subreddit = subreddit
        self.corpus = []
        self.submissions = []
        
        for submission in self.subreddit.new(limit=retrieval_limit):
            self.submissions.append(Submission_Corpus(submission, bfs_depth=bfs_depth, judgement_categories=judgement_categories))
            self.submissions[-1]
    
    def extract_submission(self, submission):
        sub = Submission_Corpus(submission, bfs_depth=self.bfs_depth, judgement_categories=self.judgement_categories)

In [11]:

SS = Subreddit_Corpus(subreddit=subreddit, client_id=client_id, client_secret=client_secret, retrieval_limit=5, judgement_categories=judgement_categories)