In [1]:
#### IMPORTS #####
from string import punctuation
from collections import deque
from datetime import timedelta
import ujson
import pandas as pd
from datetime import datetime
# import praw
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import argparse
import gzip
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import time
import numpy as np
import bz2


In [2]:
BANNED_LOC = "Data/banned_reddits.txt"

# Lookup table of banned subreddits: every key maps to True so that later
# code can test membership with `name in banned_dict`.
# Each line is stripped of surrounding whitespace and its first two
# characters are dropped (presumably an "r/" prefix -- confirm against the
# data file).
banned_dict = {}
with open(BANNED_LOC) as banned_file:
    banned_dict.update((entry.strip()[2:], True) for entry in banned_file)

In [4]:
# Load the ~14000 rated emotive words and build a word -> rating dictionary
# from the "Word" and "V.Mean.Sum" columns of the CSV.
EMOT_LOC = "Data/BRM-emot-submit.csv"
emote_df = pd.read_csv(EMOT_LOC)

# Two-column array: one (word, rating) row per CSV record.
emot_arr = emote_df[["Word", "V.Mean.Sum"]].values

# If a word appears more than once, the last rating read wins.
emot_dict = {rated_word: rating for rated_word, rating in emot_arr}
    
# Translation table that deletes every character in string.punctuation.
# Built once here rather than rebuilding a set of characters on every call.
_PUNCT_TABLE = str.maketrans("", "", punctuation)


def clean_text(text):
    """Return the lower-cased words of ``text`` with punctuation removed.

    Punctuation characters (``string.punctuation``) are deleted outright, not
    replaced by spaces, so "it's" becomes "its" and "rock&roll" becomes
    "rockroll". The remaining text is split on runs of whitespace.

    Args:
        text: The string to clean.

    Returns:
        A list of lower-cased words; empty when ``text`` contains no
        non-punctuation, non-whitespace characters.
    """
    # str.translate does the per-character filtering in C; split() with no
    # arguments already ignores leading/trailing whitespace.
    return [word.lower() for word in text.translate(_PUNCT_TABLE).split()]

def get_emotion(word_list):
    """Summarise the emotive ratings of the rated words in ``word_list``.

    Words that are not keys of the module-level ``emot_dict`` are ignored.

    Args:
        word_list: Iterable of words to look up.

    Returns:
        A ``(mean, sum)`` tuple over the ratings that were found, or
        ``(np.nan, np.nan)`` when none of the words has a rating.
    """
    ratings = [emot_dict[token] for token in word_list if token in emot_dict]
    if not ratings:
        return np.nan, np.nan

    rating_arr = np.array(ratings)
    return np.mean(rating_arr), np.sum(rating_arr)
    
# Sanity check: show the rating stored in emot_dict for a few words.
# A word missing from emot_dict raises KeyError here.
print("Emotive ratings for sample words.")
for word in ("dead", "cry", "party", "born", "sleep"):
    print(word, emot_dict[word])
    


Emotive ratings for sample words.
dead 2.02
cry 3.22
party 7.18
born 7.33
sleep 7.22


In [6]:
t0 = time.time()

# Accumulate posts in a deque: appending many items is cheap, and the whole
# collection is turned into a DataFrame once at the end.
q = deque()

# Limits for reading the JSON data. MSG_PREV is not used in this cell.
NUM_POSTS_TO_READ = 2e5
MSG_PREV = 30

with bz2.open("Data/RC_2016-10.bz2") as f:
    num_read = 0
    for num_read, line in enumerate(f, start=1):
        # Stop once NUM_POSTS_TO_READ lines have been processed.
        if num_read > NUM_POSTS_TO_READ:
            break

        post = ujson.loads(line)
        cleaned_text = clean_text(post["body"])
        emot_avg, emot_sum = get_emotion(cleaned_text)

        # Annotate the raw post with the banned flag, its emotion scores and
        # the sanitised (punctuation-free, lower-cased) body.
        post.update(
            banned=post["subreddit"] in banned_dict,
            emotionAvg=emot_avg,
            emotionSum=emot_sum,
            bodySan=' '.join(cleaned_text),
        )
        q.append(post)

df = pd.DataFrame(q)
t1 = time.time()
print("took: %.2f" % (t1 - t0))

took: 21.49


In [7]:
# Compare per-post emotion scores between banned ("B") and non-banned ("NB")
# subreddits.
#
# The bare df[[...]] expression that used to open this cell was dropped: only
# the last expression of a cell is displayed, so it built a slice and threw it
# away without printing anything.

# Boolean-mask selection picks the same rows (with the same index) as passing
# the np.where(...) tuple to .iloc.
banned_mask = df["banned"].astype(bool)
tmpBanned = df.loc[banned_mask]
tmpNotBanned = df.loc[~banned_mask]

# One line per (score column, group). NaN scores (posts with no rated words)
# are ignored by nanmean / nanstd. The %-2s / %-32s paddings keep the numbers
# aligned across the four lines.
for column, description in (
    ("emotionAvg", "avg per post emotion of mean:"),
    ("emotionSum", "summed per post emotion of mean:"),
):
    for label, posts in (("B", tmpBanned), ("NB", tmpNotBanned)):
        print("%-2s posts have %-32s %4.2f, std: %4.2f" % (
            label,
            description,
            np.nanmean(posts[column]),
            np.nanstd(posts[column]),
        ))
    
    

B  posts have avg per post emotion of mean:    5.64, std: 0.84
NB posts have avg per post emotion of mean:    5.77, std: 0.80
B  posts have summed per post emotion of mean: 43.21, std: 56.67
NB posts have summed per post emotion of mean: 51.71, std: 85.05


In [8]:
# List every post from a banned subreddit: subreddit, author, average emotion
# score and the first 100 characters of the body.
#
# The bare df[[...]] expression that used to open this cell was dropped: it was
# not the last expression of the cell, so its result was never displayed.

# Boolean-mask selection picks the same rows (with the same index) as passing
# the np.where(...) tuple to .iloc.
banned_mask = df["banned"].astype(bool)
tmpBanned = df.loc[banned_mask]
tmpNotBanned = df.loc[~banned_mask]

banned_rows = tmpBanned[["subreddit", "author", "emotionAvg", "body"]]
for subreddit, author, emotion_avg, body in banned_rows.itertuples(index=False, name=None):
    # Skip posts whose body is one of the two placeholder strings.
    if body in ("[deleted]", "[removed]"):
        continue
    # repr() keeps newlines and quotes visible on a single output line.
    print("%-20s" % subreddit, "%-20s" % author, "%4.2f" % emotion_avg, "  %-100s" % repr(body[:100]))

watchpeopledie       Rifiuto              7.73   'Thanks!'                                                                                           
DebateFascism        123456789012345a     4.38   "So it's not a Jewish problem.  It is a rich vs poor problem."                                      
RCSources            Neurobomb            6.23   'I got the same thing. \n\nI think it may be Flub. And strong stuff at that. \n\nI was offered a reship o'
CringeAnarchy        kaythebae            7.33   'You first :) Xx\n'                                                                                 
CringeAnarchy        Whatthefuckamisaying 6.94   '3.WOW! Great moves! Keep it up, proud of you!'                                                     
uncensorednews       Remi_Autor           5.51   'In order to maintain control.'                                                                     
sjwhate              weebee980            5.56   "It's not my fault I'm royalty born in a peas