In [1]:
#### IMPORTS #####
from string import punctuation
from collections import deque
from datetime import timedelta
import ujson
import pandas as pd
from datetime import datetime
# import praw
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import argparse
import gzip
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import time
import numpy as np
import bz2


In [2]:
BANNED_LOC = "Data/banned_reddits.txt"

# Membership table of banned subreddit names (name -> True).
# Each line is stripped and its first two characters are dropped
# (presumably an "r/" prefix -- confirm against the data file).
banned_dict = {}
with open(BANNED_LOC) as banned_file:
    banned_dict.update((entry.strip()[2:], True) for entry in banned_file)

In [3]:
# Load the ~14000 rated emotive words and build a word -> rating lookup
# from the "Word" and "V.Mean.Sum" columns of the CSV.
EMOT_LOC = "Data/BRM-emot-submit.csv"
emote_df = pd.read_csv(EMOT_LOC)
emot_arr = emote_df[["Word", "V.Mean.Sum"]].values

# Each row of emot_arr is a (word, rating) pair; a later duplicate word
# overwrites an earlier one.
emot_dict = {entry: rating for entry, rating in emot_arr}
    
# Deletion table built once at definition time: maps every character in
# string.punctuation to None so str.translate drops it.
_PUNCT_TABLE = str.maketrans("", "", punctuation)


def clean_text(text):
    """Tokenise ``text`` into lower-cased words with ASCII punctuation removed.

    Punctuation characters (``string.punctuation``) are deleted, not replaced
    by a space, so ``"21,now"`` becomes the single token ``"21now"``.

    Parameters
    ----------
    text : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        Whitespace-separated tokens, lower-cased. Empty list if nothing is
        left after removing punctuation and whitespace.
    """
    # split() with no arguments already ignores leading/trailing whitespace,
    # so no explicit strip() is needed.
    stripped = text.translate(_PUNCT_TABLE)
    return [token.lower() for token in stripped.split()]

# old code.
#     # (http://docs.python.org/3/library/stdtypes.html#str.join)

#     punctuations = '''!()\-[]{};:'"\,<>./?@#$%^&*_~|''';
#     for char in punctuations:
#         text = text.replace(char, '')
#     text = text.replace('\n','') + ' '
#     text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
#     return text

def get_emotion(word_list, lexicon=None):
    """Score a list of words against a word -> rating lookup.

    Words that are not present in the lookup are ignored.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to score (e.g. the output of ``clean_text``).
    lexicon : mapping, optional
        Word -> numeric rating. Defaults to the module-level ``emot_dict``.

    Returns
    -------
    tuple
        ``(mean, sum)`` of the ratings of the words found in the lookup, or
        ``(np.nan, np.nan)`` when none of the words are rated.
    """
    ratings = emot_dict if lexicon is None else lexicon
    vals = [ratings[word] for word in word_list if word in ratings]
    if not vals:
        return np.nan, np.nan
    # Build the array once and reuse it for both statistics.
    scores = np.array(vals)
    return np.mean(scores), np.sum(scores)
    
print("Emotive ratings for sample words.")

# Sanity check: look up a few words directly in emot_dict.
# A word missing from the lookup raises KeyError here.
sample_words = ("dead", "cry", "party", "born", "sleep")
for word in sample_words:
    print(word, emot_dict[word])
    


Emotive ratings for sample words.
dead 2.02
cry 3.22
party 7.18
born 7.33
sleep 7.22


In [6]:
t0 = time.time()

# Accumulate parsed posts in a deque: appends stay cheap for large counts.
q = deque()

# Settings for the read loop.
NUM_POSTS_TO_READ = 2e5  # stop after this many lines have been parsed
MSG_PREV = 30  # not referenced by the code in this cell

# NOTE(review): MESSAGES_LOC is defined but not used below; the loop reads the
# hard-coded bz2 path instead (and the two names refer to different months).
MESSAGES_LOC = "Data/RC_2019-01"

# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this file exists.
with bz2.open("/Users/David/Downloads/RC_2015-01.bz2") as f:
    num_read = 0
    for num_read, line in enumerate(f, start=1):
        if num_read > NUM_POSTS_TO_READ:
            break
        post = ujson.loads(line)  # one JSON object per line
        cleaned_text = clean_text(post["body"])
        emot_avg, emot_sum = get_emotion(cleaned_text)
        # Annotate the post in place; the keyword order fixes the order in
        # which the new keys are added to the record.
        post.update(
            banned=post["subreddit"] in banned_dict,
            emotionAvg=emot_avg,
            emotionSum=emot_sum,
            bodySan=" ".join(cleaned_text),
        )
        q.append(post)

# One DataFrame row per parsed post.
df = pd.DataFrame(q)
t1 = time.time()
print("took: %.2f"%(t1-t0))

took: 21.10


In [7]:
# Column selection only: because this is not the last expression of the cell,
# nothing is displayed (it still raises if a column is missing).
df[["subreddit", "author", "body","bodySan", "banned", "emotionAvg", "emotionSum"]]

# Split the posts by the "banned" flag.
is_banned = df["banned"]
tmpBanned = df[is_banned]
tmpNotBanned = df[is_banned != True]

# (format string, frame, column) for each summary line, in print order.
summary_rows = [
    ("B  posts have avg per post emotion of mean:    %4.2f, std: %4.2f", tmpBanned, "emotionAvg"),
    ("NB posts have avg per post emotion of mean:    %4.2f, std: %4.2f", tmpNotBanned, "emotionAvg"),
    ("B  posts have summed per post emotion of mean: %4.2f, std: %4.2f", tmpBanned, "emotionSum"),
    ("NB posts have summed per post emotion of mean: %4.2f, std: %4.2f", tmpNotBanned, "emotionSum"),
]
for template, frame, column in summary_rows:
    scores = frame[column]
    # The nan-aware reducers skip posts whose score is NaN.
    print(template % (np.nanmean(scores), np.nanstd(scores)))
    
    

B  posts have avg per post emotion of mean:    5.65, std: 0.84
NB posts have avg per post emotion of mean:    5.82, std: 0.81
B  posts have summed per post emotion of mean: 47.63, std: 59.62
NB posts have summed per post emotion of mean: 51.19, std: 88.73


In [8]:
# Column selection only: not the last expression of the cell, so nothing is displayed.
df[["subreddit", "author", "body","bodySan", "banned", "emotionAvg", "emotionSum"]]

# Split the posts by the "banned" flag.
is_banned = df["banned"]
tmpBanned = df[is_banned]
tmpNotBanned = df[is_banned != True]

# Print one line per banned-subreddit post: subreddit, author, average
# emotion score, and the repr of the first 100 characters of the body.
preview = tmpBanned[["subreddit", "author", "emotionAvg", "body"]].values
for subreddit, author, emotion_avg, body in preview:
    # Skip placeholder bodies.
    if body in ("[deleted]", "[removed]"):
        continue
    print ("%-20s"%subreddit, "%-20s"%author, "%4.2f"%emotion_avg, "  %-100s"%repr(body[:100]))

beertrade            TimDisaster          5.97   "I haven't. Still trying to get someone to commit. \n\nWhere are you located?"                      
fakeid               meowmixID             nan   'im 21now thooo'                                                                                    
DarkNetMarkets       [deleted]            7.50   'Doms great'                                                                                        
fakeid               thatguyuno           6.30   'Has anyone been able to get in contact with /u/metsfan191 in the last few days?'                   
fatpeoplehate        Milhouse_is_a_meme   5.04   'Yeah. She calls herself turtle or some shit. '                                                     
fatpeoplehate        strawberrycircus     6.40   'I worked at one on a college campus once and we sometimes ran out of whipped cream. I once got yelle'
fakeid               WolfID               5.16   'Aight, aight just chill lol'                    