In [178]:
import pandas as pd
import numpy as np
import pprint as pp
import re, string
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import praw
from datetime import datetime
from nltk.corpus import stopwords
from sklearn.feature_extraction import text 
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import credentials as creds

## import custom helper functions
import helpers as h

In [22]:
# https://praw.readthedocs.io/en/latest/getting_started/
# https://www.reddit.com/dev/api/

In [23]:
# Read the Reddit API client id and secret from the local `credentials` module.
CLIENT_ID = creds.client_id()
CLIENT_SECRET_KEY = creds.client_secret_key()


# Build the PRAW client used by every Reddit-facing helper below.
# No username/password is supplied, only the app id/secret and a user agent.
r = praw.Reddit(client_id = CLIENT_ID,
                client_secret = CLIENT_SECRET_KEY,
                user_agent = 'RedditorMatch')

# Report whether the client is running in read-only mode.
print(r.read_only)

True


In [24]:
# Names of the subreddits with scraped comment data; getDf() loads "data/<name>.csv" for a given name.
scraped_subreddits = ["mizzou_copy", "umich"]

In [25]:
def find_similar(matrix, index, top_n = 5):
    """Rank the other rows of ``matrix`` by linear-kernel score against row ``index``.

    Parameters
    ----------
    matrix : 2-D row-indexable matrix accepted by ``linear_kernel``.
    index : position of the query row inside ``matrix``.
    top_n : number of leading results to keep.

    Returns
    -------
    list of ``(row_index, score)`` tuples, best score first, with the query
    row itself left out.
    """
    scores = linear_kernel(matrix[index: index + 1], matrix).flatten()

    ranked = []
    # argsort is ascending, so walk it backwards to visit the best matches first.
    for doc_idx in scores.argsort()[::-1]:
        if doc_idx == index:
            continue
        ranked.append((doc_idx, scores[doc_idx]))

    return ranked[0:top_n]

In [26]:
def getDf(subreddit_name, data_dir = "data"):
    """Load the scraped corpus of one subreddit from ``<data_dir>/<subreddit_name>.csv``.

    Parameters
    ----------
    subreddit_name : file stem of the CSV to read (no extension).
    data_dir : directory holding the scraped CSV files. Defaults to ``"data"``,
        the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame with one row per CSV record. The row count is also
    printed as a progress message.
    """
    import os  # local import: only needed here to build the path portably

    filePath = os.path.join(data_dir, subreddit_name + ".csv")
    # The file is decoded as ISO-8859-1, matching the original behaviour.
    df = pd.read_csv(filePath, encoding = "ISO-8859-1")
    print("--- Retrieved", len(df), "corpuses/corpi(?) for", subreddit_name)
    return df

In [27]:
def findMatches(your_username, subreddit_name, redditInstance):
    """Find the rows of a scraped subreddit corpus most similar to one user's comments.

    The user's comments are joined into a single document and placed in front
    of the corpus loaded by ``getDf``. Everything is vectorised with TF-IDF
    (1- to 3-grams), and the corpus rows are ranked against the user's document
    with ``find_similar``.

    Parameters
    ----------
    your_username : Reddit username whose comments form the query document.
    subreddit_name : name passed to ``getDf`` to load the corpus CSV.
    redditInstance : Reddit client handed to ``h.getUserComments``.

    Returns
    -------
    list of the first-column values (printed as usernames) of the best
    matching corpus rows, best match first.
    """
    corpusDf = getDf(subreddit_name)
    # One document per corpus row. str() keeps missing values as the literal
    # "nan" rather than letting a float reach the vectoriser.
    corpus = [str(comment) for comment in corpusDf["Comments"]]

    your_comments = h.getUserComments(your_username, redditInstance)
    # The query document goes first so it is always row 0 of the matrix.
    corpus.insert(0, ".".join(your_comments))

    print("--- Creating Tfidf vector...")

    myStopWords = text.ENGLISH_STOP_WORDS.union(stopwords.words('english'))

    # Recent scikit-learn releases validate constructor arguments: stop_words
    # must be a list (not a frozenset) and an integer min_df must be >= 1.
    # min_df = 1 keeps every term, exactly as min_df = 0 did.
    tf = TfidfVectorizer(analyzer = "word",
                         ngram_range = (1, 3),
                         min_df = 1,
                         stop_words = sorted(myStopWords))

    print("--- Fitting the matrix...")
    matrix = tf.fit_transform(corpus)
    results = []

    for match_idx, score in find_similar(matrix, 0):
        # The query document was prepended, so matrix row i is corpus row i - 1.
        row_idx = match_idx - 1
        user = corpusDf.iloc[row_idx, 0]
        results.append(user)
        print("...")
        print("...")
        print("...")
        print("Score:", score, "| Username:", user)
        print("=========================================================")

    return results

In [28]:
# Rank the rows of the first scraped subreddit against this user's comments.
matches = findMatches("ohai123456789", scraped_subreddits[0], r)

--- Retrieved 76 corpuses/corpi(?) for mizzou_copy
------ Retrieved 276 comments for: ohai123456789
--- Creating Tfidf vector...
--- Fitting the matrix...
...
...
...
Score: 0.206017355728 | Username: PrancingPeach
...
...
...
Score: 0.203337739501 | Username: KCTigerGrad
...
...
...
Score: 0.194409181535 | Username: BrettGilpin
...
...
...
Score: 0.184484603953 | Username: J_Tuck
...
...
...
Score: 0.173632401955 | Username: danwin


In [29]:
def getRedditorInfo(redditor_name, r, limit = 1000):
    """Fetch a redditor's top comments and return them as a DataFrame.

    Parameters
    ----------
    redditor_name : Reddit username to look up.
    r : Reddit client exposing ``redditor(name).comments.top(limit=...)``.
    limit : maximum number of comments requested from the listing. Defaults to
        1000, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame with one row per comment and the columns
    ``subreddit, comment, created_utc, score, ups, downs, controversiality,
    flair, gilded, over_18, link``. ``comment`` is the body after
    ``h.cleanText``; ``created_utc`` is a naive datetime in UTC. The frame has
    these columns even when no comments are returned.
    """
    from datetime import timezone  # local import: only `datetime` is imported at file level

    columns = ["subreddit", "comment", "created_utc", "score", "ups", "downs",
               "controversiality", "flair", "gilded", "over_18", "link"]

    user = r.redditor(redditor_name)

    records = []
    for c in user.comments.top(limit = limit):
        records.append({
            "subreddit": c.subreddit_name_prefixed,
            "comment": h.cleanText(c.body),
            # Same naive-UTC value datetime.utcfromtimestamp() produced, without
            # relying on that deprecated constructor.
            "created_utc": datetime.fromtimestamp(c.created_utc, tz = timezone.utc).replace(tzinfo = None),
            "score": c.score,
            "ups": c.ups,
            "downs": c.downs,
            "controversiality": c.controversiality,
            "flair": c.author_flair_text,
            "gilded": c.gilded,
            "over_18": c.over_18,
            "link": c.link_permalink,
        })

    # Passing `columns` fixes the column order and keeps the schema for an empty result.
    df = pd.DataFrame(records, columns = columns)

    print("Retrieved", len(df), "comments for user:", redditor_name)
    return df

In [136]:
# Pull the top comments of two of the matched users so they can be compared below.
your_comments = getRedditorInfo("KCTigerGrad", r)
other_comments = getRedditorInfo("PrancingPeach", r)

Retrieved 996 comments for user: KCTigerGrad
Retrieved 1000 comments for user: PrancingPeach


In [137]:
your_comments.head(10)

Unnamed: 0,subreddit,comment,created_utc,score,ups,downs,controversiality,flair,gilded,over_18,link
0,r/blackladies,Whats frustrating about the notion of white fo...,2017-11-08 22:29:14,93,93,0,0,,0,False,https://www.reddit.com/r/blackladies/comments/...
1,r/TwoXChromosomes,Of course my home state would propose somethin...,2016-11-17 18:43:22,86,86,0,0,,0,False,https://www.reddit.com/r/TwoXChromosomes/comme...
2,r/AskReddit,He came to my campus and did the same thing,2017-08-14 02:30:37,65,65,0,0,,0,True,https://www.reddit.com/r/AskReddit/comments/6t...
3,r/relationships,If shes blocked you Im thinking she knows abou...,2016-10-06 04:25:13,57,57,0,0,,0,False,https://www.reddit.com/r/relationships/comment...
4,r/blackladies,I didnt actually see this persons comment but ...,2016-11-13 21:38:11,38,38,0,0,,0,False,https://www.reddit.com/r/blackladies/comments/...
5,r/TwoXChromosomes,People banging on about how bad Trump is real...,2016-11-17 21:17:32,39,39,0,0,,0,False,https://www.reddit.com/r/TwoXChromosomes/comme...
6,r/blackladies,Couldnt agree more Clearly they white women si...,2016-11-13 21:17:19,35,35,0,0,,0,False,https://www.reddit.com/r/blackladies/comments/...
7,r/jobs,Im so sorry this is happening to you K ow its ...,2017-01-02 02:54:50,32,32,0,0,,0,False,https://www.reddit.com/r/jobs/comments/5lhuge/...
8,r/kansascity,facepalm Thanks,2016-07-19 16:17:59,28,28,0,0,,0,False,https://www.reddit.com/r/kansascity/comments/4...
9,r/blackladies,The questions in the interview were very leadi...,2015-08-17 02:35:53,27,27,0,0,,0,False,https://www.reddit.com/r/blackladies/comments/...


In [138]:
def commonSubreddits(df1, df2):
    """Return the subreddits that occur in both comment frames.

    Both frames need ``subreddit`` and ``comment`` columns. Each is reduced to
    per-subreddit comment counts sorted from most to fewest comments, and the
    two summaries are inner-joined on ``subreddit``.

    Returns
    -------
    numpy.ndarray holding the ``subreddit`` column of the joined frame.
    """
    def _comment_counts(frame):
        # Per-subreddit comment counts, busiest subreddit first.
        per_sub = frame.groupby(["subreddit"])[['comment']].count()
        return per_sub.reset_index().sort_values(["comment"], ascending = False)

    left_counts = _comment_counts(df1)
    right_counts = _comment_counts(df2)

    shared = left_counts.merge(right_counts, on = "subreddit", how = "inner")
    return np.array(shared["subreddit"])

In [139]:
def commonSubredditCounts(df1, df2):
    """Count each user's comments in the subreddits both users have posted to.

    Both frames are first restricted to the subreddits reported by
    ``commonSubreddits``. Each is then summarised into a
    ``subreddit`` / ``comment`` count table sorted by descending count with a
    fresh 0..n-1 index.

    Returns
    -------
    tuple ``(df1Counts, df2Counts)`` of the two summary frames.
    """
    shared = commonSubreddits(df1, df2)

    def _ranked_counts(frame):
        # Keep only rows in shared subreddits, then count and rank them.
        in_both = frame[frame["subreddit"].isin(shared)]
        counts = in_both.groupby(["subreddit"])[['comment']].count().reset_index()
        ordered = counts.sort_values(["comment"], ascending = False)
        return ordered.reset_index(drop = True)

    return (_ranked_counts(df1), _ranked_counts(df2))

In [140]:
# Per-subreddit comment counts for each user, limited to subreddits both have posted in.
one, two = commonSubredditCounts(your_comments, other_comments)

In [141]:
one.head(10)

Unnamed: 0,subreddit,comment
0,r/mizzou,207
1,r/relationships,59
2,r/AskReddit,55
3,r/personalfinance,22
4,r/IAmA,21
5,r/jobs,16
6,r/politics,14
7,r/movies,5
8,r/explainlikeimfive,1
9,r/television,1


In [142]:
two.head(10)

Unnamed: 0,subreddit,comment
0,r/personalfinance,49
1,r/AskReddit,38
2,r/relationships,28
3,r/jobs,22
4,r/movies,10
5,r/IAmA,8
6,r/politics,4
7,r/mizzou,2
8,r/explainlikeimfive,1
9,r/television,1


In [143]:
def getFlairs(df):
    """Collect the distinct flair values of a comment frame as a set.

    Rows whose ``flair`` is null or the empty string are ignored.
    """
    flair_col = df["flair"]
    has_flair = flair_col.notnull() & (flair_col != "")
    return set(np.unique(np.array(flair_col[has_flair])))

In [144]:
getFlairs(your_comments)

{'24 F 5\'5" | SW: 272 CW: 235.4 GW: 150', 'Social Media Strategist'}

In [145]:
getFlairs(other_comments)

{'1∆'}

In [146]:
def top_tfidf_feats(row, features, top_n = 25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    row      : 1-D array of tf-idf weights, one per feature.
    features : sequence of feature names, indexable by the positions of `row`.
    top_n    : number of highest-weighted features to keep.

    Returns a DataFrame with columns 'feature' and 'tfidf', highest weight
    first. The frame has these two columns even when nothing is selected
    (empty row or top_n = 0).
    '''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: assigning df.columns afterwards
    # raises a length-mismatch ValueError when top_feats is empty.
    return pd.DataFrame(top_feats, columns = ['feature', 'tfidf'])

In [147]:
def top_feats_in_doc(Xtr, features, row_id, top_n = 25):
    ''' Top tfidf features of a single document.

    Row `row_id` of the sparse matrix `Xtr` is densified, flattened and handed
    to top_tfidf_feats together with the feature names.
    '''
    dense_row = Xtr[row_id].toarray()
    doc_weights = np.squeeze(dense_row)
    return top_tfidf_feats(doc_weights, features, top_n)

In [148]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

    Xtr       : sparse tf-idf matrix (documents x features).
    features  : feature names matching the columns of Xtr.
    grp_ids   : row indices to average over (list or NumPy array). None or an
                empty selection means "use every document".
    min_tfidf : weights below this threshold are zeroed before averaging.
    top_n     : number of features to return.

    Returns the DataFrame produced by top_tfidf_feats for the per-feature means.
    '''
    # Test for "no selection" explicitly. `if grp_ids:` raises on a NumPy
    # index array because its truth value is ambiguous.
    if grp_ids is None or np.size(grp_ids) == 0:
        D = Xtr.toarray()
    else:
        D = Xtr[grp_ids].toarray()

    # toarray() returned a fresh dense copy, so thresholding in place is safe.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [149]:
def getTopFeatures(your_username, redditInstance):
    """Return the n-gram features with the highest mean TF-IDF weight for one user.

    The user's comments (from ``h.getUserComments``) are treated as separate
    documents and vectorised with 2- to 4-gram TF-IDF. Stop words are the
    union of the scikit-learn and NLTK English lists. The result is ranked
    with ``top_mean_feats`` using its default threshold and size.

    Parameters
    ----------
    your_username : Reddit username whose comments are analysed.
    redditInstance : Reddit client handed to ``h.getUserComments``.

    Returns
    -------
    pandas.DataFrame as produced by ``top_mean_feats``.
    """
    your_corpus = h.getUserComments(your_username, redditInstance)

    myStopWords = text.ENGLISH_STOP_WORDS.union(stopwords.words('english'))

    # Recent scikit-learn releases validate constructor arguments: stop_words
    # must be a list (not a frozenset) and an integer min_df must be >= 1.
    # min_df = 1 keeps every term, exactly as min_df = 0 did.
    tf1 = TfidfVectorizer(analyzer = "word",
                          ngram_range = (2, 4),
                          min_df = 1,
                          stop_words = sorted(myStopWords))

    matrix1 = tf1.fit_transform(your_corpus)

    # get_feature_names() was removed from scikit-learn; prefer the replacement
    # and fall back only on releases that predate it.
    if hasattr(tf1, "get_feature_names_out"):
        features1 = tf1.get_feature_names_out()
    else:
        features1 = tf1.get_feature_names()

    yourTopOverallFeatures = top_mean_feats(matrix1, features1)

    return yourTopOverallFeatures

In [153]:
# Highest mean-TF-IDF n-grams across this user's comments.
top = getTopFeatures("KCTigerGrad", r)
top

------ Retrieved 983 comments for: KCTigerGrad


Unnamed: 0,feature,tfidf
0,awesome thanks,0.003323
1,good idea,0.002156
2,makes sense,0.001886
3,super helpful,0.001785
4,helpful thank,0.001607
5,black people,0.001569
6,ill check,0.001484
7,glad im,0.001417
8,awesome thank,0.001339
9,cool thanks,0.001214


In [155]:
your_comments.head(5)

Unnamed: 0,subreddit,comment,created_utc,score,ups,downs,controversiality,flair,gilded,over_18,link
0,r/blackladies,Whats frustrating about the notion of white fo...,2017-11-08 22:29:14,93,93,0,0,,0,False,https://www.reddit.com/r/blackladies/comments/...
1,r/TwoXChromosomes,Of course my home state would propose somethin...,2016-11-17 18:43:22,86,86,0,0,,0,False,https://www.reddit.com/r/TwoXChromosomes/comme...
2,r/AskReddit,He came to my campus and did the same thing,2017-08-14 02:30:37,65,65,0,0,,0,True,https://www.reddit.com/r/AskReddit/comments/6t...
3,r/relationships,If shes blocked you Im thinking she knows abou...,2016-10-06 04:25:13,57,57,0,0,,0,False,https://www.reddit.com/r/relationships/comment...
4,r/blackladies,I didnt actually see this persons comment but ...,2016-11-13 21:38:11,38,38,0,0,,0,False,https://www.reddit.com/r/blackladies/comments/...


In [163]:
def getSubredditsPosted(comments):
    """Return the sorted, de-duplicated ``subreddit`` values of a comment frame as an array."""
    return np.unique(np.array(comments["subreddit"]))

In [164]:
getSubredditsPosted(your_comments)

array(['r/90DayFiance', 'r/AskHR', 'r/AskReddit', 'r/BlackHair',
       'r/Blackfellas', 'r/Blogging', 'r/Candles', 'r/Cardinals',
       'r/CelebrityPenis', 'r/FashionPlus', 'r/Feminism',
       'r/FrugalFemaleFashion', 'r/Hulu', 'r/IAmA', 'r/IKEA',
       'r/Long_Distance', 'r/NashvilleTN', 'r/Naturalhair', 'r/PlusSize',
       'r/StLouis', 'r/StudentLoans', 'r/TwoXChromosomes',
       'r/WriterMotivation', 'r/advertising', 'r/blackladies',
       'r/blackpower', 'r/brownbeauty', 'r/chicago', 'r/columbiamo',
       'r/depression', 'r/explainlikeimfive', 'r/findfashion',
       'r/freelance', 'r/freelanceWriters', 'r/freelance_forhire',
       'r/incest_relationships', 'r/indianapolis', 'r/indyjobs', 'r/jobs',
       'r/kansascity', 'r/loseit', 'r/mentalhealth', 'r/mizzou',
       'r/movies', 'r/nanoafternano', 'r/nanowrimo', 'r/nashville',
       'r/offmychest', 'r/personalfinance', 'r/politics', 'r/publishing',
       'r/racism', 'r/relationships', 'r/sex', 'r/television',
       'r

In [165]:
getSubredditsPosted(other_comments)

array(['r/AdviceAnimals', 'r/Android', 'r/AsABlackMan', 'r/AskFeminists',
       'r/AskReddit', 'r/BasicIncome', 'r/DarkSouls2', 'r/Diablo',
       'r/EnoughTrumpSpam', 'r/Fitness', 'r/Games', 'r/IAmA',
       'r/JusticePorn', 'r/MachineLearning', 'r/MorbidReality',
       'r/NintendoSwitch', 'r/OopsDidntMeanTo', 'r/Showerthoughts',
       'r/Steam', 'r/The_Donald', 'r/TopMindsOfReddit', 'r/Vive',
       'r/Watches', 'r/apple', 'r/appletv', 'r/atheism', 'r/aww',
       'r/bloodborne', 'r/cars', 'r/changemyview', 'r/childfree',
       'r/creepy', 'r/cringe', 'r/cringepics', 'r/cscareerquestions',
       'r/darksouls', 'r/dataisbeautiful', 'r/demonssouls',
       'r/explainlikeimfive', 'r/forwardsfromgrandma', 'r/funny',
       'r/gameofthrones', 'r/gaming', 'r/gonewild', 'r/haskell',
       'r/houston', 'r/iamverybadass', 'r/iamverysmart', 'r/instantkarma',
       'r/interstellar', 'r/jobs', 'r/math', 'r/mizzou', 'r/moto360',
       'r/movies', 'r/news', 'r/nintendo', 'r/oculus', 'r/pat

In [169]:
your_comments.head()

Unnamed: 0,subreddit,comment,created_utc,score,ups,downs,controversiality,flair,gilded,over_18,link
0,r/blackladies,Whats frustrating about the notion of white fo...,2017-11-08 22:29:14,93,93,0,0,,0,False,https://www.reddit.com/r/blackladies/comments/...
1,r/TwoXChromosomes,Of course my home state would propose somethin...,2016-11-17 18:43:22,86,86,0,0,,0,False,https://www.reddit.com/r/TwoXChromosomes/comme...
2,r/AskReddit,He came to my campus and did the same thing,2017-08-14 02:30:37,65,65,0,0,,0,True,https://www.reddit.com/r/AskReddit/comments/6t...
3,r/relationships,If shes blocked you Im thinking she knows abou...,2016-10-06 04:25:13,57,57,0,0,,0,False,https://www.reddit.com/r/relationships/comment...
4,r/blackladies,I didnt actually see this persons comment but ...,2016-11-13 21:38:11,38,38,0,0,,0,False,https://www.reddit.com/r/blackladies/comments/...


In [212]:
def getSentiment(username, r):
    """Score the sentiment of every comment fetched for a user.

    Comments come from ``getRedditorInfo`` and are sorted by ``created_utc``
    (oldest first). Each comment is scored with NLTK's
    ``SentimentIntensityAnalyzer``.

    Parameters
    ----------
    username : Reddit username to analyse.
    r : Reddit client passed through to ``getRedditorInfo``.

    Returns
    -------
    pandas.DataFrame: the comment frame with four extra columns, ``negative``,
    ``neutral``, ``positive`` and ``compound``. Returns ``None``, after
    printing a message, when the user has no comments.
    """
    comments = getRedditorInfo(username, r)

    if (len(comments) < 1):
        print("No comments for that user")
        return None

    comments = comments.sort_values("created_utc", ascending = True)
    sid = SentimentIntensityAnalyzer()

    # Score each comment once and reuse the result for all four columns,
    # instead of re-running the analyser for every column.
    scores = [sid.polarity_scores(body) for body in comments["comment"]]

    # Lists are assigned positionally, which matches the row order the scores
    # were computed in (the sorted frame keeps its original, shuffled index).
    score_columns = (("negative", "neg"), ("neutral", "neu"),
                     ("positive", "pos"), ("compound", "compound"))
    for column, key in score_columns:
        comments[column] = [s[key] for s in scores]

    return comments

In [213]:
# Sentiment-scored comments for one user, oldest first; preview the first rows.
x = getSentiment("ohai123456789", r)
x.head()

Retrieved 276 comments for user: ohai123456789


Unnamed: 0,subreddit,comment,created_utc,score,ups,downs,controversiality,flair,gilded,over_18,link,negative,neutral,positive,compound
260,r/AskReddit,Love to hear some stories,2012-08-21 02:09:18,1,1,0,0,,0,False,https://www.reddit.com/r/AskReddit/comments/yk...,0.0,0.488,0.512,0.6369
259,r/AskReddit,Yep im putting the mark at 100k salary Its bee...,2012-08-21 03:41:44,1,1,0,0,,0,False,https://www.reddit.com/r/AskReddit/comments/yk...,0.0,0.711,0.289,0.9398
99,r/AskReddit,Haha idiot here with the massive student loans...,2012-08-21 03:46:33,2,2,0,0,,0,False,https://www.reddit.com/r/AskReddit/comments/yk...,0.106,0.721,0.173,0.6145
98,r/consulting,Anybody,2012-08-21 03:48:08,2,2,0,0,,0,False,https://www.reddit.com/r/consulting/comments/y...,0.0,1.0,0.0,0.0
258,r/consulting,They do share similar frameworks as management...,2012-08-21 17:05:45,1,1,0,0,,0,False,https://www.reddit.com/r/consulting/comments/y...,0.04,0.852,0.107,0.6945


In [None]:
# plot sentiment by subreddit (faceted)

In [None]:
# sentiment analysis for comments on a particular subreddit over time

In [None]:
# classification of flairs

In [None]:
# what makes a comment get a lot of upvotes?