In [179]:
import pandas as pd
import numpy as np
import pprint as pp
import re, string
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import praw
from datetime import datetime

import credentials as creds

## import custom helper functions
import helpers as h

In [180]:
# https://praw.readthedocs.io/en/latest/getting_started/
# https://www.reddit.com/dev/api/

In [181]:
# Read the Reddit API credentials from the local `credentials` module
# (imported as `creds`) rather than hard-coding them in the notebook.
CLIENT_ID = creds.client_id()
CLIENT_SECRET_KEY = creds.client_secret_key()


# PRAW client shared by the cells below. Only an application id/secret is
# passed (no user login); the recorded output of the print below is True,
# i.e. the client is read-only.
r = praw.Reddit(client_id = CLIENT_ID,
                client_secret = CLIENT_SECRET_KEY,
                user_agent = 'RedditorMatch')

# Show whether the client is in read-only mode.
print(r.read_only)

True


In [182]:
# Subreddit names used as corpus keys; getDf() reads "data/<name>.csv" for
# the name it is given (presumably one scraped CSV exists per entry -- verify).
scraped_subreddits = ["mizzou", "umich"]

In [183]:
def find_similar(matrix, index, top_n = 5):
    """Rank the other rows of ``matrix`` by similarity to row ``index``.

    Scores are the linear-kernel (dot-product) values between row ``index``
    and every row of ``matrix``; they are cosine similarities only when the
    rows are L2-normalised.

    Args:
        matrix: 2-D feature matrix, one row per document.
        index: Row number of the query document; it is left out of the result.
        top_n: Number of leading matches to keep.

    Returns:
        A list of ``(row_number, score)`` tuples, highest score first.
    """
    scores = linear_kernel(matrix[index: index + 1], matrix).flatten()

    ranked = []
    # argsort is ascending, so walk it backwards to get the best matches first.
    for doc_id in scores.argsort()[::-1]:
        if doc_id == index:
            continue
        ranked.append((doc_id, scores[doc_id]))
    return ranked[0:top_n]

In [184]:
def getDf(subreddit_name):
    """Load the comment corpus CSV of one subreddit.

    Reads ``data/<subreddit_name>.csv`` (decoded as ISO-8859-1), prints how
    many rows were loaded and returns the resulting DataFrame.
    """
    corpus_frame = pd.read_csv("".join(["data/", subreddit_name, ".csv"]),
                               encoding = "ISO-8859-1")
    print("--- Retrieved", len(corpus_frame), "corpuses/corpi(?) for", subreddit_name)
    return corpus_frame

In [186]:
def findMatches(your_username, subreddit_name, redditInstance):
    """Find the users in a subreddit corpus whose comments resemble yours.

    The corpus for ``subreddit_name`` is loaded with ``getDf``; the comments of
    ``your_username`` are fetched with ``h.getUserComments`` and joined with
    "." into a single document. All documents are TF-IDF vectorised (word
    1- to 3-grams, English stop words removed) and the corpus rows are ranked
    against your document with ``find_similar``.

    Args:
        your_username: Reddit username whose comments form the query document.
        subreddit_name: Corpus key passed to ``getDf``.
        redditInstance: Reddit client handed through to ``h.getUserComments``.

    Returns:
        A list with the first-column value of each matching corpus row
        (assumed to be the username -- TODO confirm against the CSV layout),
        best match first. Each match is also printed with its score.
    """
    corpusDf = getDf(subreddit_name)
    # str() every cell so non-string values (e.g. NaN) become text.
    corpus = [str(text) for text in corpusDf["Comments"]]

    your_comments = h.getUserComments(your_username, redditInstance)
    your_comments = ".".join(your_comments)

    # Row 0 of the TF-IDF matrix is the querying user; corpus rows follow.
    corpus.insert(0, your_comments)

    print("--- Creating Tfidf vector...")

    # min_df = 1 keeps every term, exactly like the former min_df = 0, but is
    # also accepted by scikit-learn versions that validate the parameter.
    tf = TfidfVectorizer(analyzer = "word",
                            ngram_range = (1, 3),
                            min_df = 1,
                            stop_words = "english")

    print("--- Fitting the matrix...")
    matrix = tf.fit_transform(corpus)
    results = []

    for matrix_row, score in find_similar(matrix, 0):
        # Our own document was prepended, so matrix row i is corpus row i - 1.
        corpus_row = matrix_row - 1
        user = corpusDf.iloc[corpus_row, 0]
        results.append(user)
        print("...")
        print("...")
        print("...")
        print("Score:", score, "| Username:", user)
        print("=========================================================")

    return(results)

In [187]:
# Rank the users in the first configured corpus ("mizzou") against this user.
matches = findMatches("ohai123456789", scraped_subreddits[0], r)

--- Retrieved 288 corpuses/corpi(?) for mizzou
------ Retrieved 276 comments for: ohai123456789
--- Creating Tfidf vector...
--- Fitting the matrix...
...
...
...
Score: 0.16165903374 | Username: PrancingPeach
...
...
...
Score: 0.155631681852 | Username: KCTigerGrad
...
...
...
Score: 0.150467276104 | Username: BrettGilpin
...
...
...
Score: 0.149966970369 | Username: mycarebeardontcare
...
...
...
Score: 0.149200442551 | Username: SexyMcBeast


In [29]:
def getRedditorInfo(redditor_name, r):
    """Collect a redditor's top comments, with metadata, into a DataFrame.

    Args:
        redditor_name: Reddit username to look up.
        r: Reddit client exposing ``redditor(name).comments.top(limit=...)``.

    Returns:
        A DataFrame with one row per comment (at most 1000 are requested) and
        the columns ``subreddit``, ``comment``, ``created_utc``, ``score``,
        ``ups``, ``downs``, ``controversiality``, ``flair``, ``gilded``,
        ``over_18`` and ``link``. ``comment`` is the body passed through
        ``h.cleanText``; ``created_utc`` is a naive ``datetime`` built from the
        comment's UTC timestamp. The row count is also printed.
    """
    columns = ["subreddit", "comment", "created_utc", "score", "ups", "downs",
               "controversiality", "flair", "gilded", "over_18", "link"]

    user = r.redditor(redditor_name)

    # One record per comment, turned into a frame in a single step below.
    records = []
    for c in user.comments.top(limit = 1000):
        records.append({
            "subreddit": c.subreddit_name_prefixed,
            "comment": h.cleanText(c.body),
            "created_utc": datetime.utcfromtimestamp(c.created_utc),
            "score": c.score,
            "ups": c.ups,
            "downs": c.downs,
            "controversiality": c.controversiality,
            "flair": c.author_flair_text,
            "gilded": c.gilded,
            "over_18": c.over_18,
            "link": c.link_permalink,
        })

    # Passing the column list keeps the layout stable even with no comments.
    df = pd.DataFrame(records, columns = columns)

    print("Retrieved", len(df), "comments for user:", redditor_name)
    return(df)

In [188]:
# Download comment metadata for the query user and for one matched user
# (the highest-scoring username printed by findMatches above).
your_comments = getRedditorInfo("ohai123456789", r)
other_comments = getRedditorInfo("PrancingPeach", r)

Retrieved 276 comments for user: ohai123456789
Retrieved 1000 comments for user: PrancingPeach


In [72]:
def commonSubreddits(df1, df2):
    """Return the subreddits that appear in both comment frames.

    Both frames need ``subreddit`` and ``comment`` columns. Each is reduced to
    per-subreddit counts of non-null comments, sorted by that count in
    descending order, and the two results are inner-joined on ``subreddit``.

    Returns:
        A numpy array holding the ``subreddit`` column of the joined frame.
    """
    def _ranked_counts(frame):
        # Non-null comment count per subreddit, busiest subreddit first.
        per_subreddit = frame.groupby(["subreddit"])[["comment"]].count()
        return per_subreddit.reset_index().sort_values(["comment"], ascending = False)

    left = _ranked_counts(df1)
    right = _ranked_counts(df2)

    shared = left.merge(right, on = "subreddit", how = "inner")
    return np.array(shared["subreddit"])

In [79]:
def commonSubredditCounts(df1, df2):
    """Count each frame's comments within the subreddits both frames share.

    The shared subreddits come from ``commonSubreddits(df1, df2)``. Each frame
    is restricted to them and reduced to per-subreddit counts of non-null
    comments, sorted by count in descending order with a fresh 0..n-1 index.

    Returns:
        A tuple ``(df1Counts, df2Counts)``; each frame has the columns
        ``subreddit`` and ``comment`` (the count). The two frames are sorted
        independently, so their rows are not aligned with each other.
    """
    shared = commonSubreddits(df1, df2)

    def _tally(frame):
        kept = frame[frame["subreddit"].isin(shared)]
        tallied = kept.groupby(["subreddit"])[["comment"]].count().reset_index()
        ordered = tallied.sort_values(["comment"], ascending = False)
        return ordered.reset_index(drop = True)

    return(_tally(df1), _tally(df2))

In [80]:
# Per-subreddit comment counts for each user, limited to shared subreddits.
# NOTE(review): this cell's execution count (80) is lower than that of the
# cell defining your_comments/other_comments (188), so the recorded tables
# below may stem from different inputs -- re-run top to bottom to confirm.
one, two = commonSubredditCounts(your_comments, other_comments)

In [83]:
# Ten busiest shared subreddits for the first frame (your_comments).
one.head(10)

Unnamed: 0,subreddit,comment
0,r/AskReddit,316
1,r/gaming,63
2,r/IAmA,29
3,r/movies,27
4,r/todayilearned,17
5,r/funny,14
6,r/videos,14
7,r/pics,14
8,r/worldnews,14
9,r/politics,13


In [84]:
# Ten busiest shared subreddits for the second frame (other_comments).
two.head(10)

Unnamed: 0,subreddit,comment
0,r/gaming,63
1,r/todayilearned,62
2,r/Games,59
3,r/personalfinance,49
4,r/AskReddit,38
5,r/programming,28
6,r/worldnews,27
7,r/pics,17
8,r/self,15
9,r/funny,13


In [173]:
def getFlairs(df):
    """Return the distinct flairs found in ``df["flair"]``.

    Missing values (None/NaN) and empty strings are ignored.

    Args:
        df: DataFrame with a ``flair`` column.

    Returns:
        A set of the remaining flair values (empty if there are none).
    """
    flairs = df["flair"].dropna()
    # set() already removes duplicates, so no separate unique/sort pass is needed.
    return set(flairs[flairs != ""])

In [174]:
# Distinct non-empty flairs attached to the comments in your_comments.
getFlairs(your_comments)

{'Arizona Wildcats', 'Arsenal'}

In [175]:
# Distinct non-empty flairs attached to the comments in other_comments.
getFlairs(other_comments)

{'1∆'}

In [206]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    row      -- 1-D array of tfidf weights, one per feature
    features -- feature names, indexable by the positions of row
    top_n    -- how many of the highest-weighted features to keep

    Returns a DataFrame with the columns 'feature' and 'tfidf', highest weight
    first. The frame has those columns even when nothing is selected
    (top_n=0 or an empty row).
    '''
    # argsort is ascending: reverse it, then keep the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor; assigning df.columns afterwards
    # raises on an empty selection because the frame then has no columns.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

In [207]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row)

    Xtr      -- sparse document-term matrix (row selection must support .toarray())
    features -- feature names, one per column of Xtr
    row_id   -- index of the document row to inspect
    top_n    -- number of features to return

    Returns whatever top_tfidf_feats returns for that row.
    '''
    # ravel() always yields a 1-D vector; np.squeeze would turn a matrix with
    # a single column into a 0-d array that cannot be indexed by position.
    row = Xtr[row_id].toarray().ravel()
    return top_tfidf_feats(row, features, top_n)

In [218]:
def getMatchedFeatures(your_username, other_username, redditInstance):
    """Compare the highest-weighted TF-IDF terms of two redditors.

    Each user's comments are fetched with ``h.getUserComments`` and joined with
    "." into one document. A TF-IDF model (word 1- to 3-grams, English stop
    words removed) is fitted on just these two documents.

    Args:
        your_username: First Reddit username (row 0 of the matrix).
        other_username: Second Reddit username (row 1 of the matrix).
        redditInstance: Reddit client handed through to ``h.getUserComments``.

    Returns:
        A tuple ``(yourTopFeatures, otherTopFeatures)`` of DataFrames with the
        columns ``feature`` and ``tfidf``, sorted by ``tfidf`` descending, as
        produced by ``top_feats_in_doc`` with its default ``top_n``.
    """
    # One document per user, in the order (you, other).
    comments = [
        ".".join(h.getUserComments(username, redditInstance))
        for username in (your_username, other_username)
    ]

    # min_df = 1 keeps every term, exactly like the former min_df = 0, but is
    # also accepted by scikit-learn versions that validate the parameter.
    tf = TfidfVectorizer(analyzer = "word",
                        ngram_range = (1, 3),
                        min_df = 1,
                        stop_words = "english")

    matrix = tf.fit_transform(comments)

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # use its replacement when available and fall back for older installs.
    if hasattr(tf, "get_feature_names_out"):
        features = tf.get_feature_names_out()
    else:
        features = tf.get_feature_names()

    yourTopFeatures = top_feats_in_doc(matrix, features, 0).sort_values("tfidf", ascending = False)
    otherTopFeatures = top_feats_in_doc(matrix, features, 1).sort_values("tfidf", ascending = False)

    return(yourTopFeatures, otherTopFeatures)

In [219]:
# Fit a two-document TF-IDF model on these users' comments and keep the
# top-weighted terms of each.
yourTopFeatures, otherTopFeatures = getMatchedFeatures("wingzeromkii", "PrancingPeach", r)

------ Retrieved 1079 comments for: wingzeromkii
------ Retrieved 1931 comments for: PrancingPeach


In [222]:
# getMatchedFeatures relies on the default top_n=25, so head(50) simply
# shows every returned row.
yourTopFeatures.head(50)

Unnamed: 0,feature,tfidf
0,like,0.262544
1,just,0.253792
2,im,0.208285
3,pretty,0.190782
4,time,0.187281
5,game,0.183781
6,think,0.141774
7,dont,0.131272
8,people,0.126021
9,really,0.115519


In [223]:
# getMatchedFeatures relies on the default top_n=25, so head(50) simply
# shows every returned row.
otherTopFeatures.head(50)

Unnamed: 0,feature,tfidf
0,just,0.360814
1,like,0.244423
2,think,0.241737
3,people,0.239498
4,dont,0.236812
5,really,0.188465
6,im,0.16653
7,game,0.155338
8,youre,0.149519
9,actually,0.145937


In [39]:
def whyMatch(you, other, r):
    """Placeholder for explaining why two redditors were matched.

    Not implemented yet: the arguments are ignored and None is returned.

    Planned steps:
      * find top 10 subreddits
      * word cloud the comments (overall)
      * word cloud the comments (common subreddits)
      * flairs for common subreddits
    """
    return None

In [None]:
# sentiment analysis for comments on a particular subreddit over time

In [None]:
# classification of flairs

In [None]:
# what makes a comment get a lot of upvotes?