In [19]:
import numpy as np
import pandas as pd
import re 

# ASCII Unit Separator (0x1f). Used as the CSV delimiter so that commas, tabs, etc.
# inside a comment body never collide with the column separator.
sep = chr(0x1f)

# Folder the dataset CSVs live in; shared default for save_datasets / read_datasets.
DATA_DIR = '../client/files'


def save_datasets(post_df, com_df, name, index=False, header=True, sep=sep, base_dir=DATA_DIR):
    """Write the posts and comments frames to `<base_dir>/posts_<name>.csv` and
    `<base_dir>/comments_<name>.csv`.

    Parameters
    ----------
    post_df, com_df : pandas.DataFrame
        Frames to persist (posts and comments respectively).
    name : str
        Dataset label inserted into both file names.
    index, header : bool
        Forwarded to `DataFrame.to_csv`.
    sep : str
        Column delimiter; defaults to the module-level unit separator.
    base_dir : str
        Target directory; defaults to `DATA_DIR`. It must already exist.
    """
    post_df.to_csv(f'{base_dir}/posts_{name}.csv', index=index, header=header, sep=sep)
    com_df.to_csv(f'{base_dir}/comments_{name}.csv', index=index, header=header, sep=sep)


def read_datasets(name, sep=sep, base_dir=DATA_DIR):
    """Load the posts/comments CSV pair written by `save_datasets`.

    Parameters
    ----------
    name : str
        Dataset label used in the file names.
    sep : str
        Column delimiter; defaults to the same unit separator used when saving.
    base_dir : str
        Directory to read from; defaults to `DATA_DIR`.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The posts frame followed by the comments frame.
    """
    posts_df = pd.read_csv(f'{base_dir}/posts_{name}.csv', sep=sep)
    comments_df = pd.read_csv(f'{base_dir}/comments_{name}.csv', sep=sep)
    return posts_df, comments_df

In [20]:
# Load the "small" posts/comments CSV pair
posts, comments = read_datasets(name="small")

In [21]:
# Quick look at what was loaded: (rows, columns), then the first three posts
print(posts.shape)
posts.head(3)

(550000, 12)


Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,domain,url,selftext,title,score
0,post,dx88v,2s5ti,meirl,False,1288195710,https://old.reddit.com/r/meirl/comments/dx88v/...,reddit.com,,[deleted],"Hey reddit, so I had this idea for an app that...",2
1,post,1258am,2vegg,me_irl,False,1351287079,https://old.reddit.com/r/me_irl/comments/1258a...,i.imgur.com,http://i.imgur.com/Df6K0.gif,,me irl,10
2,post,1258bl,2vegg,me_irl,False,1351287106,https://old.reddit.com/r/me_irl/comments/1258b...,i.imgur.com,http://i.imgur.com/0KNKS.gif,,me irl,10


In [22]:
# Order both frames by creation time (oldest first) so that the positional
# slices taken afterwards correspond to time windows.
posts_sorted = posts.sort_values(by='created_utc')
comments_sorted = comments.sort_values(by='created_utc')

In [23]:
# Row window (start offset + size) taken from each time-sorted frame.
# Both sizes exceed the number of rows in the dataset loaded here, so the
# slices below keep every row.
posts_start, comments_start = 0, 0
posts_size = 550_000_000_000
comments_size = 200_000_000_000
posts_end = posts_start + posts_size
comments_end = comments_start + comments_size

# Positional slices; .copy() detaches the result from the sorted frames.
posts_target = posts_sorted.iloc[posts_start:posts_end].copy()
comments_target = comments_sorted.iloc[comments_start:comments_end].copy()
print(posts_target.shape)
print(comments_target.shape)

(550000, 12)
(2000000, 10)


In [24]:
# Re-print the shapes of the selected slices (same check as at the end of the previous cell)
print(posts_target.shape)
print(comments_target.shape)

(550000, 12)
(2000000, 10)


In [25]:
# Disabled: uncomment to write the current slices to posts_full.csv / comments_full.csv
# (to_csv overwrites existing files of the same name).
#save_datasets(posts_target, comments_target, "full")

In [26]:
# Keep only the columns the analysis reads
posts_columns = ['id', 'url', 'score']
comment_columns = ['permalink', 'sentiment', 'body']
posts_target = posts_target.loc[:, posts_columns]
comments_target = comments_target.loc[:, comment_columns]
print(posts_target.shape)
print(comments_target.shape)

(550000, 3)
(2000000, 3)


In [27]:
# Create filter functions for both dataframes

# Post id: letters and digits only. The class is spelled with explicit a-z / A-Z
# ranges; a single `A-z` range would also admit the ASCII characters that sit
# between 'Z' and 'a' ([ \ ] ^ _ and the backtick).
post_id_reg = re.compile('^[a-zA-Z0-9]+$') # Allow any alphanumeric character
# URL: optional http/https, then "://", optional "www.", a dotted host and an optional
# path/query tail. Only the start is anchored when used with `.match`.
# NOTE(review): the scheme group is optional, so a value starting with "://" also passes — confirm intended.
meme_url_reg = re.compile(r"((https)|(http))?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
post_score_reg = re.compile("^-?[0-9]+$") # Allow whole numbers

def matches(regex, value):
    """Return True when `regex` matches at the beginning of `value`, else False.

    `regex` is a compiled pattern; `value` is the string to test.
    """
    found = regex.match(value)
    return found is not None

def filter_posts(row):
    """Return True when a post row passes the id, url and score patterns.

    `row` is indexed by the keys 'id', 'url' and 'score'; every value is
    converted to text first, and the url is lower-cased before matching.
    """
    checks = (
        (post_id_reg, str(row['id'])),
        (meme_url_reg, str(row['url']).lower()),
        (post_score_reg, str(row['score'])),
    )
    return all(matches(pattern, text) for pattern, text in checks)


# Sentiment: optional sign, then either "1" (optionally "1.0", "1.00", ...) or "0." followed
# by one or more digits. A bare "0" without a fractional part is not accepted.
sentiment_r = re.compile(r'^[+-]?(1(\.0+)?|(0\.[0-9]+))$')
# Body: must contain at least one non-whitespace character.
# NOTE(review): the pattern is applied with `.match` and without re.DOTALL, so that character
# has to be reachable before the first newline; a body whose first line is blank is rejected —
# confirm that is intended.
body_r = re.compile(r"(.*)?\S+(.*)?") # Require some non-whitespace text
# Permalink: must start with https://old.reddit.com/r/me_irl/comments/<segment>/ (or the
# "meirl" equivalent), where <segment> is non-empty and contains no "/".
permalink_r = re.compile(r'https://old\.reddit\.com/r/((\bme_irl\b)|(\bmeirl\b))/comments/([^/]+)/.*')

def filter_comments(row):
    """Return True when a comment row is usable for the analysis.

    A row is kept when its sentiment, permalink and body each match their
    pattern and the body is not one of the placeholders "[deleted]" / "[removed]".
    `row` is indexed by the keys 'sentiment', 'body' and 'permalink'.
    """
    sentiment_text = str(row['sentiment'])
    body_text = str(row['body'])
    permalink_text = str(row['permalink'])

    # Placeholder bodies are rejected outright
    if body_text in ("[deleted]", "[removed]"):
        return False

    return (
        matches(sentiment_r, sentiment_text)
        and matches(permalink_r, permalink_text)
        and matches(body_r, body_text)
    )

In [28]:
# Row count before validation, for comparison with the filtered shape printed in the next cell
posts_target.shape

(550000, 3)

In [29]:
# Evaluate the row validators once per frame, then keep only the rows that passed
posts_mask = posts_target.apply(filter_posts, axis=1)
comments_mask = comments_target.apply(filter_comments, axis=1)
posts_target = posts_target.loc[posts_mask]
comments_target = comments_target.loc[comments_mask]
print(posts_target.shape)
print(comments_target.shape)

(345697, 3)
(1168247, 3)


In [30]:
# Extract post_id from comment permalink (the path segment right after "/comments/")
extract_regex = r'.+/comments/([^/]+)/.*'
# Filter out not used columns
comment_columns = ['post_id', 'sentiment', 'body']
# .assign builds a new frame instead of writing a column into the filtered frame in place,
# which pandas may flag with SettingWithCopyWarning.
comments_target = (
    comments_target
    .assign(post_id=comments_target['permalink'].str.extract(extract_regex, expand=False))
    .loc[:, comment_columns]
)

print(comments_target.shape)
comments_target.head(3)

(1168247, 3)


Unnamed: 0,post_id,sentiment,body
7,127ptk,0.3612,I like your secondary amine
8,12836e,0.0,not enough lemon
11,12957s,0.3182,original mount &amp; blade iirc (i didn't take...


In [31]:
# Per-column count of missing values in each cleaned frame
for label, frame in (("Posts:\n", posts_target), ("\nComments:\n", comments_target)):
    print(label)
    print(frame.isna().sum())

Posts:

id       0
url      0
score    0
dtype: int64

Comments:

post_id      0
sentiment    0
body         0
dtype: int64


In [32]:
# Average score across the posts that survived validation
post_scores = posts_target['score']
posts_mean_score = post_scores.mean()
print(posts_mean_score)

177.732453564827


In [33]:
# Pair every comment with its post: inner join on the post id, so posts without
# comments and comments without a matching post are both dropped.
posts_by_id = posts_target.set_index('id')
comments_by_post_id = comments_target.set_index('post_id')
comments_with_posts = posts_by_id.join(comments_by_post_id, how='inner')
print(comments_with_posts.shape)
comments_with_posts.head(3)

(656377, 4)


Unnamed: 0,url,score,sentiment,body
127ptk,http://i.imgur.com/GZMNT.jpg,14,0.3612,I like your secondary amine
12836e,http://i.imgur.com/Ng3QT.jpg,11,0.0,not enough lemon
12957s,http://i.imgur.com/nj1In.jpg,21,0.3182,original mount &amp; blade iirc (i didn't take...


In [34]:
# Comments whose body mentions any school-related keyword (case-insensitive substring search).
# One alternation pattern scans the bodies once instead of once per keyword.
student_keywords = ['university', 'college', 'student', 'teacher', 'professor']
student_pattern = '|'.join(student_keywords)
# na=False: a missing body counts as "no mention"
mentions_student = comments_with_posts['body'].str.contains(
    student_pattern, flags=re.IGNORECASE, regex=True, na=False
)
student_comments = comments_with_posts[mentions_student]
print(student_comments.shape)
student_comments.head(3)

(3466, 4)


Unnamed: 0,url,score,sentiment,body
1weq9d,http://i.imgur.com/uXjyWyK.jpg,76,0.0,nahh man Nobody schools Professor Pokemon.
21lrei,http://i.imgur.com/7dlVW3X.jpg,12,0.0,I'm that one student who asks whatever questio...
2cl6nn,http://37.media.tumblr.com/ef7a2cf29a80da0c1d1...,57,0.0,You know why I find this pic iconic? I had to ...


In [38]:
# Distinct meme URLs of student-related comments whose post scored above the average post score
above_average = student_comments['score'] > posts_mean_score
student_comments_best_urls = student_comments.loc[above_average, ['url']].drop_duplicates()
print(student_comments_best_urls.shape)
student_comments_best_urls.head(30)

(1536, 1)


Unnamed: 0,url
2h2xcc,http://i.imgur.com/gAGoeVk.png
2hizbm,http://i.imgur.com/UUuKryW.jpg
2jp72s,http://33.media.tumblr.com/afaa3b156f65b4b817c...
2m914u,http://imgur.com/6RQotUc
2ngfiw,http://i.imgur.com/ZxIuWez.png
2o2qvl,http://i.imgur.com/81daK.jpg
2o8n27,http://i.imgur.com/4RoJB3s.jpg
2oehmo,http://i.imgur.com/tUPBgj3.jpg
2ozhd8,http://imgur.com/Fv2YAnW
2p13x7,https://i.imgur.com/VNjskfC.png


In [36]:
# Mean comment sentiment per meme URL, best-rated first
url_sentiment = comments_with_posts[['url', 'sentiment']]
memes_with_sentiment = (
    url_sentiment
    .groupby('url')
    .mean()
    .reset_index()
    .dropna()
    .sort_values('sentiment', ascending=False)
)
print(memes_with_sentiment.shape)
memes_with_sentiment

(104169, 2)


Unnamed: 0,url,sentiment
54947,http://imgur.com/OgHIPMK,0.9999
5471,http://i.imgur.com/54wPMWF.jpg,0.9998
101011,https://imgur.com/YjivBrg.jpg,0.9989
65898,http://imgur.com/hiWHE9n,0.9989
947,http://45.media.tumblr.com/eeb0bdd88a495c66d83...,0.9987
...,...,...
97807,https://i.reddituploads.com/92e37f72611444e4b9...,-0.9937
34710,http://i.imgur.com/ohCj6d0.png,-0.9942
62140,http://imgur.com/bC973iw,-0.9946
99813,https://i.reddituploads.com/ecc57e12f3424ce4bd...,-0.9979


# Results

### Full
- Posts: all (starts on 0)
- Comments: all (starts on 0)
- AVG: #
- Best Meme: #
- Students Memes: #

### Half
- Posts: 1600000 (starts on 0)
- Comments: 6000000 (starts on 0)
- AVG: 224.47
- Best Meme: https://i.redd.it/zyvq456cbwz01.png
- Students Memes: #

### Upper Half
- Posts: 1600000 (starts on 1600000)
- Comments: 6000000 (starts on 6000000)
- AVG: 390.69
- Best Meme: https://i.redd.it/2esiak87aje41.jpg or https://i.redd.it/5lpr2qy0hi731.png
- Students Memes: #

### Small
- Posts: 550000 (starts on 0)
- Comments: 2000000 (starts on 0)
- AVG: 177.73
- Best Meme: http://imgur.com/OgHIPMK  
- Students Memes: #

### Medium
- Posts: 1100000 (starts on 0)
- Comments: 4000000 (starts on 0)
- AVG: 211.90
- Best Meme: http://imgur.com/OgHIPMK or http://i.imgur.com/YdKNs64.jpg	
- Students Memes: #

### Big
- Posts: 2700000 (starts on 0)
- Comments: 10000000 (starts on 0)
- AVG: 260.77
- Best Meme: https://i.redd.it/zyvq456cbwz01.png or https://i.redd.it/5lpr2qy0hi731.png
- Students Memes: #