In [14]:
import requests
import json
import re
import time
import pandas as pd

In [15]:
def get_pushshift_data(data_type, **kwargs):
    """
    Gets data from the pushshift api.
 
    data_type can be 'comment' or 'submission'
    The rest of the args are interpreted as payload.
 
    Read more: https://github.com/pushshift/api
    """
 
    base_url = f"https://api.pushshift.io/reddit/search/{data_type}/"
    payload = kwargs
    request = requests.get(base_url, params=payload)
    return request.json()

In [16]:
data_type="comment"     # give me comments, use "submission" to publish something
query="inspiring|inspired|inspire|uplifted|uplifting|uplift"          # Add your query
# duration="30d"          # Select the timeframe. Epoch value or Integer + "s,m,h,d" (i.e. "second", "minute", "hour", "day")
size=1000               # maximum 1000 comments
sort_type="score"       # Sort by score (Accepted: "score", "num_comments", "created_utc")
sort="desc"             # sort descending
aggs="subreddit"        #"author", "link_id", "created_utc", "subreddit"


In [17]:
data = get_pushshift_data(data_type=data_type,
                          q=query,
                          size=size,
                          aggs=aggs)

In [18]:
data = data["data"]

In [19]:
import pandas as pd
column_values = ['body','parent_id','subreddit_id', 'score', 'subreddit']
df_comments = pd.DataFrame.from_records(data)#[0:10]
df_comments= pd.DataFrame(df_comments, columns=column_values)
df_comments = df_comments.rename(columns={'body': "comment_body",'parent_id': "id", "score":"score_com"})
df_comments

Unnamed: 0,comment_body,id,subreddit_id,score_com,subreddit
0,The movie car was the 4th generation of Imprez...,t1_hbt3zn5,t5_2xrd1,1,gtaonline
1,"Enjoy anime, manga, or KPOP?\n\nHow about cand...",t3_pipwpa,t5_2qr34,1,smallbusiness
2,They'd just gotten back from seeing *Friends w...,t1_hbtw0fb,t5_3aimx,1,IdiotsInCars
3,"&gt; half ass-setup\n\n***\n\n^(Bleep-bloop, I...",t1_hbuh482,t5_scu5z,1,fo76FilthyCasuals
4,Funny and real fact. This show inspired me to ...,t3_pipmvc,t5_2sd8v,1,PersonOfInterest
...,...,...,...,...,...
95,"A lot of the earlier couples are wonderful, an...",t1_hbu8n7f,t5_3o6nf,1,90dayfianceuncensored
96,Connections: Evil Time Traveling Doppelgängers...,t3_pj6qn2,t5_2sj683,1,DeathBattleMatchups
97,"At first, I thought it was going to be a rage ...",t3_pj3mt5,t5_3c048,1,ConanExiles
98,"A short story heavily inspired by ""The Pedestr...",t3_pj6ltg,t5_2qh1i,1,AskReddit


In [23]:
def fetchObjects(**kwargs):
    # Default paramaters for API query
    params = {
        "sort_type":"created_utc",
        "sort":"asc",
        "size":10000,
        "query":"inspiring|inspired|inspire|uplifted|uplifting",
        }

    # Add additional paramters based on function arguments
    for key,value in kwargs.items():
        params[key] = value

    # Perform an API request
    base_url = f"https://api.pushshift.io/reddit/search/comment/?q={query}"
    r = requests.get(base_url, params)


    # Check the status code, if successful, process the data
    if r.status_code == 200:
        response = json.loads(r.text)
        data = response['data']
        sorted_data_by_id = sorted(data, key=lambda x: int(x['id'],36))
        return sorted_data_by_id


In [26]:
def extract_reddit_data(**kwargs):
    # Speficify the start timestamp
    max_created_utc = 1356998400  # 01/01/2013 @ 12:00am (UTC)
    max_id = 0
    
    for key,value in kwargs.items():
        if key == "columns":
            column_values = value
    
    df_all=pd.DataFrame(columns=column_values)
    while 1:
        nothing_processed = True
        # Call the recursive function
        objects = fetchObjects(**kwargs,after=max_created_utc)
        df_new = pd.DataFrame(objects, columns = column_values)
        df_all = pd.concat([df_all, df_new])
#         if len(df_all) == 50000: #for more data
        if len(df_all) == 1000:
            return df_all.reset_index(drop=True).sort_values(by=['score'], ascending=False)
        
        # Loop the returned data, ordered by date
        for object in objects:
            id = int(object['id'],36)
            if id > max_id:
                nothing_processed = False
                created_utc = object['created_utc']
                max_id = id
                if created_utc > max_created_utc: max_created_utc = created_utc

        # Exit if nothing happened
        if nothing_processed: return df_all.reset_index(drop=True).sort_values(by=['score'], ascending=False)
        max_created_utc -= 1

        # Sleep a little before the next recursive function call
        time.sleep(.5)

In [28]:
column_values = ['body','parent_id','subreddit_id', 'score']
df_comments = extract_reddit_data(columns = column_values)
df_comments

Unnamed: 0,body,parent_id,subreddit_id,score
149,this has inspired me to make an early birthday...,t3_15rb1k,t5_2qh33,397
37,You've just inspired my newest mission: get be...,t1_c7oxgnx,t5_2qh1i,227
883,D- Demonstrate Value \nE- Engage Physically \n...,t1_c7pmkea,t5_2qh33,204
349,&gt; I don't understand what your objection to...,t1_c7owury,t5_2qh13,74
175,I honestly felt a little...uncomfortable when ...,t1_c7p3ith,t5_2qh0u,71
...,...,...,...,...
487,"hmmm, i don't know, it does kind of sound like...",t1_c7p8c6s,t5_2rske,-4
494,"I *also* cook 95% of my food from scratch, and...",t1_c7pceo4,t5_2qhpm,-5
220,"I'm sorry, but what nonsense. \n\nScience is a...",t3_15rbot,t5_2qh2p,-6
498,You have to consider where they're coming from...,t1_c7p92lf,t5_2qh1i,-7


In [29]:
new_parent_id = [] #rename ids from parent_id column
for id in list(df_comments.parent_id):
    new_parent_id.append(id.split("_")[1])
df_comments.parent_id = new_parent_id
df_comments = df_comments.rename(columns={'body': "comment_body",'parent_id': "id", "score":"score_com"})
df_comments


Unnamed: 0,comment_body,id,subreddit_id,score_com
149,this has inspired me to make an early birthday...,15rb1k,t5_2qh33,397
37,You've just inspired my newest mission: get be...,c7oxgnx,t5_2qh1i,227
883,D- Demonstrate Value \nE- Engage Physically \n...,c7pmkea,t5_2qh33,204
349,&gt; I don't understand what your objection to...,c7owury,t5_2qh13,74
175,I honestly felt a little...uncomfortable when ...,c7p3ith,t5_2qh0u,71
...,...,...,...,...
487,"hmmm, i don't know, it does kind of sound like...",c7p8c6s,t5_2rske,-4
494,"I *also* cook 95% of my food from scratch, and...",c7pceo4,t5_2qhpm,-5
220,"I'm sorry, but what nonsense. \n\nScience is a...",15rbot,t5_2qh2p,-6
498,You have to consider where they're coming from...,c7p92lf,t5_2qh1i,-7


In [40]:
import requests
# max 500 ids per request, need to call multiple times
max_n = 500
list_submission_ids = list(set(list(df_comments["id"])))

list_submission_ids_split = [list_submission_ids[i:i + max_n] for i in range(0, len(list_submission_ids), max_n)]  

df_submissions = pd.DataFrame(columns = column_values)
for list_submission_ids in list_submission_ids_split:
    
    print(len(list_submission_ids))

    str_list_ids = ",".join(list_submission_ids)

    url = f"https://api.pushshift.io/reddit/search/submission/?ids=" + str_list_ids 
    request = requests.get(url)
    try:
        json_response = request.json()
        data=json_response["data"]
        print(len(data))
        df_submissions_part = pd.DataFrame.from_records(data)
        column_values = ['title', 'score', 'selftext', 'url', 'id', "subreddit"]
        df_submissions_part = pd.DataFrame(df_submissions_part, columns = column_values)
        df_submissions = pd.concat([df_submissions, df_submissions_part])
    except Exception as e:
        print("Exception: " + str(e))
    
df_submissions = df_submissions.reset_index(drop=True).sort_values(by=['score'], ascending=False)
df_submissions

500
53
372
53


Unnamed: 0,title,score,selftext,url,id,subreddit
60,Deep fried hot dog stars with cheese sauce.,2026,,http://imgur.com/a/VObuF,14v5iw,food
97,"DJ Shadow kicked off decks in Miami - ""I don't...",1861,,https://www.youtube.com/watch?v=g3sO0Se-NT0,15q2g1,Music
30,Can a group of KSP fans but an inexperienced p...,1793,,http://imgur.com/a/sW3Tz,15qf0e,KerbalSpaceProgram
77,What is your favorite product that has been di...,1750,Simple question. What product did you love th...,http://www.reddit.com/r/AskReddit/comments/15q...,15qmfc,AskReddit
98,White kid singing the blues like it's nobody's...,1682,,http://youtu.be/F1pwnb0NpQw,15mmna,videos
...,...,...,...,...,...,...
44,How do games affect us?,0,The consensus on reddit seems to be that video...,http://www.reddit.com/r/truegaming/comments/15...,15onta,truegaming
76,If this redditor snaps... 11,0,,http://imgur.com/a/i0DVB,15qmm5,GunsAreCool
71,"Parents of disabled children, if you had the c...",0,,http://www.reddit.com/r/AskReddit/comments/15q...,15qwfn,AskReddit
66,[Meta]Open Dialogue with the Mods,0,Ok guys so here goes...\n\nAfter [this No. 1 p...,http://www.reddit.com/r/childfree/comments/15q...,15qtp8,childfree


In [41]:
idx = 1
df_submissions["title"][idx] + " | " + df_submissions["selftext"][idx] + " | "  + str(df_submissions["score"][idx]) + " | " + df_submissions["subreddit"][idx]

'What are your favorite blogs/tumblrs? \n | What are your favorite natural hair, life, fashion, cultural, food educational blogs? | 9 | blackladies'

In [42]:
merged = pd.merge(df_submissions, df_comments, on="id", how='inner')
column_values =["subreddit", "id", "comment_body", "score_com", "title", "selftext", "score"]
merged= pd.DataFrame(merged, columns=column_values)
merged

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score
0,food,14v5iw,"Made them for Christmas, with the cheese sauce...",1,Deep fried hot dog stars with cheese sauce.,,2026
1,Music,15q2g1,Dj shadow is my favorite artist and a top-tier...,1,"DJ Shadow kicked off decks in Miami - ""I don't...",,1861
2,KerbalSpaceProgram,15qf0e,I'm inspired :D,4,Can a group of KSP fans but an inexperienced p...,,1793
3,AskReddit,15qmfc,I can't find evidence that these ever existed ...,1,What is your favorite product that has been di...,Simple question. What product did you love th...,1750
4,videos,15mmna,This video inspired/spoke to me thanks.,0,White kid singing the blues like it's nobody's...,,1682
...,...,...,...,...,...,...,...
106,truegaming,15onta,People take things away from games on an indiv...,2,How do games affect us?,The consensus on reddit seems to be that video...,0
107,GunsAreCool,15qmm5,That was my post...and after getting slammed f...,2,If this redditor snaps... 11,,0
108,AskReddit,15qwfn,I used to volunteer a lot with the special Oly...,1,"Parents of disabled children, if you had the c...",,0
109,childfree,15qtp8,I believe the minority hasn't spoken up here. ...,-2,[Meta]Open Dialogue with the Mods,Ok guys so here goes...\n\nAfter [this No. 1 p...,0


In [18]:
merged.transpose().to_json('data/comments_submission_all.json', index = 'true') 

## Clean data: 

#### Remove comments and posts that have negative scores

In [51]:
merged = merged[~(merged['score_com'] <= 0)]  
merged = merged[~(merged['score'] <= 0)]
merged

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score
1,pics,7m28mc,It's these kind of posts that restore some of ...,1,She took me in 15 years ago and no one makes m...,,171795
3,pics,6f0lr7,I understand trying to uplift people with the ...,1,You can destroy a country. But you can't destr...,,129553
4,pics,808ddp,Hope you put this on r/uplift.,1,Wife is a kindergarten teacher. A couple of th...,,119527
5,todayilearned,6po7fz,"I wish we could do this on a global scale, upl...",2,TIL when a millionaire gave everyone in a Flor...,,117421
6,aww,7i2z6g,It’s the smallest of gestures that Can can gi...,1,Little boy just wants to hug the police officer,,104777
...,...,...,...,...,...,...,...
20484,AskReddit,30qhb0,"I saw my babysitter naked, once. And she used ...",1,Redditors who have had babysitters/hired babys...,,1
20485,TwoXChromosomes,2cqot7,"I am a feminist, but I'll tell you some flaws ...",1,Why aren't you a feminist?,,1
20486,AskMen,8uuymy,"In established sci fi terms this is called ""u...",10,If humans get the capability to create brains ...,Or should we not give the rational mind to any...,1
20487,askscience,2gn537,The answer to this is complicated and also a q...,5,Are all mountains formed by tectonic plate mov...,,1


1. remove links

In [58]:
import re

merged.dropna()

for index in merged.index:
    title = merged.loc[index,'title']
    title = re.sub(r'http\S+', '', title)
    title = title.replace("\n", "")
    title = re.sub('\s{2,}', ' ', title)  # To remove more than one space
    merged.loc[index,'title'] = title.rstrip() #remove new line

for index in merged.index:
    selftext = merged.loc[index,'selftext']
    selftext = re.sub(r'http\S+', '', selftext)
    selftext = selftext.replace("\n", "")
    selftext = re.sub('\s{2,}', ' ', selftext)  # To remove more than one space
    merged.loc[index,'selftext'] = selftext.rstrip() #remove new line

for index in merged.index:
    comment_body = merged.loc[index,'comment_body']
    comment_body = re.sub(r'http\S+', '', comment_body)
    comment_body = comment_body.replace("\n", "")
    comment_body = re.sub('\s{2,}', ' ', comment_body)  # To remove more than one space
    merged.loc[index,'comment_body'] = comment_body.rstrip() #remove new line


merged

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score
1,pics,7m28mc,It's these kind of posts that restore some of ...,1,She took me in 15 years ago and no one makes m...,,171795
3,pics,6f0lr7,I understand trying to uplift people with the ...,1,You can destroy a country. But you can't destr...,,129553
4,pics,808ddp,Hope you put this on r/uplift.,1,Wife is a kindergarten teacher. A couple of th...,,119527
5,todayilearned,6po7fz,"I wish we could do this on a global scale, upl...",2,TIL when a millionaire gave everyone in a Flor...,,117421
6,aww,7i2z6g,It’s the smallest of gestures that Can can giv...,1,Little boy just wants to hug the police officer,,104777
...,...,...,...,...,...,...,...
20484,AskReddit,30qhb0,"I saw my babysitter naked, once. And she used ...",1,Redditors who have had babysitters/hired babys...,,1
20485,TwoXChromosomes,2cqot7,"I am a feminist, but I'll tell you some flaws ...",1,Why aren't you a feminist?,,1
20486,AskMen,8uuymy,"In established sci fi terms this is called ""u...",10,If humans get the capability to create brains ...,Or should we not give the rational mind to any...,1
20487,askscience,2gn537,The answer to this is complicated and also a q...,5,Are all mountains formed by tectonic plate mov...,,1


2. remove posts that contains string in comment: "Askreddit is for thought-provoking, discussion-inspiring questions. Askreddit is not your research source. If the answer can be googled, or adequately answered in one word, it’s not right for this subreddit. "

In [None]:
query = " is for thought-provoking, discussion-inspiring questions"
merged = merged[~merged.comment_body.str.contains(query)]
merged

### Look at top rated comments from certain subreddits - those asking questions

In [60]:
merged6 = merged[merged.subreddit.isin(["AskReddit", "IAmA"])]
merged6

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score
7,AskReddit,8pks1u,The sympathy in this thread is comforting.The ...,3,Suicide Prevention Megathread,With the news today of the passing of the amaz...,104195
12,IAmA,80ow6w,Hello Bill! First thank you and your wife Meli...,1,"I’m Bill Gates, co-chair of the Bill &amp; Mel...",I’m excited to be back for my sixth AMA.Here’s...,94575
36,AskReddit,84e0nc,I never had a great relationship with my fathe...,2,"Daughters of reddit, what is something you wis...",,66465
78,AskReddit,8qbnls,"I'm hispanic and since Trump became President,...",1,Since Donald Trump has been President of the U...,,48541
82,AskReddit,7e8r3x,I used to work for one of those bargain stores...,2,"What is unethical as fuck, but is extremely co...",,47881
...,...,...,...,...,...,...,...
20459,AskReddit,5kojl7,I think I'd continue my giftOf using my verse ...,1,"If you died and became a ghost, who would you ...",,1
20460,AskReddit,4ll6k1,One man saved ten women from a blazing forest ...,2,"What's your most uplifting story, Reddit?",,1
20461,AskReddit,4ll6k1,One man saved ten women from a blazing forest ...,2,"What's your most uplifting story, Reddit?",,1
20473,AskReddit,7c5ii4,Convection currents / uplift.,1,How come gravity doesn't pull down clouds?,,1


In [51]:
merged6.transpose().to_json('data/comments_submission_ask.json', index = 'true') 

### Look at top rated comments from everywhere and filter by length

In [23]:
merged7 = merged[(merged['comment_body']).str.split().str.len().gt(10)]
merged7 = merged7[(merged7['comment_body']).str.split().str.len().lt(201)]
merged7 = merged7.sort_values(by=['score_com'], ascending=False)
merged7

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score
2007,AskReddit,15twl7,1. Up1. Harold and Kumar1. Star Trek (2009)Aft...,2676,What three completely unrelated movies can you...,"EDIT: Front page, that is awesome! It is takin...",2100
894,worldnews,17xe4c,"Here's my translation: &gt;Last night, a truly...",1583,"North Korea, poised to conduct a nuclear test ...",,2895
1512,AskReddit,170tzo,"Bratz Dolls. They look anorexic, dress like ho...",1400,What is the creepiest thing that society accep...,,2437
2257,funny,166a9n,It's always a bummer waiting to buy child slav...,1282,"Go home, fashion. You're drunk.",,1983
1021,pics,16fjjm,Do not stand at my grave and weep I am not th...,1221,Aaron Shwartz- Reddit Co-founder R.I.P,,2740
...,...,...,...,...,...,...,...
8257,RedditLaqueristas,17fiju,"Ok, I'm inspired. My Wallis has been collectin...",1,Butter London's Wallis is such an interesting ...,,154
8252,leagueoflegends,18qy3w,"oh man, seeing those screenshots made me ache ...",1,MinionInvaders - a fan-made game,"Hello Reddit,I just updated MinionInvaders, a ...",154
8250,progresspics,16klde,THE FUQ I just C!!! It's like a total metamorp...,1,"M 1 year 5'7"" 225-145",Then Now Exactly 1 year apartEdit: Will Post r...,154
8245,Minecraft,17t8rt,I like the village and the Super Craft bros in...,1,New PvP map Im working on,,154


In [24]:
merged7.transpose().to_json('data/comments_submission_ask_all.json', index = 'true') 

3. remove posts with title + selftext less than 10 words

In [61]:
merged2 = merged[(merged['title'] + merged['selftext']).str.split().str.len().gt(20)]
merged2

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score
1,pics,7m28mc,It's these kind of posts that restore some of ...,1,She took me in 15 years ago and no one makes m...,,171795
3,pics,6f0lr7,I understand trying to uplift people with the ...,1,You can destroy a country. But you can't destr...,,129553
4,pics,808ddp,Hope you put this on r/uplift.,1,Wife is a kindergarten teacher. A couple of th...,,119527
5,todayilearned,6po7fz,"I wish we could do this on a global scale, upl...",2,TIL when a millionaire gave everyone in a Flor...,,117421
7,AskReddit,8pks1u,The sympathy in this thread is comforting.The ...,3,Suicide Prevention Megathread,With the news today of the passing of the amaz...,104195
...,...,...,...,...,...,...,...
20479,DnD,7lc6f8,"Using strictly PHB, whichever class can buy a ...",2,3.5 What's the optimal and sub optimal stuff i...,I played 3.5 for years but honestly wasn't act...,1
20481,relationships,6wpt41,My parents divorced when I was very young and ...,2,My [24F] mother [51F] is addicted to gaming an...,**Background:**\n\nMy mother was successful ac...,1
20482,Drugs,27x3cd,It helped me insofar as it better allowed me t...,4,Using LSD or Psilocybin Mushrooms to find out ...,I'm an incoming senior to high school.\n\nI sm...,1
20483,offmychest,91nzgb,I can’t even begin to imagine how you feel. On...,2,my dad hates me,my mom called me to come over because she made...,1


4. remove posts with title + selftext greater than 100 words

In [63]:
merged3 = merged2[(merged2['title'] + merged2['selftext']).str.split().str.len().lt(201)]
merged3

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score
1,pics,7m28mc,It's these kind of posts that restore some of ...,1,She took me in 15 years ago and no one makes m...,,171795
3,pics,6f0lr7,I understand trying to uplift people with the ...,1,You can destroy a country. But you can't destr...,,129553
4,pics,808ddp,Hope you put this on r/uplift.,1,Wife is a kindergarten teacher. A couple of th...,,119527
5,todayilearned,6po7fz,"I wish we could do this on a global scale, upl...",2,TIL when a millionaire gave everyone in a Flor...,,117421
7,AskReddit,8pks1u,The sympathy in this thread is comforting.The ...,3,Suicide Prevention Megathread,With the news today of the passing of the amaz...,104195
...,...,...,...,...,...,...,...
20476,Com320_ASU_Spring2014,24k2fv,I think that the world presented in Idiocracy ...,1,Last Extra Credit Discussion Board,Class - if you want to earn a few more (maximu...,1
20478,adventism,3i58sa,Does the music uplift and draw one to a higher...,1,Genuine Question about Music,"For some backstory, I am not an adventist. I a...",1
20479,DnD,7lc6f8,"Using strictly PHB, whichever class can buy a ...",2,3.5 What's the optimal and sub optimal stuff i...,I played 3.5 for years but honestly wasn't act...,1
20483,offmychest,91nzgb,I can’t even begin to imagine how you feel. On...,2,my dad hates me,my mom called me to come over because she made...,1


### Remove posts from certain subreddits: pics, videos, Music ..

In [64]:
list_subreddits_to_remove = ["pics", "movies", "Music", "videos", "trapmuzik"]

merged4 = merged3[~merged3.subreddit.isin(list_subreddits_to_remove)]

merged5 = merged3[merged3.subreddit.isin(list_subreddits_to_remove)]

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score
5,todayilearned,6po7fz,"I wish we could do this on a global scale, upl...",2,TIL when a millionaire gave everyone in a Flor...,,117421
7,AskReddit,8pks1u,The sympathy in this thread is comforting.The ...,3,Suicide Prevention Megathread,With the news today of the passing of the amaz...,104195
12,IAmA,80ow6w,Hello Bill! First thank you and your wife Meli...,1,"I’m Bill Gates, co-chair of the Bill &amp; Mel...",I’m excited to be back for my sixth AMA.Here’s...,94575
15,todayilearned,8bwtmt,"I'd been feeling bloated all morning and, as m...",59,"TIL on the set of The Princess Bride, André th...",,91310
30,Showerthoughts,95m643,"Reincarnation is real, Exhibit A : I think I'l...",1,"If reincarnation is real, then maybe flies and...",,74457
...,...,...,...,...,...,...,...
20476,Com320_ASU_Spring2014,24k2fv,I think that the world presented in Idiocracy ...,1,Last Extra Credit Discussion Board,Class - if you want to earn a few more (maximu...,1
20478,adventism,3i58sa,Does the music uplift and draw one to a higher...,1,Genuine Question about Music,"For some backstory, I am not an adventist. I a...",1
20479,DnD,7lc6f8,"Using strictly PHB, whichever class can buy a ...",2,3.5 What's the optimal and sub optimal stuff i...,I played 3.5 for years but honestly wasn't act...,1
20483,offmychest,91nzgb,I can’t even begin to imagine how you feel. On...,2,my dad hates me,my mom called me to come over because she made...,1


In [47]:
merged5.transpose().to_json('data/comments_submission_pics_movies_music_videos.json', index = 'true') 

### Remove Quotes , titles, ads ..

subreddit: IAmA, 

In [55]:
query = "IAmA"
merged4 = merged4[~merged4.subreddit.str.contains(query)]
merged4

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score
128,todayilearned,16dsdl,"Man, this has inspired me to create a new camp...",1,TIL that after needing 13 liters of blood for ...,,8195
129,todayilearned,16dsdl,this has inspired me to donate blood.,1,TIL that after needing 13 liters of blood for ...,,8195
134,gaming,17iw9q,Thanks for posting - I've shared it with my so...,2,So we built Kings Landing in Minecraft. Heres ...,,6241
135,IAmA,16mq0g,"You have done so much with your life, I'm 16, ...",18,IAmArnold... Ask me anything.,"Former Mr. Olympia, Conan, Terminator, and Gov...",5651
136,IAmA,16mq0g,What inspired you to run for governor?,4,IAmArnold... Ask me anything.,"Former Mr. Olympia, Conan, Terminator, and Gov...",5651
...,...,...,...,...,...,...,...
14213,AskReddit,17zly4,Being that Reddit is the source for original (...,1,For you creative writers: Any hints or product...,I consider myself an aspiring writer who is ac...,1
14215,trance,17gvzy,It's mostly just uplifting trance in a weird m...,3,Trance with an eerie vibe?,I want some more of this style of trance. Some...,1
14216,trance,17gvzy,It's mostly just uplifting trance in a weird m...,3,Trance with an eerie vibe?,I want some more of this style of trance. Some...,1
14219,explainlikeimfive,183auv,"It's largely a myth, started by a book called ...",3,Please explain nanotechnology to me.,I have recently been researching about nanotec...,1


In [66]:
merged4.transpose().to_json('data/comments_submission_all_round3.json', index = 'true') 

### Interesting findings

1. "What fictional person has had the biggest impact on your life and how?" title on AskReddit - has many good inspiring comments

2. subreddits: pics, movies, videos have comments with inspiring movies 

## Analysis of topics

1. Subreddits

In [39]:
from collections import Counter
all_subreddits = list(merged4["subreddit"])
Counter(all_subreddits).most_common()

[('AskReddit', 876),
 ('IAmA', 328),
 ('todayilearned', 132),
 ('gaming', 94),
 ('atheism', 76),
 ('funny', 60),
 ('Random_Acts_Of_Amazon', 54),
 ('RedditLaqueristas', 51),
 ('progresspics', 49),
 ('trees', 47),
 ('WTF', 46),
 ('Christianity', 44),
 ('loseit', 40),
 ('keto', 37),
 ('politics', 36),
 ('leagueoflegends', 35),
 ('malefashionadvice', 35),
 ('DIY', 34),
 ('Games', 34),
 ('Minecraft', 31),
 ('GetMotivated', 28),
 ('writing', 28),
 ('books', 27),
 ('Guitar', 25),
 ('gamedev', 25),
 ('SketchDaily', 25),
 ('nfl', 24),
 ('MakeupAddiction', 24),
 ('comicbooks', 24),
 ('rpg', 24),
 ('AskHistorians', 24),
 ('aww', 23),
 ('tattoos', 22),
 ('NoFap', 22),
 ('exmormon', 22),
 ('Metal', 22),
 ('nba', 21),
 ('Fitness', 20),
 ('mylittlepony', 20),
 ('running', 20),
 ('SquaredCircle', 20),
 ('booksuggestions', 20),
 ('space', 19),
 ('CFB', 19),
 ('TwoXChromosomes', 19),
 ('MensRights', 19),
 ('AskWomen', 19),
 ('MLPLounge', 19),
 ('DebateReligion', 18),
 ('gonewild', 17),
 ('masseffect', 1

In [76]:
substring_list = ["Reddit, what is the happiest fact you know? I need something to make me feel better.", 
                  "Hey Reddit, what is a harsh truth that helped you change your life? This article has changed my life in so many ways",
                "What kind of teacher did / might have inspired you in high school? i.e. What kind of teacher would you want your children to have? What made your teacher awesome / what made your teacher terrible that you would NOT want to see? Who inspired you to be a better person and/or pursue whatever it is you're pursuing?",
                  "Who is your hero and why? Is it your dad, a teacher or a random Good Samaritan that changed your life? There's got to be some interesting stories out there!",
                  "Who inspired you and how? Which Person, no matter if famous or not famous, dead or alive has inspired the way you live/ your life and how?",
                    "Reddit I'm feeling down, what are some subreddits that are sure to brighten by day?",
                  "How do you help yourself get out of depression? We've all been there.Personally, I write down what I'm feeling. No matter what it is, I just write it down.That and always remembering, tomorrow the sun will rise.",
                  "What is the most inspiring thing you have ever seen or heard? Lots of things are going on, and I need some inspiration! Hopefully some of you can get inspired from some of the responses!",
                  "I'm a TA. What are some fun ice breakers or games I can dish out to my students on their first day? For the love of God, G-rated please and thank you.",
                 "It is my cakeday today, and I would like to ask you redditors a question: What inspires you?",
                 "When was the last time you felt inspired? Who or what inspired you?",
                 ]
                  
merged_sth = merged4[(merged4['title'] + " " + merged4['selftext']).str.contains("Who is your hero and why?")]
merged_sth

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score,freq
12822,AskReddit,184vmk,Some pretty inspiring posts here!,1,Who is your hero and why?,"Is it your dad, a teacher or a random Good Sam...",4,511


In [58]:

merged4 = merged4.assign(freq=merged4.apply(lambda x: merged4.subreddit.value_counts().to_dict()[x.subreddit], axis=1)) \
        .sort_values(by=['freq', 'subreddit'], ascending=[False, True])

Unnamed: 0,subreddit,id,comment_body,score_com,title,selftext,score,freq
135,IAmA,16mq0g,"You have done so much with your life, I'm 16, ...",18,IAmArnold... Ask me anything.,"Former Mr. Olympia, Conan, Terminator, and Gov...",5651,760
136,IAmA,16mq0g,What inspired you to run for governor?,4,IAmArnold... Ask me anything.,"Former Mr. Olympia, Conan, Terminator, and Gov...",5651,760
137,IAmA,16mq0g,Whats in the future for you? Do you still lift...,2,IAmArnold... Ask me anything.,"Former Mr. Olympia, Conan, Terminator, and Gov...",5651,760
138,IAmA,16mq0g,"I aspire to be like you, your role in pumping ...",2,IAmArnold... Ask me anything.,"Former Mr. Olympia, Conan, Terminator, and Gov...",5651,760
139,IAmA,16mq0g,Mr. Schwarzenegger... You have completely insp...,2,IAmArnold... Ask me anything.,"Former Mr. Olympia, Conan, Terminator, and Gov...",5651,760
...,...,...,...,...,...,...,...,...
10453,wollongong,163trx,"Not too bad. Finally submitted my PhD thesis, ...",3,This subreddit is way too dead. Let's talk and...,Hey. This subreddit doesn't get enough love. H...,13,1
13388,woodworking,16yo9i,"For a workbench, I highly recommend something ...",1,Help set up my workshop,I recently moved across country and bought a n...,2,1
13339,writers,16gvlj,Two of the most important things are a good pl...,1,Writers of Reddit. I have a few questions for ...,Hello everyone! I'm an architecture student an...,2,1
6611,xbox360,16g9fk,Fallout New Vegas probably doesn't seem very b...,2,What are the most visually appealing Xbox 360 ...,"What are some of the best visually, artwork, e...",77,1


In [60]:
from collections import Counter
Counter(merged4["subreddit"]).most_common()

[('IAmA', 760),
 ('AskReddit', 511),
 ('Random_Acts_Of_Amazon', 104),
 ('loseit', 90),
 ('todayilearned', 86),
 ('atheism', 68),
 ('Christianity', 60),
 ('keto', 47),
 ('writing', 47),
 ('NoFap', 46),
 ('gamedev', 45),
 ('Guitar', 44),
 ('malefashionadvice', 38),
 ('depression', 36),
 ('leagueoflegends', 35),
 ('trees', 35),
 ('books', 34),
 ('nfl', 34),
 ('gaming', 32),
 ('AskHistorians', 31),
 ('rpg', 30),
 ('MLPLounge', 29),
 ('Games', 28),
 ('Metal', 27),
 ('WeAreTheMusicMakers', 27),
 ('exmormon', 27),
 ('booksuggestions', 26),
 ('AskMen', 24),
 ('GetMotivated', 24),
 ('RandomActsOfPolish', 24),
 ('DebateReligion', 23),
 ('DnD', 23),
 ('progresspics', 23),
 ('AskWomen', 22),
 ('SketchDaily', 22),
 ('TwoXChromosomes', 22),
 ('femalefashionadvice', 22),
 ('nba', 22),
 ('MakeupAddiction', 21),
 ('WTF', 21),
 ('comicbooks', 21),
 ('Fitness', 20),
 ('mylittlepony', 20),
 ('politics', 20),
 ('RedditLaqueristas', 19),
 ('running', 19),
 ('DIY', 18),
 ('Fantasy', 18),
 ('photography', 18)

### Get all comments for submission id

In [92]:
list_submission_ids = ["16wirc", "162b0z" ,"15vob8", "184vmk", "17byde", "181c81", "181zkt", "16lito", "18slc9", "17ok2p"]
inspir_ids=["17ok2p", "16lito", "189nuu", "17byde"]
column_values = ['body','parent_id','subreddit_id', 'score', 'subreddit']

print(len(list_submission_ids))

str_list_ids = ",".join(inspir_ids)

url = f"https://api.pushshift.io/reddit/comment/search/?link_id=" + str_list_ids + "&subreddit=AskReddit"
print(url)
df_all=pd.DataFrame(columns=column_values)
    
    r = requests.get(url,  timeout=30)
    # Check the status code, if successful, process the data
    if r.status_code == 200:
        response = json.loads(r.text)
        data = response['data']


10
https://api.pushshift.io/reddit/comment/search/?link_id=17ok2p,16lito,189nuu,17byde&subreddit=AskReddit&limit=1000


In [102]:
def fetchObjects(**kwargs):
    # Default paramaters for API query
    params = {
        "sort_type":"created_utc",
        "sort":"asc",
        "size":10000
        }

    # Add additional paramters based on function arguments
    for key,value in kwargs.items():
        params[key] = value
    
    list_submission_ids = ["16wirc", "162b0z" ,"15vob8", "184vmk", "17byde", "181c81", "181zkt", "16lito", "18slc9", "17ok2p", "189nuu"]
    inspir_ids=["17ok2p", "16lito", "189nuu", "17byde"]
    str_list_ids = ",".join(list_submission_ids)
        
    # Perform an API request
    r = requests.get("https://api.pushshift.io/reddit/comment/search/?link_id=" + str_list_ids, params=params, timeout=30)

    # Check the status code, if successful, process the data
    if r.status_code == 200:
        response = json.loads(r.text)
        data = response['data']
        sorted_data_by_id = sorted(data, key=lambda x: int(x['id'],36))
        return sorted_data_by_id


In [105]:
def extract_reddit_data(**kwargs):
    # Speficify the start timestamp
    max_created_utc = 1356998400  # 01/01/2013 @ 12:00am (UTC)
    max_id = 0
    
    for key,value in kwargs.items():
        if key == "columns":
            column_values = value
    
    df_all=pd.DataFrame(columns=column_values)
    # While loop for recursive function
    while 1:
        nothing_processed = True
        # Call the recursive function
        objects = fetchObjects(**kwargs,after=max_created_utc)
        print(len(objects))
        df_new = pd.DataFrame(objects, columns = column_values)
        df_all = pd.concat([df_all, df_new])

        # Loop the returned data, ordered by date
        for object in objects:
            id = int(object['id'],36)
            if id > max_id:
                nothing_processed = False
                created_utc = object['created_utc']
                max_id = id
                if created_utc > max_created_utc: max_created_utc = created_utc

        # Exit if nothing happened
        if nothing_processed: return df_all.reset_index(drop=True).sort_values(by=['score'], ascending=False)
        max_created_utc -= 1

        # Sleep a little before the next recursive function call
        time.sleep(.5)

In [106]:
df_submission = extract_reddit_data(subreddit="AskReddit", columns = ['body','parent_id','subreddit_id', 'score', 'subreddit'])

100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
66
1
(9267, 5)


Unnamed: 0,body,parent_id,subreddit_id,score,subreddit
54,Otters hold hands when sleeping so they dont d...,t3_16wirc,t5_2qh1i,3830,AskReddit
281,"\n\n&gt;Reddit, what is the happiest fact you ...",t1_c800pji,t5_2qh1i,3633,AskReddit
88,"On 17 August 1999, a major earthquake struck T...",t3_16wirc,t5_2qh1i,3168,AskReddit
96,I'm checking google for a picture now.\nEDIT: ...,t1_c8006c2,t5_2qh1i,3017,AskReddit
157,[The West Wing](http://www.youtube.com/watch?f...,t3_16wirc,t5_2qh1i,2980,AskReddit
...,...,...,...,...,...
101,"Otters rape baby seals.\n\nEdit: C'mon, since ...",t1_c8006c2,t5_2qh1i,-17,AskReddit
410,Damn straight! Have an upvote.,t1_c800nmz,t5_2qh1i,-19,AskReddit
5008,WOW it's the first time I've ever seen this me...,t1_c804925,t5_2qh1i,-24,AskReddit
49,I'm one day closer to death.,t3_16wirc,t5_2qh1i,-24,AskReddit


In [110]:
df_submission = df_submission[~(df_submission['score'] <= 0)]
df_submission

Unnamed: 0,body,parent_id,subreddit_id,score,subreddit
54,Otters hold hands when sleeping so they dont d...,t3_16wirc,t5_2qh1i,3830,AskReddit
281,"\n\n&gt;Reddit, what is the happiest fact you ...",t1_c800pji,t5_2qh1i,3633,AskReddit
88,"On 17 August 1999, a major earthquake struck T...",t3_16wirc,t5_2qh1i,3168,AskReddit
96,I'm checking google for a picture now.\nEDIT: ...,t1_c8006c2,t5_2qh1i,3017,AskReddit
157,[The West Wing](http://www.youtube.com/watch?f...,t3_16wirc,t5_2qh1i,2980,AskReddit
...,...,...,...,...,...
5587,Theres always money in the banana stand.,t1_c803v4g,t5_2qh1i,1,AskReddit
5234,As well as Mac and Sweet Dee. Dude gets paid t...,t1_c80283l,t5_2qh1i,1,AskReddit
2380,I wonder if comedy clubs hire people to laugh ...,t1_c800vgt,t5_2qh1i,1,AskReddit
5220,A better word: spumiferous - adj. describing s...,t1_c801f29,t5_2qh1i,1,AskReddit


In [111]:
for index in df_submission.index:
    selftext = df_submission.loc[index,'body']
    selftext = re.sub(r'http\S+', '', selftext)
    selftext = selftext.replace("\n", "")
    selftext = re.sub('\s{2,}', ' ', selftext)  # To remove more than one space
    df_submission.loc[index,'body'] = selftext.rstrip() #remove new line
df_submission

Unnamed: 0,body,parent_id,subreddit_id,score,subreddit
54,Otters hold hands when sleeping so they dont d...,t3_16wirc,t5_2qh1i,3830,AskReddit
281,"&gt;Reddit, what is the happiest fact you know...",t1_c800pji,t5_2qh1i,3633,AskReddit
88,"On 17 August 1999, a major earthquake struck T...",t3_16wirc,t5_2qh1i,3168,AskReddit
96,I'm checking google for a picture now.EDIT: So...,t1_c8006c2,t5_2qh1i,3017,AskReddit
157,"[The West Wing](A few years ago, Voyager 1 cro...",t3_16wirc,t5_2qh1i,2980,AskReddit
...,...,...,...,...,...
5587,Theres always money in the banana stand.,t1_c803v4g,t5_2qh1i,1,AskReddit
5234,As well as Mac and Sweet Dee. Dude gets paid t...,t1_c80283l,t5_2qh1i,1,AskReddit
2380,I wonder if comedy clubs hire people to laugh ...,t1_c800vgt,t5_2qh1i,1,AskReddit
5220,A better word: spumiferous - adj. describing s...,t1_c801f29,t5_2qh1i,1,AskReddit


In [115]:
df_submission2 = df_submission[(df_submission['body']).str.split().str.len().gt(10)]
df_submission2 = df_submission2[(df_submission['body']).str.split().str.len().lt(201)]
df_submission2 = df_submission2.rename(columns={'body': "comment_body",'parent_id': "id", "score":"score_com"})
df_submission2

  df_submission2 = df_submission2[(df_submission['body']).str.split().str.len().lt(201)]


Unnamed: 0,comment_body,id,subreddit_id,score_com,subreddit
54,Otters hold hands when sleeping so they dont d...,t3_16wirc,t5_2qh1i,3830,AskReddit
281,"&gt;Reddit, what is the happiest fact you know...",t1_c800pji,t5_2qh1i,3633,AskReddit
88,"On 17 August 1999, a major earthquake struck T...",t3_16wirc,t5_2qh1i,3168,AskReddit
157,"[The West Wing](A few years ago, Voyager 1 cro...",t3_16wirc,t5_2qh1i,2980,AskReddit
164,Despite high infant mortality rates and lower ...,t3_16wirc,t5_2qh1i,2679,AskReddit
...,...,...,...,...,...
5235,I think you missed the happy part. unless your...,t1_c805abc,t5_2qh1i,1,AskReddit
5558,"So A termite walks into the bar and asks ""Is t...",t1_c801s5v,t5_2qh1i,1,AskReddit
5234,As well as Mac and Sweet Dee. Dude gets paid t...,t1_c80283l,t5_2qh1i,1,AskReddit
2380,I wonder if comedy clubs hire people to laugh ...,t1_c800vgt,t5_2qh1i,1,AskReddit


In [116]:
df_submission2.transpose().to_json('data/comments_for_AskReddit_specific.json', index = 'true')