# Identifying Potential Customers Based on Reddit Posts 

#### By Julia Kelman ([GitHub](https://git.generalassemb.ly/julia-kelman/))

## Problem Statement

### Loading Libraries 

In [5]:
import pandas as pd
import datetime as dt
import time
import requests

### Data Gathering Function

In [6]:
def query_pushshift(subreddit, kind = 'submission', day_window = 30, n = 5):
    """Download posts from one subreddit through the Pushshift search API.

    Sends ``n`` GET requests to
    ``https://api.pushshift.io/reddit/search/{kind}``. Request ``i``
    (``i = 1..n``) asks for up to 500 items with ``after`` set to
    ``day_window * i`` days. The function sleeps two seconds after every
    request, then concatenates all responses into one DataFrame.

    Parameters
    ----------
    subreddit : str
        Subreddit name as it appears in the URL (no ``r/`` prefix).
    kind : str, default 'submission'
        Pushshift search endpoint. For ``'submission'`` the result is
        restricted to the ``SUBFIELDS`` columns, exact duplicate rows are
        dropped and only rows whose ``is_self`` is True are kept. Any other
        value skips that filtering.
    day_window : int, default 30
        Size, in days, of the step between successive ``after`` offsets.
    n : int, default 5
        Number of requests to send.

    Returns
    -------
    pandas.DataFrame
        The collected rows plus a ``timestamp`` column holding the UTC
        calendar date (``datetime.date``) of ``created_utc``. Row labels are
        those of the individual responses, so they may repeat.

    Raises
    ------
    AssertionError
        If any response has a status code other than 200.
    ValueError
        If ``n`` is smaller than 1 (``pd.concat`` has nothing to combine).
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    # Seconds to wait on the server before giving up; without a timeout a
    # stalled connection would block the notebook forever.
    REQUEST_TIMEOUT = 30

    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}"
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500"

    # one DataFrame per response, concatenated once after the loop
    posts = []

    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL, timeout=REQUEST_TIMEOUT)
        # Explicit raise instead of a bare `assert`: it survives `python -O`
        # and reports which request failed. The exception type is unchanged.
        if response.status_code != 200:
            raise AssertionError(
                f"Pushshift returned HTTP {response.status_code} for {URL}"
            )
        posts.append(pd.DataFrame.from_dict(response.json()['data']))
        # pause between requests to go easy on the API
        time.sleep(2)

    full = pd.concat(posts, sort=False)

    if kind == "submission":
        # keep the desired columns and drop exact duplicate rows
        full = full[SUBFIELDS].drop_duplicates()
        # keep self (text) posts only
        full = full.loc[full['is_self'] == True]

    # Work on an independent copy so adding a column never writes into a
    # slice of another frame (avoids SettingWithCopyWarning).
    full = full.copy()

    # `created_utc` is epoch seconds in UTC; convert in UTC so the resulting
    # date does not depend on the timezone of the machine running the notebook.
    full['timestamp'] = full["created_utc"].map(
        lambda ts: dt.datetime.fromtimestamp(ts, tz=dt.timezone.utc).date()
    )

    print("Query Complete!")
    return full

# Function created by Mahdi Shadkam-Farrokhi

### OCD Reddit Data 

In [7]:
# Gather r/OCD submissions with the function defaults
# (five requests at 30-day offsets, up to 500 posts requested each time)
ocd_results = query_pushshift("OCD")

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=OCD&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=OCD&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=OCD&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=OCD&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=OCD&size=500&after=150d
Query Complete!


In [8]:
# Checking how many observations (rows) and columns we have for r/OCD
ocd_results.shape

(2238, 9)

In [9]:
# Preview the first rows of the r/OCD data
ocd_results.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,Anyone else kinda grateful for this downtime?,I've been struggling with past event OCD again...,OCD,1585173674,J_Deedubze_W,6,1,True,2020-03-25
1,Guilt OCD and the coronavirus,Anyone dealing with anxiety/guilt spirals that...,OCD,1585173980,StudBoi69,3,1,True,2020-03-25
2,"Constant fear of my rectum ""leaking"" (anyone e...","It feels weird to put it into words, but a big...",OCD,1585174743,Throwaway850944,1,1,True,2020-03-25
4,"I thought I could handle having sex, but I app...","I thought I was getting better. You see, I’m a...",OCD,1585177666,electr0_mel0n,1,1,True,2020-03-25
5,Anyone currently procrastinating?,I’m having a bad day but trying to have a good...,OCD,1585177686,soll_lluna,2,1,True,2020-03-25


### ASD Reddit Data

In [10]:
# Gather r/autism submissions with the same default settings used for r/OCD
asd_results = query_pushshift("autism")

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=autism&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=autism&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=autism&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=autism&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=autism&size=500&after=150d
Query Complete!


In [11]:
# Checking how many observations (rows) and columns we have for r/autism
asd_results.shape

(1723, 9)

In [12]:
# Preview the first rows of the r/autism data
asd_results.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,What did you said to my friends,[removed],autism,1585173143,Hansfredian,0,1,True,2020-03-25
1,Rip my sanity during 9 more days of isolation.,Yesterday my sister and I started a 10 day iso...,autism,1585173211,caffeinatedpixie,2,1,True,2020-03-25
2,Researching Society's views on Autism for a sc...,[removed],autism,1585173626,Arya_Park,0,1,True,2020-03-25
4,The boy who wrote threatening message about hi...,[removed],autism,1585174148,Hansfredian,0,1,True,2020-03-25
9,"Coronavirus (I know, I’m sick of hearing about...",I know it’s all people are hearing about right...,autism,1585180708,aestheticautistic,5,1,True,2020-03-25


### Combining and Saving Data

In [13]:
# Stack the two subreddits into a single frame. Both inputs carry row labels
# that start again at 0, so renumber the rows to keep every label unique.
data = pd.concat([ocd_results, asd_results], ignore_index = True)

In [14]:
# Row and column count of the combined data
data.shape

(3961, 9)

In [17]:
# Save the combined posts; index = False keeps the row labels out of the file
data.to_csv("../data/data.csv", index = False)

In [16]:
# Inspect the dtype of each column in the combined data
data.dtypes

title           object
selftext        object
subreddit       object
created_utc      int64
author          object
num_comments     int64
score            int64
is_self           bool
timestamp       object
dtype: object

## References

[Autism Reddit](https://www.reddit.com/r/autism/)  
[OCD Reddit](https://www.reddit.com/r/OCD/)