# Lockdown baking - part 1

A project for scraping and analysing data from reddit (r/Sourdough) to explore baking trends during 2020.

Part 1: Webscraping

- setting up reddit API account
- creating functions for extracting data using [praw](https://praw.readthedocs.io/en/latest/code_overview/models/submission.html) and [pushshift.io](https://pushshift.io/api-parameters/)
- storing result in a csv file 

# Setup

In [1]:
## set root folder
# Move the working directory one level up (presumably from the notebook folder
# to the project root -- confirm against the repository layout).
# NOTE: the path is relative, so every re-run of this cell moves up one more level.
import os 
os.chdir("..")

In [2]:
import pandas as pd
import math
import numpy as np
import configparser #to read config file
#import pickle #to store objects

# libraries to work with pushshift
import requests
import json

#library to work with praw
import praw

#import itertools

import time
from datetime import datetime, timedelta

# Access pushshift.io

Functions and approach based on this article: https://rareloot.medium.com/using-pushshifts-api-to-extract-reddit-submissions-fb517b286563

More resources:

- Pushshift documentation: https://github.com/pushshift/api
- Pushshift data results example: https://api.pushshift.io/reddit/search/submission/?q=screenshot&after=1514764800&before=1517443200&subreddit=PS4


Changed the script slightly:

- Set `size` in the Pushshift request to 100 (that seems to be the limit per request now)
- Slow the code down by sleeping between requests, to avoid server timeouts caused by sending too many requests
- Store the variables in a dictionary
- Add more variables to scrape

## Scrape data

In [37]:
import pandas as pd
import requests
import json
import math

def getPushshiftData(start_at, end_at, subreddit, max_retries=5, wait_seconds=60, timeout=30):
    """Fetch one page (at most 100) of submissions from the Pushshift search API.

    Args:
        start_at: lower bound (epoch seconds) sent as the `after` parameter.
        end_at: upper bound (epoch seconds) sent as the `before` parameter.
        subreddit: name of the subreddit to search.
        max_retries: how many times to retry after a failed attempt.
        wait_seconds: pause between attempts, in seconds.
        timeout: per-request timeout in seconds, so a stalled connection
            cannot hang the scrape forever.

    Returns:
        The list stored under the "data" key of the JSON response.

    Raises:
        RuntimeError: if no attempt returned HTTP status 200.
    """
    url = 'https://api.pushshift.io/reddit/search/submission'
    # let requests build and encode the query string
    params = {'size': 100, 'after': start_at, 'before': end_at, 'subreddit': subreddit}

    attempts = max_retries + 1
    for attempt in range(1, attempts + 1):
        try:
            r = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as err:
            # network problem or timeout: treat it like a failed attempt and retry
            print('request failed:', err)
        else:
            print('server status:', r.status_code)
            # only parse the body when the page loaded; an error page is not valid JSON
            if r.status_code == 200:
                return r.json()['data']

        # page not able to load: wait and try again (unless this was the last attempt)
        if attempt < attempts:
            print("sleep", wait_seconds)
            time.sleep(wait_seconds)

    raise RuntimeError(
        'Pushshift request failed after {} attempts (after={}, before={}, subreddit={})'.format(
            attempts, start_at, end_at, subreddit))
        
# The search window is defined in UTC to match the `created_utc` field of the
# submissions. Calling .timestamp() on a naive datetime would interpret it in the
# local timezone of the machine, shifting the window depending on where this runs.
from datetime import timezone

# fields copied from every Pushshift submission record (also the csv column order)
post_fields = ["id", "score", "created_utc", "title", "num_comments", "can_mod_post", "author"]

#dictionary to store values in: one list per field
post_dict = {field: [] for field in post_fields}

#define search parameters
subreddit='Sourdough'
start_at = str(math.ceil(datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc).timestamp()))
end_at = str(math.floor(datetime(2020, 12, 31, 23, 59, 59, tzinfo=timezone.utc).timestamp()))

##Run with test dates before, to check if working before attempting the whole data request
#start_at = str(math.ceil(datetime(2020, 12, 1, 0, 0, 0, tzinfo=timezone.utc).timestamp()))
#end_at = str(math.floor(datetime(2020, 12, 5, 23, 59, 59, tzinfo=timezone.utc).timestamp()))

#retrieve data given the parameters
data = getPushshiftData(start_at, end_at, subreddit)

# Will run until all posts have been gathered from the 'start_at' date until the 'end_at' date
while data:
    for submission in data:
        # read the whole row first, so a record with a missing field cannot
        # leave the columns with different lengths
        row = [submission[field] for field in post_fields]
        for field, value in zip(post_fields, row):
            post_dict[field].append(value)

    # The next request starts at the created date of the last submission of this page
    last_created = data[-1]['created_utc']
    print('start again at:', last_created)
    print('data loaded:', len(post_dict["title"]))
    # pause between requests to avoid overloading the server
    time.sleep(15)
    data = getPushshiftData(subreddit=subreddit, start_at=last_created, end_at=end_at)

server status:  200
start at 1578019458
data loaded 100
server status:  200
start at 1578214580
data loaded 200
server status:  200
start at 1578345963
data loaded 300
server status:  200
start at 1578522072
data loaded 400
server status:  200
start at 1578758697
data loaded 500
server status:  200
start at 1578858068
data loaded 600
server status:  200
start at 1579020215
data loaded 700
server status:  200
start at 1579224546
data loaded 800
server status:  200
start at 1579427066
data loaded 900
server status:  200
start at 1579530199
data loaded 1000
server status:  200
start at 1579711714
data loaded 1100
server status:  200
start at 1579891203
data loaded 1200
server status:  200
start at 1580049402
data loaded 1300
server status:  200
start at 1580153640
data loaded 1400
server status:  200
start at 1580345935
data loaded 1500
server status:  200
start at 1580552067
data loaded 1600
server status:  200
start at 1580670981
data loaded 1700
server status:  200
start at 1580795892


server status:  200
start at 1588385712
data loaded 14400
server status:  200
start at 1588421377
data loaded 14500
server status:  200
start at 1588442938
data loaded 14600
server status:  200
start at 1588466852
data loaded 14700
server status:  200
start at 1588514375
data loaded 14800
server status:  200
start at 1588531884
data loaded 14900
server status:  200
start at 1588549451
data loaded 15000
server status:  200
start at 1588589781
data loaded 15100
server status:  200
start at 1588616731
data loaded 15200
server status:  200
start at 1588658945
data loaded 15300
server status:  200
start at 1588702096
data loaded 15400
server status:  200
start at 1588735116
data loaded 15500
server status:  200
start at 1588777746
data loaded 15600
server status:  200
start at 1588810116
data loaded 15700
server status:  200
start at 1588867719
data loaded 15800
server status:  200
start at 1588894841
data loaded 15900
server status:  200
start at 1588942318
data loaded 16000
server status:

server status:  200
start at 1595455138
data loaded 28500
server status:  200
start at 1595532816
data loaded 28600
server status:  200
start at 1595623613
data loaded 28700
server status:  200
start at 1595708269
data loaded 28800
server status:  200
start at 1595782311
data loaded 28900
server status:  200
start at 1595854523
data loaded 29000
server status:  200
start at 1595944501
data loaded 29100
server status:  200
start at 1596029306
data loaded 29200
server status:  200
start at 1596123589
data loaded 29300
server status:  200
start at 1596217891
data loaded 29400
server status:  200
start at 1596315346
data loaded 29500
server status:  200
start at 1596389925
data loaded 29600
server status:  200
start at 1596471961
data loaded 29700
server status:  200
start at 1596561917
data loaded 29800
server status:  200
start at 1596650500
data loaded 29900
server status:  200
start at 1596758773
data loaded 30000
server status:  200
start at 1596849552
data loaded 30100
server status:

## Save result to file

In [52]:
# build a dataframe with one column per scraped field
post_df = pd.DataFrame(data=post_dict)

# write the dataframe (including its index) to a csv file in the working directory
post_df.to_csv(path_or_buf="post_df_2021-01-07.csv")

# preview the first ten rows
post_df.head(10)

Unnamed: 0,id,score,created_utc,title,num_comments,can_mod_post,author
0,eibhvl,1,1577839131,"First attempt at a starter, really hope I mana...",5,False,coentertainer
1,eibvur,1,1577841129,Skillet &amp; Dutch Oven Sourdough in the rain...,0,False,Richness69
2,eiby7m,1,1577841483,My last bread of 2019. I used Brad and Claire’...,0,False,canioli019
3,eictkk,1,1577846281,I started baking in September and I have never...,0,False,singular-chip
4,eidmqm,1,1577851082,Sourdough Books,3,False,TheNightBaker97
5,eidtic,1,1577852213,Analyzing sourdough?,1,False,amisanyal
6,eidxpd,1,1577852956,Ginger tumeric loaf to guide me out of the decade,2,False,bleuxballs
7,eidyxu,1,1577853173,Behold Bread Majors. He will incite the Rocky ...,3,False,ClandestineOni
8,eifrvq,1,1577864698,Last loaves of the year.,0,False,gorpz
9,eigw2g,1,1577873535,Wheat flour starter vs rye starter,9,False,bacafreak


## Check pushshift data

In [7]:
## Load data if notebook closed
# The path is built with os.path.join instead of a hard-coded Windows string:
# the previous literal contained the invalid escape sequences "\s" and "\p"
# and its backslash separators only resolve on Windows.
post_df = pd.read_csv(os.path.join("data", "scraped", "post_df_2021-01-07.csv"))

In [8]:
# explore dataset: summary statistics (count, mean, std, quartiles, max) of the score column
post_df["score"].describe()

count    41309.000000
mean         3.647002
std         33.900748
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max       2605.000000
Name: score, dtype: float64

**Issue detected:** the maximum score is reported as 2,605. However, some submissions posted in summer 2020 have scores of over 4k on Reddit.

**Investigation:**

After checking again, it seems like the pushshift API is incorrect for some score data. Some scores match what is shown on Reddit, while others don't match and show only 1.

To fix this, I'll have to use praw to pull the correct score by submission id directly through the Reddit API.

# Get correct data through reddit using praw

## Setting up reddit API connection

In [3]:
# retrieve details from config file
def get_config_values(config_file, section):
    """Read the Reddit API credentials from one section of a config file.

    Args:
        config_file: path of the config file to parse.
        section: name of the section that holds the credentials.

    Returns:
        A dict with the keys "username", "password", "user_agent",
        "client_id" and "client_secret", each mapped to the string stored
        under that option in the given section.
    """
    parser = configparser.ConfigParser()
    parser.read(config_file)

    # options to look up, in the order they appear in the returned dict
    wanted_options = ("username", "password", "user_agent", "client_id", "client_secret")
    return {option: parser.get(section, option) for option in wanted_options}

# load the credentials from the [reddit-config] section of reddit-config.cfg
# (the file path is relative to the current working directory)
details = get_config_values("reddit-config.cfg", "reddit-config")

In [4]:
# open the praw Reddit connection with the credentials loaded from the config file
credential_keys = ("client_id", "client_secret", "user_agent", "username", "password")
reddit = praw.Reddit(**{key: details[key] for key in credential_keys})

# check whether the instance is an authorised one: prints False when it is not read-only
print(reddit.read_only)

False


## Test connection

In [5]:
# connection test: fetch the single top submission of the subreddit "sourdough"
# and print its key fields
subreddit = reddit.subreddit('sourdough')

# (label, attribute) pairs in print order: title, net upvotes, id, url,
# creation time in unix time, and number of comments
fields_to_show = [
    ("Title: ", "title"),
    ("Score: ", "score"),
    ("ID: ", "id"),
    ("URL: ", "url"),
    ("Created: ", "created_utc"),
    ("Number of comments: ", "num_comments"),
]

for submission in subreddit.top(limit=1):
    for label, attribute in fields_to_show:
        print(label, getattr(submission, attribute))

Title:  Here’s another video of me shaping sourdough. I added some music this time because baking is rock ’n roll.
Score:  4430
ID:  glzuwy
URL:  https://v.redd.it/t8jaoor0giz41
Created:  1589801997.0
Number of comments:  214


## Pull data

In [10]:
# preview the first ten submission ids of the Pushshift dataset
post_df["id"][:10]

0    eibhvl
1    eibvur
2    eiby7m
3    eictkk
4    eidmqm
5    eidtic
6    eidxpd
7    eidyxu
8    eifrvq
9    eigw2g
Name: id, dtype: object

In [11]:
# Create one praw Submission object per scraped id; the rest of the data
# is then read from these objects through praw
posts_from_reddit = [reddit.submission(id=post_id) for post_id in post_df["id"]]

# sanity check: number of objects created and a preview of the first ten
print(len(posts_from_reddit))
print(posts_from_reddit[:10])

41309
[Submission(id='eibhvl'), Submission(id='eibvur'), Submission(id='eiby7m'), Submission(id='eictkk'), Submission(id='eidmqm'), Submission(id='eidtic'), Submission(id='eidxpd'), Submission(id='eidyxu'), Submission(id='eifrvq'), Submission(id='eigw2g')]


In [43]:
# pull score and permalink through praw
praw_data = {"score": [],
             "permalink": [],
             "id": []}

# print a checkpoint every 500 submissions to see the progress of our request,
# as it will take quite a while (a modulo test replaces scanning an array of
# checkpoint values on every iteration)
checkpoint_every = 500

for submission in posts_from_reddit:
    # read all three attributes before storing any of them, so a request that
    # fails part-way cannot leave the three lists with different lengths
    score, post_id, permalink = submission.score, submission.id, submission.permalink
    praw_data["score"].append(score)
    praw_data["id"].append(post_id)
    praw_data["permalink"].append(permalink)

    fetched = len(praw_data["score"])
    if fetched % checkpoint_every == 0:
        print(fetched)

#display result
print(len(praw_data["score"]))
print(praw_data["score"][:10])
print(praw_data["id"][:10])
print(praw_data["permalink"][:10])

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000


ServerError: received 503 HTTP response

In [44]:
# number of scores collected before the pull stopped
len(praw_data["score"])

15100

In [46]:
# display the collected data (shows the full dictionary)
praw_data

{'score': [11,
  12,
  11,
  1,
  5,
  1,
  21,
  39,
  1,
  2,
  8,
  280,
  32,
  8,
  97,
  37,
  21,
  36,
  133,
  47,
  9,
  2,
  15,
  9,
  8,
  12,
  346,
  10,
  54,
  5,
  60,
  9,
  26,
  24,
  1,
  97,
  3,
  1,
  100,
  62,
  8,
  20,
  16,
  6,
  2,
  11,
  10,
  1,
  1,
  91,
  29,
  58,
  20,
  31,
  376,
  81,
  8,
  36,
  29,
  23,
  35,
  6,
  10,
  8,
  34,
  0,
  44,
  2,
  1,
  2,
  15,
  0,
  3,
  82,
  2,
  1,
  298,
  2,
  183,
  6,
  113,
  51,
  13,
  2,
  1,
  3,
  29,
  22,
  86,
  3,
  1,
  44,
  3,
  1,
  3,
  79,
  36,
  93,
  4,
  35,
  1,
  5,
  177,
  2,
  1,
  3,
  50,
  52,
  2,
  3,
  1,
  65,
  200,
  846,
  14,
  34,
  1,
  44,
  108,
  4,
  5,
  7,
  23,
  10,
  12,
  4,
  15,
  5,
  8,
  2,
  11,
  275,
  5,
  66,
  2,
  3,
  3,
  5,
  209,
  12,
  4,
  35,
  38,
  5,
  2,
  30,
  45,
  3,
  513,
  30,
  6,
  13,
  74,
  33,
  8,
  42,
  12,
  380,
  1,
  45,
  40,
  5,
  49,
  4,
  2,
  19,
  2,
  5,
  2,
  2,
  1,
  147,
  3,
  5,
  2,
  4,
 

In [48]:
# turn the data pulled through praw into a dataframe and store it as a csv file
praw_df = pd.DataFrame(data=praw_data)
praw_df.to_csv(path_or_buf=".//data/scraped/praw_df_15000_2021-01-10.csv")