# Subreddit Data Collection

I ran this notebook as many times as possible in order to scrape the maximum number of posts from each subreddit.  Reddit only lets you scrape each subreddit once every 24 hours.  I found a way around this by scraping each subreddit's newest posts, "top" posts, and "hottest" posts.

In [1]:
import os
import time

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [31]:
## based HEAVILY off of Riley Dallas' code, provided in the youtube Project 3 info session video from May 2018
## https://www.youtube.com/watch?v=5Y3ZE26Ciuk
## ALSO based on Wesley Bosse and Douglas Strodtman's 'CART - Mini-demo using reddit data' DC-Flex lesson 

# make a function that scrapes a subreddit's .json listing for post data
def get_posts(subreddit, pages=40, listing='hot'):
    """Scrape posts from one of a subreddit's .json listings.

    Requests up to ``pages`` consecutive pages of
    ``https://www.reddit.com/r/<subreddit>/<listing>.json?t=all``, passing the
    ``after`` value returned with each page on to the next request, and
    collects the ``data`` dict of every child in each response.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit as it appears in the url, e.g. ``'Cheese'``.
    pages : int, default 40
        Maximum number of pages to request (a range of 40 gets about 1000
        posts).
    listing : str, default 'hot'
        Which listing to scrape, e.g. ``'new'``, ``'top'`` or ``'hot'``.
        Switching between listings makes it possible to scrape more data in
        one day.

    Returns
    -------
    list of dict
        The ``data`` dict of every post collected.  If a request returns a
        status code other than 200, 'ERROR' and the code are printed and the
        posts gathered so far are returned.

    Notes
    -----
    Exceptions raised by ``requests.get`` (including a timeout) are not
    caught here.
    """
    posts = []    # all of the posts collected so far
    url = f'https://www.reddit.com/r/{subreddit}/{listing}.json?t=all'
    after = None  # no 'after' value until the first page has been scraped

    for i in range(pages):
        # first request uses the plain url; later ones continue after the
        # last 'after' value so the same posts are not scraped again
        if after is None:
            current_url = url
        else:
            current_url = url + '&after=' + after

        print(current_url)  # print to make sure url is being updated each scrape
        print(i)            # print how many times the loop has run

        # custom user agent in order to avoid 429 error (too many requests);
        # the timeout keeps a stalled connection from hanging forever
        res = requests.get(
            current_url,
            headers={'User-agent': 'MichaelKnight4714'},
            timeout=30,
        )

        # 429 is error - 200 is no errors
        if res.status_code != 200:
            print('ERROR')
            print(res.status_code)
            break

        the_json = res.json()
        posts.extend(p['data'] for p in the_json['data']['children'])
        after = the_json['data']['after']

        # no 'after' value means there is nothing left to page through; going
        # round again would request the first page and collect duplicates
        if after is None:
            break

        # sleep for 1 second between requests so as not to appear to be a
        # DDoS attack to Reddit servers
        time.sleep(1)

    return posts

In [3]:
cheez_posts = get_posts('Cheese')

https://www.reddit.com/r/Cheese/new.json?t=all
0
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_ca7apm
1
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_c84wi5
2
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_c6qcow
3
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_c4ogf3
4
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_c36afc
5
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_c210jr
6
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_c0bo26
7
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_byemdr
8
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_bw6otn
9
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_bulb1p
10
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_bt0lfw
11
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_brl3mc
12
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_bqeen6
13
https://www.reddit.com/r/Cheese/new.json?t=all&after=t3_bnzrq6
14
https://www.reddit.com/r/Cheese/new.

In [4]:
weed_posts = get_posts('weed')

https://www.reddit.com/r/weed/new.json?t=all
0
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbx6xc
1
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbvpig
2
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbt6bp
3
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbqyii
4
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbovmn
5
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbn1ed
6
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbltsp
7
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbjnvi
8
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbgnx7
9
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbd4qt
10
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cbbnht
11
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cb9obq
12
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cb7srh
13
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cb5zwo
14
https://www.reddit.com/r/weed/new.json?t=all&after=t3_cb3e5v
15
ht

In [5]:
len(cheez_posts)

981

In [6]:
len(weed_posts)

1000

In [7]:
cheez_posts[0]['title']

'How to know what kind of exterior cover I can eat?'

In [8]:
weed_posts[0]['title']

'How every Friday night with the boys should be spent'

In [9]:
# how many distinct posts were scraped (counted by each post's 'name' value)
len(set(t['name'] for t in cheez_posts))

981

In [10]:
# how many distinct posts were scraped (counted by each post's 'name' value)
len(set(t['name'] for t in weed_posts))

1000

In [11]:
cheez_posts[100]['subreddit']

'Cheese'

In [12]:
# taken from Adi Bronshtein
# build one 'title selftext' string per post for each of our subs. These are then inserted into a dataframe column called 'text'
def combine_text(posts):
    """Return one string per post: its title and selftext joined by a space."""
    combined = []
    for entry in posts:
        pieces = (entry['title'], entry['selftext'])
        combined.append(' '.join(pieces))
    return combined

In [13]:
# combine the title and selftext of every post in cheez_posts
cheese_text = combine_text(cheez_posts)

In [14]:
cheese_text

["How to know what kind of exterior cover I can eat? Sorry for the dumb question but I can't never figure out what type of exterior is edible or not... I panic a bit.",
 'Homemade Wensleydale Cheese with peach chunks, aged for 3 months. Not bad for my first cheese! ',
 'Cheese. ',
 'First time trying this and it’s amazing! ',
 'Cheese in Fort Wayne I’m looking for a cheese shop or a store that has a good variety of quality cheeses in Fort Wayne. Any help is appreciated.',
 'Chees ',
 'Cheeses from a German grocery store: Cashel Blue ',
 'A taste of home. Bowland cheese, with apple sultanas and cinnamon, and crumbly Lancashire. A treat my SO brought back from up North. ',
 "Anybody know where to get the REAL French cheese in the U.S.? You know the kind I'm talking about...that sweet, unpasteurized, young, gooey cheese that I dream about. Bonus points if it's offered either online or in Southern California.",
 'Sweet vintage wine and cheese poster spotted at an estate sale ',
 'Judging b

In [15]:
# combine the title and selftext of every post in weed_posts
weed_text = combine_text(weed_posts)

In [16]:
# put the combined cheese text into a 'text' column and label every row cheese = 1
new_chz_df = pd.DataFrame(cheese_text, columns=['text']).assign(cheese=1)

In [17]:
new_chz_df.head()

Unnamed: 0,text,cheese
0,How to know what kind of exterior cover I can ...,1
1,"Homemade Wensleydale Cheese with peach chunks,...",1
2,Cheese.,1
3,First time trying this and it’s amazing!,1
4,Cheese in Fort Wayne I’m looking for a cheese ...,1


In [18]:
# put the combined weed text into a 'text' column and label every row cheese = 0
weed_df = pd.DataFrame(weed_text, columns=['text']).assign(cheese=0)

In [19]:
weed_df.head()

Unnamed: 0,text,cheese
0,How every Friday night with the boys should be...,0
1,15.2 grams in the RAWKET🚀,0
2,Any experiences with Prozac and weed? My roomm...,0
3,"I saw this photo today , and i thought it look...",0
4,Weed good alcohol less good,0


In [21]:
cheese_posts2 = get_posts('Cheese')

https://www.reddit.com/r/Cheese/top.json?t=all
0
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_6x9xu4
1
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_9eb020
2
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_a08cwk
3
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_8kln8g
4
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_6fxo5u
5
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_9fuzpx
6
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_araqbg
7
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_8owuhh
8
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_btl5gv
9
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_8t7r65
10
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_byemdr
11
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_70ssrw
12
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_aon9xt
13
https://www.reddit.com/r/Cheese/top.json?t=all&after=t3_capipk
14
https://www.reddit.com/r/Cheese/top.

In [22]:
weed_posts2 = get_posts('weed')

https://www.reddit.com/r/weed/top.json?t=all
0
https://www.reddit.com/r/weed/top.json?t=all&after=t3_bcymhh
1
https://www.reddit.com/r/weed/top.json?t=all&after=t3_afvmuq
2
https://www.reddit.com/r/weed/top.json?t=all&after=t3_bhyw44
3
https://www.reddit.com/r/weed/top.json?t=all&after=t3_asa2cf
4
https://www.reddit.com/r/weed/top.json?t=all&after=t3_avw4as
5
https://www.reddit.com/r/weed/top.json?t=all&after=t3_amd5pi
6
https://www.reddit.com/r/weed/top.json?t=all&after=t3_b5a8is
7
https://www.reddit.com/r/weed/top.json?t=all&after=t3_bcpg7l
8
https://www.reddit.com/r/weed/top.json?t=all&after=t3_bj2tzh
9
https://www.reddit.com/r/weed/top.json?t=all&after=t3_bjhgh3
10
https://www.reddit.com/r/weed/top.json?t=all&after=t3_a8um79
11
https://www.reddit.com/r/weed/top.json?t=all&after=t3_ak99qv
12
https://www.reddit.com/r/weed/top.json?t=all&after=t3_8rphhd
13
https://www.reddit.com/r/weed/top.json?t=all&after=t3_7ffeni
14
https://www.reddit.com/r/weed/top.json?t=all&after=t3_al8ora
15
ht

In [23]:
# second cheese scrape: combine the text and label it cheese = 1
cheese_text2 = combine_text(cheese_posts2)
cheese_df2 = pd.DataFrame(cheese_text2, columns=['text']).assign(cheese=1)

# stack it under the first cheese dataframe, renumbering the index
big_cheese_df = pd.concat((new_chz_df, cheese_df2), ignore_index=True)

In [24]:
# second weed scrape: combine the text and label it cheese = 0
weed_text2 = combine_text(weed_posts2)
weed_df2 = pd.DataFrame(weed_text2, columns=['text']).assign(cheese=0)

# stack it under the first weed dataframe, renumbering the index
big_weed_df = pd.concat((weed_df, weed_df2), ignore_index=True)

In [25]:
len(big_cheese_df)

1981

In [26]:
big_cheese_df.drop_duplicates(inplace=True)

In [27]:
len(big_cheese_df)

1667

In [28]:
len(big_weed_df)

1999

In [29]:
big_weed_df.drop_duplicates(inplace=True)

In [30]:
len(big_weed_df)

1937

In [32]:
cheese_posts3 = get_posts('Cheese')

https://www.reddit.com/r/Cheese/hot.json?t=all
0
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_ca825g
1
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_c83z2s
2
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_c6zimo
3
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_c4rai1
4
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_c398p2
5
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_c216jk
6
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_c06d0t
7
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_bylhug
8
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_bw0k12
9
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_bugzna
10
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_bt8myc
11
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_brgryz
12
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_bqi9xd
13
https://www.reddit.com/r/Cheese/hot.json?t=all&after=t3_bnyuip
14
https://www.reddit.com/r/Cheese/hot.

In [33]:
weed_posts3 = get_posts('weed')

https://www.reddit.com/r/weed/hot.json?t=all
0
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbmfno
1
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbpuln
2
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbxbvx
3
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbsq09
4
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbods8
5
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cboznz
6
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbo41b
7
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbkhl5
8
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cba1vi
9
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbb70k
10
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbbufw
11
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbc6k6
12
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cbbyhy
13
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cb8p7m
14
https://www.reddit.com/r/weed/hot.json?t=all&after=t3_cattb2
15
ht

In [34]:
# third cheese scrape: combine the text and label it cheese = 1
cheese_text3 = combine_text(cheese_posts3)
cheese_df3 = pd.DataFrame(cheese_text3, columns=['text']).assign(cheese=1)

# stack it under the de-duplicated cheese dataframe, renumbering the index
bigger_cheese_df = pd.concat((big_cheese_df, cheese_df3), ignore_index=True)

In [35]:
# third weed scrape: combine the text and label it cheese = 0
weed_text3 = combine_text(weed_posts3)
weed_df3 = pd.DataFrame(weed_text3, columns=['text']).assign(cheese=0)

# stack it under the de-duplicated weed dataframe, renumbering the index
bigger_weed_df = pd.concat((big_weed_df, weed_df3), ignore_index=True)

In [36]:
len(bigger_cheese_df)

2667

In [37]:
bigger_cheese_df.drop_duplicates(inplace=True)

In [38]:
len(bigger_cheese_df)

1679

In [39]:
len(bigger_weed_df)

2933

In [40]:
bigger_weed_df.drop_duplicates(inplace=True)

In [41]:
len(bigger_weed_df)

1939

In [42]:
#read old_cheese_posts.csv into 'old_cheese'
old_cheese = pd.read_csv('./datasets/old_cheese_posts.csv')

In [59]:
old_cheese.head()

Unnamed: 0,text,cheese
0,"FAQ: What is cheese, anyway?",1
1,Cheese plate I made yesterday for a friend and I,1
2,A selection of goat cheeses this evening with ...,1
3,delice de poitou,1
4,Cheese “dessert” plate: whipped chèvre in chou...,1


In [44]:
biggest_cheese_df = pd.concat([old_cheese, bigger_cheese_df], ignore_index=True)

In [45]:
len(biggest_cheese_df)

2668

In [46]:
biggest_cheese_df.drop_duplicates(inplace=True)

In [47]:
len(biggest_cheese_df)

1693

In [48]:
# save our cheese subreddit dataframe to the file new_cheese_posts.csv
# mode='a' to append the new df to the old ones, not overwrite
# the header row is written only when the file is missing or empty, so that
# re-running this cell does not add another 'text,cheese' line mid-file
cheese_csv_path = './datasets/new_cheese_posts.csv'
cheese_needs_header = (not os.path.isfile(cheese_csv_path)
                       or os.path.getsize(cheese_csv_path) == 0)
biggest_cheese_df.to_csv(cheese_csv_path, mode='a', index=False,
                         header=cheese_needs_header)

In [49]:
# save our weed subreddit dataframe to the file new_weed_posts.csv
# mode='a' to append the new df to the old ones, not overwrite
# the header row is written only when the file is missing or empty, so that
# re-running this cell does not add another 'text,cheese' line mid-file
weed_csv_path = './datasets/new_weed_posts.csv'
weed_needs_header = (not os.path.isfile(weed_csv_path)
                     or os.path.getsize(weed_csv_path) == 0)
bigger_weed_df.to_csv(weed_csv_path, mode='a', index=False,
                      header=weed_needs_header)

In [50]:
cheese_and_weed = pd.concat([biggest_cheese_df, bigger_weed_df], ignore_index=True)

In [51]:
cheese_and_weed.head()

Unnamed: 0,text,cheese
0,"FAQ: What is cheese, anyway?",1
1,Cheese plate I made yesterday for a friend and I,1
2,A selection of goat cheeses this evening with ...,1
3,delice de poitou,1
4,Cheese “dessert” plate: whipped chèvre in chou...,1


In [52]:
cheese_and_weed.tail()

Unnamed: 0,text,cheese
3627,If smoking marijuana causes short-term memory ...,0
3628,Those damn edibles,0
3629,Allergies,0
3630,Join the official /r/weed Discord server!,0
3631,Anyone travel with weed in their checked lugga...,0


In [53]:
cheese_and_weed['cheese'].value_counts()

0    1939
1    1693
Name: cheese, dtype: int64

In [54]:
cheese_and_weed['cheese'].value_counts(normalize=True)

0    0.533866
1    0.466134
Name: cheese, dtype: float64

In [55]:
len(cheese_and_weed)

3632

In [56]:
cheese_and_weed.drop_duplicates(inplace=True)

In [57]:
len(cheese_and_weed)

3632

In [58]:
# save our 2 combined subreddits as one dataframe to the file combined.csv
# mode='a' to append the new df to the old ones, not overwrite
# the header row is written only when the file is missing or empty, so that
# re-running this cell does not add another 'text,cheese' line mid-file
combined_csv_path = './datasets/combined.csv'
combined_needs_header = (not os.path.isfile(combined_csv_path)
                         or os.path.getsize(combined_csv_path) == 0)
cheese_and_weed.to_csv(combined_csv_path, mode='a', index=False,
                       header=combined_needs_header)