# Goal of this notebook:
    
* Use the Pushshift API to collect submissions from two subreddits: **travel** and **makeup**
* Save the collected data into CSV files

In [1]:
import requests
import pandas as pd
import time

# Data Scraping
We will create a Python function that calls the Pushshift API to collect submission data from Reddit.

In [2]:
def scrap_reddit(subreddit, n_pages=15, page_size=100, pause=10):
    """Download submissions of a subreddit through Pushshift and save them as CSV.

    Pages are requested one after another. After every non-empty page the
    cumulative result is written to ``<subreddit>.csv`` in the working
    directory, so the file always contains everything collected so far.
    Collection stops early when the API returns an empty page.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to collect; also used as the CSV file stem.
    n_pages : int, default 15
        Maximum number of requests (pages) to send.
    page_size : int, default 100
        Value sent as the ``size`` request parameter.
    pause : float, default 10
        Seconds to wait between two consecutive requests.

    Returns
    -------
    None

    Raises
    ------
    requests.RequestException
        If a request fails, times out or answers with an HTTP error status.
    KeyError
        If the response has no ``data`` entry or a page has no
        ``created_utc`` column.
    """
    # base url that the http requests are sent to
    url = 'https://api.pushshift.io/reddit/search/submission'
    # request payload; 'before' is moved back in time after every page
    params = {
        'subreddit': subreddit,
        'size': page_size,
        'before': None
    }

    pages = []
    for i in range(n_pages):
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status surfaces HTTP errors instead of a confusing
        # failure further down when the body is parsed.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()['data']

        if not data:
            # Nothing older is available: requesting again would only repeat
            # the same empty answer.
            print('no more submissions returned, stopping..')
            break

        page = pd.DataFrame(data)
        pages.append(page)

        # Checkpoint everything collected so far (index kept in the file, as
        # readers of the CSV may rely on the existing layout).
        pd.concat(pages, ignore_index=True).to_csv(f'{subreddit}.csv')
        print(f'Iterations {i+1} completed..')

        # Continue from the oldest submission of this page. Using the minimum
        # works for pages of any length, unlike a fixed row position.
        # NOTE(review): assumes 'before' limits results to submissions older
        # than the given epoch timestamp - confirm against the Pushshift docs.
        params['before'] = int(page['created_utc'].min())

        # No need to wait once the last page has been fetched.
        if i + 1 < n_pages:
            time.sleep(pause)


# Scraping Travel Subreddit

In [3]:
# Collect r/travel submissions; scrap_reddit (defined above) saves them to travel.csv
scrap_reddit('travel')

Iterations 1 completed..
Iterations 2 completed..
Iterations 3 completed..
Iterations 4 completed..
Iterations 5 completed..
Iterations 6 completed..
Iterations 7 completed..
Iterations 8 completed..
Iterations 9 completed..
Iterations 10 completed..
Iterations 11 completed..
Iterations 12 completed..
Iterations 13 completed..
Iterations 14 completed..
Iterations 15 completed..


# Scraping Makeup Subreddit

In [4]:
# Collect r/makeup submissions; scrap_reddit (defined above) saves them to makeup.csv
scrap_reddit('makeup')

Iterations 1 completed..
Iterations 2 completed..
Iterations 3 completed..
Iterations 4 completed..
Iterations 5 completed..
Iterations 6 completed..
Iterations 7 completed..
Iterations 8 completed..
Iterations 9 completed..
Iterations 10 completed..
Iterations 11 completed..
Iterations 12 completed..
Iterations 13 completed..
Iterations 14 completed..
Iterations 15 completed..


## Checking the data we have scraped

In [5]:
# Read the CSV files produced by the scraping cells back into dataframes
makeup_df = pd.read_csv("makeup.csv")
travel_df = pd.read_csv("travel.csv")

### Travel subreddit

In [6]:
# List every column of the travel data with its non-null count and dtype
travel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 86 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     1500 non-null   int64  
 1   all_awardings                  1500 non-null   object 
 2   allow_live_comments            1500 non-null   bool   
 3   author                         1500 non-null   object 
 4   author_flair_css_class         27 non-null     object 
 5   author_flair_richtext          1492 non-null   object 
 6   author_flair_text              27 non-null     object 
 7   author_flair_type              1492 non-null   object 
 8   author_fullname                1492 non-null   object 
 9   author_is_blocked              1500 non-null   bool   
 10  author_patreon_flair           1492 non-null   object 
 11  author_premium                 1492 non-null   object 
 12  awarders                       1500 non-null   o

In [7]:
# Report the dimensions of the travel dataframe
travel_rows, travel_cols = travel_df.shape
print(f"we have exactly {travel_rows} rows and {travel_cols} column in our dataframe")

we have exactly 1500 rows and 86 column in our dataframe


### Makeup subreddit

In [8]:
# List every column of the makeup data with its non-null count and dtype
makeup_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 73 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   1500 non-null   int64  
 1   all_awardings                1500 non-null   object 
 2   allow_live_comments          1500 non-null   bool   
 3   author                       1500 non-null   object 
 4   author_flair_css_class       0 non-null      float64
 5   author_flair_richtext        1500 non-null   object 
 6   author_flair_text            0 non-null      float64
 7   author_flair_type            1500 non-null   object 
 8   author_fullname              1500 non-null   object 
 9   author_is_blocked            1500 non-null   bool   
 10  author_patreon_flair         1500 non-null   bool   
 11  author_premium               1500 non-null   bool   
 12  awarders                     1500 non-null   object 
 13  can_mod_post      

In [9]:
# Report the dimensions of the makeup dataframe
makeup_rows, makeup_cols = makeup_df.shape
print(f"we have exactly {makeup_rows} rows and {makeup_cols} column in our dataframe")

we have exactly 1500 rows and 73 column in our dataframe
