# Data Retrieval & Cleaning
### This notebook downloads Reddit post titles, cleans them, and saves the result to a CSV file.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests

from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [2]:
# Pushshift submission-search endpoint; read by get_df for every request.
url = 'https://api.pushshift.io/reddit/search/submission'

In [4]:
# Pushshift returns at most 100 posts per request,
# so a loop is needed to collect more data.
# First, the 2-year timespan is divided into 25 equal time intervals.
# Then, get_df pulls the 100 most-commented posts from each interval.
def get_df(subreddit,after,before,timeout=30):
    """Fetch up to 100 of the most-commented submissions of one subreddit.

    The request is sent to the endpoint stored in the notebook-level
    ``url`` variable.

    Parameters
    ----------
    subreddit : str
        Subreddit name, sent as the 'subreddit' query parameter.
    after, before : str
        Bounds of the time window, sent unchanged as the 'after' and
        'before' query parameters (the notebook passes day offsets
        such as '30d').
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    pandas.DataFrame
        One row per returned post with the columns 'subreddit' and
        'title'. When the response contains no posts the frame is empty
        but still has both columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    params = {
        'subreddit': subreddit,
        'size': 100,
        'after': after,
        'before': before,
        'sort_type': 'num_comments',
        'sort': 'desc',
    }

    res = requests.get(url, params=params, timeout=timeout)
    # Surface the HTTP status of a failed request instead of letting the
    # JSON parser choke on an error page.
    res.raise_for_status()
    posts = res.json()['data']

    # Passing `columns` keeps only the two fields of interest and still
    # yields a well-formed (empty) frame when no posts come back.
    return pd.DataFrame(posts, columns=['subreddit','title'])

n = 25 # number of requests per subreddit (up to n*100 posts each)
# Day offsets that split the last 730 days into n equal windows.
day = np.linspace(0,730,n+1,dtype=int)

# One frame per (subreddit, time window). Kept under its own name so that
# df_all only ever refers to the final DataFrame.
frames = [
    get_df(name, f'{day[i+1]}d', f'{day[i]}d')
    for name in ['News','TheOnion']
    for i in range(n)
]

# Join all the frames together and turn the subreddit name into the label.
df_all = (
    pd.concat(frames, axis=0)
    .reset_index(drop=True)
    .rename(columns={'subreddit':'real'})
)
# Compare names case-insensitively: the request uses 'News' while the label
# table uses 'news', and a casing mismatch would silently produce NaN labels.
df_all['real'] = df_all['real'].astype(str).str.lower().map({'news':1,'theonion':0})
assert df_all['real'].notna().all(), 'found posts from an unexpected subreddit'

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Sanity check: 2 columns ('real', 'title') and up to n*100 rows per subreddit.
df_all.shape


## Data Cleaning
This step converts the text to lower case, removes punctuation and other unwanted symbols, and removes stop words.

In [None]:
# Inspiration: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
# punctuation remove code source: https://www.codegrepper.com/code-examples/python/replace+punctuation+in+string+python

# Characters stripped from the text. The backslash is escaped explicitly:
# a bare "\," is an invalid escape sequence and triggers a warning on
# recent Python versions.
_PUNCTUATION = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION)

# The NLTK English stop words are loaded once, on first use, instead of
# being re-read for every row that goes through text_clean.
_STOP_WORDS_CACHE = {}


def text_clean(text, stop_words=None):
    """Lowercase a string, strip punctuation and drop stop words.

    Parameters
    ----------
    text : str
        Raw text, e.g. a post title.
    stop_words : collection of str, optional
        Words to drop after punctuation removal. Defaults to the NLTK
        English stop-word list.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        if 'english' not in _STOP_WORDS_CACHE:
            _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOP_WORDS_CACHE['english']

    # Lowercase first, then delete every punctuation character in one pass.
    stripped = text.lower().translate(_PUNCTUATION_TABLE)

    # split() collapses any run of whitespace, so the result is
    # single-space separated.
    return ' '.join(word for word in stripped.split() if word not in stop_words)

In [None]:
# Replace every title with its cleaned form (text_clean runs once per row).
df_all['title'] = df_all['title'].apply(text_clean)

In [None]:
# Show the cleaned frame for a quick visual check before saving.
df_all

In [None]:
# to_csv does not create missing folders, so make sure ./data exists first.
Path('./data').mkdir(parents=True, exist_ok=True)
df_all.to_csv('./data/reddit_df.csv',index=False)