In [1]:
# Import libraries
import pandas as pd
import numpy as np
import requests
import datetime as dt
import time
import regex as re
import matplotlib.pyplot as plt
import seaborn as sns

from string import ascii_uppercase
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from nltk import word_tokenize, ngrams
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.base import TransformerMixin



## Web Scraping/Data Collection

We define a function below that queries the Pushshift API and pulls Reddit content for the subreddits we pass in. We request content going back 150 days (five requests, each reaching a further 30 days back) to gather a dataset large enough to generate significant results. We also pause for 2 seconds between requests to ensure we do not overload the server.

In [2]:
# Ask the Pushshift API for Reddit content from one subreddit.
def query_pushshift(subreddit, kind='submission', skip=30, times=5, # 30-day steps, 5 requests = 150 days back
                    subfield = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments',
                                'score', 'is_self'],
                    comfields = ['body', 'score', 'created_utc'],
                    pause=2, timeout=30, session=None):
    """Download items from one subreddit via Pushshift and return them as a DataFrame.

    One request is issued per step ``i`` in ``1..times`` with ``after={skip * i}d``
    and ``size=500``; every requested URL is printed, and the shape of the final
    frame is printed before returning.

    Parameters
    ----------
    subreddit : str
        Subreddit name, inserted verbatim into the query string.
    kind : str
        Pushshift search endpoint. For ``'submission'`` the result is reduced to
        ``subfield``, de-duplicated, and filtered to rows where ``is_self`` is True.
        Any other value keeps every column the API returned.
    skip : int
        Number of days added to the ``after`` window on each request.
    times : int
        Number of requests to issue.
    subfield : list of str
        Columns kept for submissions (must include ``'created_utc'`` and ``'is_self'``).
    comfields : list of str
        Accepted for backward compatibility; not applied to the result.
    pause : float
        Seconds to wait between consecutive requests (0 disables the wait).
    timeout : float
        Per-request timeout in seconds, forwarded to ``get``.
    session : object, optional
        Anything exposing ``get(url, timeout=...)`` that returns a response with
        ``status_code`` and ``json()`` (e.g. a ``requests.Session``). Defaults to
        the ``requests`` module.

    Returns
    -------
    pandas.DataFrame
        The collected rows plus a ``timestamp`` column holding the UTC calendar
        date of ``created_utc``. When nothing was fetched, an empty frame with
        the expected columns is returned.

    Raises
    ------
    AssertionError
        If any response has a status code other than 200.
    """
    stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size=500".format(kind, subreddit)
    http = requests if session is None else session

    pages = []
    for step in range(1, times + 1):
        URL = "{}&after={}d".format(stem, skip * step)
        print(URL)
        response = http.get(URL, timeout=timeout)
        # Raised explicitly (not via `assert`) so the check survives `python -O`
        # and reports what failed, while keeping the original exception type.
        if response.status_code != 200:
            raise AssertionError(
                "Pushshift returned HTTP {} for {}".format(response.status_code, URL)
            )
        pages.append(pd.DataFrame(response.json()['data']))
        # Be polite to the server, but do not wait after the final request.
        if pause and step < times:
            time.sleep(pause)

    non_empty = [page for page in pages if not page.empty]
    if not non_empty:
        # Nothing came back (or times <= 0): return an empty, correctly shaped frame.
        columns = list(subfield) if kind == "submission" else []
        full = pd.DataFrame(columns=columns + ['timestamp'])
        print(full.shape)
        return full

    full = pd.concat(non_empty, sort=False)

    if kind == "submission":
        full = full[list(subfield)].drop_duplicates()
        full = full.loc[full['is_self'] == True]

    # Work on an independent copy so adding a column never writes into a slice.
    full = full.copy()
    # `created_utc` is a UTC epoch; convert it in UTC so the date does not depend
    # on the timezone of the machine running the notebook.
    full['timestamp'] = pd.to_datetime(full['created_utc'], unit='s', utc=True).dt.date
    print(full.shape)

    return full

We run our `query_pushshift` function on the two subreddits below, both of which are directly related to our problem statement.

In [3]:
# First subreddit: r/teslamotors submissions, using the function's default request settings.
sub_1_query = query_pushshift(subreddit='teslamotors')

https://api.pushshift.io/reddit/search/submission/?subreddit=teslamotors&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=teslamotors&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=teslamotors&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=teslamotors&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=teslamotors&size=500&after=150d
(1368, 9)


In [4]:
# Second subreddit: r/cars submissions, using the function's default request settings.
sub_2_query = query_pushshift(subreddit='cars')

https://api.pushshift.io/reddit/search/submission/?subreddit=cars&size=500&after=30d
https://api.pushshift.io/reddit/search/submission/?subreddit=cars&size=500&after=60d
https://api.pushshift.io/reddit/search/submission/?subreddit=cars&size=500&after=90d
https://api.pushshift.io/reddit/search/submission/?subreddit=cars&size=500&after=120d
https://api.pushshift.io/reddit/search/submission/?subreddit=cars&size=500&after=150d
(1671, 9)


We combine the content from both subreddits into a single DataFrame below.

In [5]:
# Stack both subreddit frames. Each input keeps its own row labels, so without
# `ignore_index=True` the combined frame would carry repeated index labels,
# making label-based lookups ambiguous; renumber the rows 0..n-1 instead.
combined_sub_queries = pd.concat([sub_1_query, sub_2_query], ignore_index=True)

In [7]:
# Persist the combined data so later steps can reload it without re-querying the API.
# The frame's index is written as the first (unnamed) CSV column.
combined_sub_queries.to_csv(path_or_buf='../Data/subreddit_data.csv')