# EDA

### Building a model to predict which subreddit a post came from

Subreddits: 'scifi', 'Fantasy'

In [1]:
#Imports
import requests
import pandas as pd
import spacy
import time

from spacy import displacy

In [2]:
# Load spaCy's large English pipeline. `nlp` is not used by any of the cells
# visible in this part of the notebook.
nlp = spacy.load('en_core_web_lg')

# MVP Model Features
<ul>
    <li>#1 What subreddit did the post come from?</li>
    <li>#2 What is the selftext?</li>
    <li>#3 What is the title of the post?</li>
</ul>

In [3]:
## Subreddit function inspired by https://www.youtube.com/watch?v=AcrjEWsMi_E, further refined by Derya Gumustel
def pull_subreddit(subreddit, iterations=10, size=100, pause=180):
    """Pull successive batches of submissions for one subreddit from Pushshift.

    Each request asks for submissions created before a cutoff timestamp. The
    first cutoff is "now"; after every batch the cutoff moves to the smallest
    ``created_utc`` in that batch, so the next request reaches further back.

    Parameters
    ----------
    subreddit : str
        Subreddit name sent as the ``subreddit`` query parameter.
    iterations : int, default 10
        Maximum number of requests to make.
    size : int, default 100
        Value of the ``size`` query parameter (submissions requested per call).
    pause : float, default 180
        Seconds to sleep between consecutive requests.

    Returns
    -------
    pandas.DataFrame
        All pulled submissions with a fresh 0..n-1 index. Batches are stacked
        with the most recently pulled batch first. Empty if nothing came back.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    KeyError
        If a response has no ``data`` field.
    """
    url = 'https://api.pushshift.io/reddit/search/submission/?' # Starter URL pulling just submissions
    before = int(time.time())
    batches = []

    for request_number in range(iterations):
        # Throttle *between* requests only; there is nothing to wait for
        # before the first request or after the last one.
        if request_number:
            time.sleep(pause)

        pull_params = {"subreddit": subreddit,
                       "size": size,
                       "before": before}

        # Pull data; fail loudly on HTTP errors instead of trying to parse an
        # error page, and do not hang forever on a stalled connection.
        req = requests.get(url, params=pull_params, timeout=30)
        req.raise_for_status()

        posts = req.json()['data']
        if not posts:
            # No more submissions before the cutoff: stop instead of failing
            # on the missing 'created_utc' column of an empty frame.
            break

        batch_df = pd.DataFrame(posts)
        batches.append(batch_df)
        before = int(batch_df['created_utc'].min())

    if not batches:
        return pd.DataFrame()

    # Concatenate once at the end. Reversing keeps the ordering the loop-based
    # version produced (newest pull stacked on top of the earlier ones).
    return pd.concat(batches[::-1], axis=0, ignore_index=True)

## Fantasy EDA

In [4]:
# Pull r/Fantasy submissions. Slow cell: pull_subreddit sleeps between API
# requests, so with its default settings this takes roughly half an hour.
fantasy_df = pull_subreddit('Fantasy')

In [5]:
# List the fields that came back for each submission.
fantasy_df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_text', 'author_flair_text_color', 'awarders', 'banned_by',
       'can_mod_post', 'contest_mode', 'created_utc', 'domain', 'full_link',
       'gildings', 'id', 'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'removed_by_category', 'retrieved_on', 'score', 'send_replies',
       'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subreddit_type', 'thumbnail', 'title',
       'total_awards_received', 'treatment_tags', 'upvote_ratio', 'url',
       'whitelist_stat

### MVP Fantasy EDA

In [6]:
# Keep only the three MVP columns: the label ('subreddit') and the two text
# fields ('title', 'selftext').
mvp_fantasy_df = fantasy_df[['subreddit','title','selftext']] #Constructing MVP Dataframe for Fantasy subreddit

In [7]:
# Preview the first rows of the Fantasy MVP frame.
mvp_fantasy_df.head()

Unnamed: 0,subreddit,title,selftext
0,Fantasy,(QUESTION) My friends and I are gonna play D&a...,
1,Fantasy,The lightning-struck heart by TJ Klune... lite...,[removed]
2,Fantasy,Best Fantasy (but also sci fi!) novel titles ?,As the title says.\n\nWhat were the best novel...
3,Fantasy,Which grim darkish fantasy book(s) are so deep...,Which books do you guys think would attract th...
4,Fantasy,My Bingo Obsession Made Manifest - Retro Bingo,My first experience with the book bingo readin...


## Sci-fi EDA

In [8]:
# Pull r/scifi submissions. Slow cell: pull_subreddit sleeps between API
# requests, so with its default settings this takes roughly half an hour.
scifi_df = pull_subreddit('scifi')

In [9]:
# Keep only the three MVP columns: the label ('subreddit') and the two text
# fields ('title', 'selftext').
mvp_scifi_df = scifi_df[['subreddit','title','selftext']] #Constructing MVP Dataframe for scifi subreddit

In [10]:
# Preview the first 25 rows of the scifi MVP frame.
mvp_scifi_df.head(25)

Unnamed: 0,subreddit,title,selftext
0,scifi,Gina Lives to Fight Another Day!!👍❤️❤️👍,
1,scifi,Buy Google Reviews - 100% NonDrop Permanent ma...,
2,scifi,How to love yourself this Galentine's Day,[removed]
3,scifi,M-Tec Sturmvogel Gunship,
4,scifi,Anyone else hugely underwhelmed with Three Bod...,[removed]
5,scifi,New and different...,
6,scifi,How to love yourself this Galentine's Day,[removed]
7,scifi,How to love yourself this Galentine's Day,[removed]
8,scifi,The Best Movies to stream on Valentine’s Day 2...,
9,scifi,When the show is better than the books...,[removed]


## Merging Dataframes

In [12]:
# Stack the two subreddit frames row-wise into one dataset. Both frames have
# the same three columns, so this is a plain concatenation. An outer merge
# with no `on=` would instead join on every shared column, which only works as
# a union because no row matches across the frames, and it may reorder rows.
mvp_df = pd.concat([mvp_fantasy_df, mvp_scifi_df], axis=0, ignore_index=True)

In [13]:
# Save the combined, still-uncleaned data. With to_csv's defaults the row index
# is written as an unnamed first column. The ./data directory must already
# exist; to_csv does not create it.
mvp_df.to_csv('./data/dirty_mvp.csv')