# EDA

### Building a predictive model to predict which subreddit a post came from

Subreddits: 'scifi' , 'Fantasy'

In [8]:
#Imports
import spacy
from spacy import displacy
import pandas as pd
import numpy as np

In [4]:
nlp = spacy.load('en_core_web_lg')

# MVP Model Features
<ul>
    <li>#1 What subreddit did the post come from?</li>
    <li>#2 What is the selftext</li>
    <li>#3 What is the title of the post</li>
</ul>

## Fantasy EDA

In [5]:
fantasy_df = pd.read_csv('./data/dirty_fantasy.csv')
fantasy_df.columns

Index(['Unnamed: 0', 'all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers',
       'subreddit_type', 'thumbnail', 'title', 'total_awards_received',
       'treatment_tags', '

### Creating MVP Fantasy Dataframe

In [4]:
mvp_fantasy_df = fantasy_df[['subreddit','title','selftext']] #Constructing MVP Dataframe for Fantasy subreddit
mvp_fantasy_df.head()

Unnamed: 0,subreddit,title,selftext
0,Fantasy,Looking for contained grimdark stories,"I love grimdark stories, but am also starting ..."
1,Fantasy,Pariahs my new novel!,
2,Fantasy,The Mask Falling SPOILERS,Ok can we discuss the fourth book in Samantha ...
3,Fantasy,GREAT MAGIC SYSTEM WITH A TWIST,[removed]
4,Fantasy,AMAZING MAGIC SYSTEM WITH A TWIST,


In [5]:
mvp_fantasy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   subreddit  3000 non-null   object
 1   title      3000 non-null   object
 2   selftext   2475 non-null   object
dtypes: object(3)
memory usage: 70.4+ KB


In [6]:
#Creating feature to track whether or not the existence of selftext is impactful enough to help the model determine origin
mvp_fantasy_df['has_selftext'] = mvp_fantasy_df['selftext'].notnull()
mvp_fantasy_df['has_selftext'] = mvp_fantasy_df['has_selftext'].map({True:1,False:0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mvp_fantasy_df['has_selftext'] = mvp_fantasy_df['selftext'].notnull()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mvp_fantasy_df['has_selftext'] = mvp_fantasy_df['has_selftext'].map({True:1,False:0})


In [7]:
mvp_fantasy_df['has_selftext'].mean() #82.5% of fantasy posts have selftext

0.825

### Fantasy Data Preprocessing

In [None]:
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
#Creating simplified fantasy Dataframe
toy_df = fantasy_df[['subreddit','title','selftext']]
for row in toy_df['title']:
    for token in nlp(row):
        print(token.text, token.pos_, token.is_stop)

## Sci-fi EDA

In [8]:
scifi_df = pd.read_csv('./data/dirty_scifi.csv')
scifi_df.columns

Index(['Unnamed: 0', 'all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'removed_by_category', 'retrieved_on', 'score', 'selftext',
       'send_replies', 'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subreddit_type', 'thumbnail', 'title',
       'total_awards_receive

In [32]:
scifi_df['media'].value_counts().sum()

416

### Creating MVP Sci-fi Dataframe

In [9]:
mvp_scifi_df = scifi_df[['subreddit','title','selftext']] #Constructing MVP Dataframe for scifi subreddit
mvp_scifi_df.head() 

Unnamed: 0,subreddit,title,selftext
0,scifi,Any novels or short stories about a civilizati...,[removed]
1,scifi,‘Alien’ Series In The Works At FX With Noah Ha...,
2,scifi,With release of cyberpunk 2077 i am here to re...,
3,scifi,"City invasion (Milan, ITALY). Just for fun! [OC]",
4,scifi,Round Earth Theory - Monoliths,


In [10]:
mvp_scifi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   subreddit  3000 non-null   object
 1   title      3000 non-null   object
 2   selftext   1628 non-null   object
dtypes: object(3)
memory usage: 70.4+ KB


In [19]:
#Creating feature to track whether or not the existence of selftext is impactful enough to help the model determine origin
mvp_scifi_df['has_selftext'] = mvp_scifi_df['selftext'].notnull()
mvp_scifi_df['has_selftext'] = mvp_scifi_df['has_selftext'].map({True:1,False:0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mvp_scifi_df['has_selftext'] = mvp_scifi_df['selftext'].notnull()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mvp_scifi_df['has_selftext'] = mvp_scifi_df['has_selftext'].map({True:1,False:0})


In [12]:
mvp_scifi_df['has_selftext'].unique()

array([1, 0], dtype=int64)

In [13]:
mvp_scifi_df.head()

Unnamed: 0,subreddit,title,selftext,has_selftext
0,scifi,Any novels or short stories about a civilizati...,[removed],1
1,scifi,‘Alien’ Series In The Works At FX With Noah Ha...,,0
2,scifi,With release of cyberpunk 2077 i am here to re...,,0
3,scifi,"City invasion (Milan, ITALY). Just for fun! [OC]",,0
4,scifi,Round Earth Theory - Monoliths,,0


In [14]:
mvp_scifi_df['has_selftext'].mean() # 54.3% of scifi posts have selftext

0.5426666666666666

### Merging MVP Dataframes

In [15]:
mvp_df = mvp_fantasy_df.merge(mvp_scifi_df,how='outer')
mvp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6000 entries, 0 to 5999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   subreddit     6000 non-null   object
 1   title         6000 non-null   object
 2   selftext      4103 non-null   object
 3   has_selftext  6000 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 234.4+ KB


In [16]:
mvp_df.to_csv('./data/mvp.csv') #Saving merged data to mvp.csv

## Merged Data EDA

In [17]:
mvp_df['title'].duplicated().sum() #Checking for duplicate posts

246

Because 246 duplicate posts isn't an alarming number, I am going to keep them in the dataset. I should mark them in a future pass to see if duplicates can be used to judge origin of post.

In [18]:
mvp_df.head()

Unnamed: 0,subreddit,title,selftext,has_selftext
0,Fantasy,Looking for contained grimdark stories,"I love grimdark stories, but am also starting ...",1
1,Fantasy,Pariahs my new novel!,,0
2,Fantasy,The Mask Falling SPOILERS,Ok can we discuss the fourth book in Samantha ...,1
3,Fantasy,GREAT MAGIC SYSTEM WITH A TWIST,[removed],1
4,Fantasy,AMAZING MAGIC SYSTEM WITH A TWIST,,0
