# Project 3: Reddit Web Scraping File

### Import Libraries

In [1]:
import requests
import pandas as pd
import time
import random
pd.set_option('display.max_columns', None)

### Identify the 2 Subreddits for analysis

In [2]:
# Declare the URLs for democrats and republican data

dem_url = 'https://www.reddit.com/r/democrats.json'
rep_url = 'https://www.reddit.com/r/republican.json'

### Create custom function to perform Reddit scrape

<font color=blue> A custom function is created to scrape based on the :
    <ol>
<li>Web address provided</li> 
<li>Number of pages intended for scraping</li>
    </ol>
    
A random interval of between 2-30 seconds of rest will be applied to prevent API detecting the script as malicious.
</font>

In [3]:
# Create custom function where you input URL and number of scrapes

posts = []

def web_scraper(url,scrape_count):
    after = None
    count = int(scrape_count) 
    
    for i in range(count):
        if after == None:
            current_url = url
        else:
            current_url = url + '?after=' + after + '&limit=50'
        print(current_url)
        res = requests.get(current_url, headers={'User-agent': 'lphong'})
            
        if res.status_code != 200:
            print('Status error', res.status_code)
            break  
                
        current_dict = res.json()
        current_posts = [p['data'] for p in current_dict['data']['children']]
        posts.extend(current_posts)
        after = current_dict['data']['after']
    
        # generate a random sleep duration to look more 'natural'
        sleep_duration = random.randint(2,30)
        print(sleep_duration)
        time.sleep(sleep_duration)

### <font color=red>Scrape Democrats Subreddit</font>

In [4]:
# Feed the subreddit URL and scrape for 20 x 50 posts (Reddit's 1,000 post limit)

web_scraper(dem_url,'21')

https://www.reddit.com/r/democrats.json
21
https://www.reddit.com/r/democrats.json?after=t3_hetkt8&limit=50
28
https://www.reddit.com/r/democrats.json?after=t3_heq3fc&limit=50
18
https://www.reddit.com/r/democrats.json?after=t3_hddclw&limit=50
19
https://www.reddit.com/r/democrats.json?after=t3_hcud4a&limit=50
3
https://www.reddit.com/r/democrats.json?after=t3_hcos8m&limit=50
17
https://www.reddit.com/r/democrats.json?after=t3_hboveu&limit=50
2
https://www.reddit.com/r/democrats.json?after=t3_hbicgh&limit=50
15
https://www.reddit.com/r/democrats.json?after=t3_h9qke4&limit=50
16
https://www.reddit.com/r/democrats.json?after=t3_h9e6ac&limit=50
18
https://www.reddit.com/r/democrats.json?after=t3_h8k91q&limit=50
25
https://www.reddit.com/r/democrats.json?after=t3_h7ktwz&limit=50
9
https://www.reddit.com/r/democrats.json?after=t3_h7i7x3&limit=50
10
https://www.reddit.com/r/democrats.json?after=t3_h0i2o9&limit=50
7
https://www.reddit.com/r/democrats.json?after=t3_gzwx3u&limit=50
7
https://ww

In [5]:
# Load information into a dataframe and reset post for next pull

dem_data = pd.DataFrame(posts)
posts = []

In [6]:
# Check the number of posts scraped and size up data

dem_data.shape

(990, 112)

In [7]:
# Inspect the dataframe to identify the relevant fields for further extraction and analysis

dem_data.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,downs,thumbnail_height,top_awarded_type,hide_score,name,quarantine,link_flair_text_color,upvote_ratio,author_flair_background_color,subreddit_type,ups,total_awards_received,media_embed,thumbnail_width,author_flair_template_id,is_original_content,user_reports,secure_media,is_reddit_media_domain,is_meta,category,secure_media_embed,link_flair_text,can_mod_post,score,approved_by,author_premium,thumbnail,edited,author_flair_css_class,author_flair_richtext,gildings,post_hint,content_categories,is_self,mod_note,created,link_flair_type,wls,removed_by_category,banned_by,author_flair_type,domain,allow_live_comments,selftext_html,likes,suggested_sort,banned_at_utc,url_overridden_by_dest,view_count,archived,no_follow,is_crosspostable,pinned,over_18,preview,all_awardings,awarders,media_only,can_gild,spoiler,locked,author_flair_text,treatment_tags,visited,removed_by,num_reports,distinguished,subreddit_id,mod_reason_by,removal_reason,link_flair_background_color,id,is_robot_indexable,report_reasons,author,discussion_type,num_comments,send_replies,whitelist_status,contest_mode,mod_reports,author_patreon_flair,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,media_metadata,crosspost_parent_list,crosspost_parent,author_cakeday,link_flair_template_id
0,,democrats,,t2_tkz7y,False,,0,False,Hey All! I made a documentary about Joe Biden....,[],r/democrats,False,6,,0,105.0,,False,t3_hdge6q,False,dark,0.71,,public,25,1,"{'content': '&lt;iframe width=""600"" height=""33...",140.0,,False,[],"{'type': 'youtube.com', 'oembed': {'provider_u...",False,False,,"{'content': '&lt;iframe width=""600"" height=""33...",,False,25,,False,https://b.thumbs.redditmedia.com/wqcy2fPEQRSUE...,False,,[],{'gid_1': 1},rich:video,,False,,1592809000.0,text,6,,,text,youtu.be,False,,,,,https://youtu.be/5JRgue60YBo,,False,False,False,False,False,{'images': [{'source': {'url': 'https://extern...,"[{'giver_coin_reward': None, 'subreddit_id': N...",[],False,False,False,False,,[],False,,,,t5_2qn70,,,,hdge6q,True,,D1Wheeler,,26,True,all_ads,False,[],False,,/r/democrats/comments/hdge6q/hey_all_i_made_a_...,all_ads,True,https://youtu.be/5JRgue60YBo,133638,1592780000.0,0,"{'type': 'youtube.com', 'oembed': {'provider_u...",False,,,,,
1,,democrats,,t2_713c2cph,False,,0,False,It’s Time for a Blue Wave to Restore America.,[],r/democrats,False,6,,0,140.0,,False,t3_hff6bt,False,dark,0.93,,public,837,0,{},140.0,,False,[],,False,False,,{},,False,837,,False,https://b.thumbs.redditmedia.com/VsiL8kIa1Np1k...,False,,[],{},image,,False,,1593084000.0,text,6,,,text,i.imgur.com,False,,,,,https://i.imgur.com/AHmH7MP.jpg,,False,False,False,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,False,False,False,,[],False,,,,t5_2qn70,,,,hff6bt,True,,SofaKingVote,,127,True,all_ads,False,[],False,,/r/democrats/comments/hff6bt/its_time_for_a_bl...,all_ads,False,https://i.imgur.com/AHmH7MP.jpg,133638,1593055000.0,0,,False,,,,,
2,,democrats,,t2_35r4k0nu,False,,0,False,"In Scathing Letter, More Than 80-Percent of Fa...",[],r/democrats,False,6,,0,73.0,,False,t3_hf7pni,False,dark,0.97,,public,1354,0,{},140.0,,False,[],,False,False,,{},,False,1354,,True,https://b.thumbs.redditmedia.com/XrkmHSSYm4cej...,False,,[],{},link,,False,,1593056000.0,text,6,,,text,lawandcrime.com,True,,,,,https://lawandcrime.com/high-profile/in-scathi...,,False,False,False,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,False,False,False,,[],False,,,,t5_2qn70,,,,hf7pni,True,,paone22,,31,False,all_ads,False,[],False,,/r/democrats/comments/hf7pni/in_scathing_lette...,all_ads,False,https://lawandcrime.com/high-profile/in-scathi...,133638,1593027000.0,1,,False,,,,,
3,,democrats,,t2_y4w5p,False,,0,False,Mark Cuban endorses Biden on Hannity: He 'actu...,[],r/democrats,False,6,blue,0,78.0,,False,t3_hfgpv1,False,dark,0.97,,public,101,0,{},140.0,,False,[],,False,False,,{},article,False,101,,False,https://b.thumbs.redditmedia.com/sT75S66vdbdzC...,False,,[],{},link,,False,,1593093000.0,text,6,,,text,thehill.com,False,,,,,https://thehill.com/homenews/media/504264-mark...,,False,False,False,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,False,False,False,,[],False,,,,t5_2qn70,,,,hfgpv1,True,,realplayer16,,4,True,all_ads,False,[],False,,/r/democrats/comments/hfgpv1/mark_cuban_endors...,all_ads,False,https://thehill.com/homenews/media/504264-mark...,133638,1593064000.0,0,,False,,,,,
4,,democrats,,t2_1woh,False,,0,False,Biden Surge and Trump Failures Expand Electora...,[],r/democrats,False,6,,0,104.0,,True,t3_hfmomx,False,dark,1.0,,public,8,0,{},140.0,,False,[],,False,False,,{},,False,8,,False,https://b.thumbs.redditmedia.com/C6qMxA1N4-wzs...,False,,[],{},link,,False,,1593123000.0,text,6,,,text,ncec.org,False,,,,,http://www.ncec.org/analysis/biden-surge-trump...,,False,False,False,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,False,False,False,,[],False,,,,t5_2qn70,,,,hfmomx,True,,Lomag,,0,True,all_ads,False,[],False,,/r/democrats/comments/hfmomx/biden_surge_and_t...,all_ads,False,http://www.ncec.org/analysis/biden-surge-trump...,133638,1593094000.0,0,,False,,,,,


### Save and export raw uncleaned file before treatment

In [8]:
dem_data.to_csv('../democrat_uncleaned.csv', index=False, encoding='utf-8-sig')

In [9]:
# Load the uncleaned file into a new dataframe for cleaning

democrats = pd.read_csv('../democrat_uncleaned.csv')

In [10]:
# The ones with discussion text for analysis are mainly title, selftext and comments. (comments excluded due to volume)
# "Distinguished" will be pulled as well to identify moderator posts and see if they should be kept

cols_to_keep = ['subreddit','title','selftext','distinguished']
democrats = democrats.loc[:, cols_to_keep]

In [11]:
# Verify that all data was carried over

democrats.shape

(990, 4)

In [12]:
democrats.head()

Unnamed: 0,subreddit,title,selftext,distinguished
0,democrats,Hey All! I made a documentary about Joe Biden....,,
1,democrats,It’s Time for a Blue Wave to Restore America.,,
2,democrats,"In Scathing Letter, More Than 80-Percent of Fa...",,
3,democrats,Mark Cuban endorses Biden on Hannity: He 'actu...,,
4,democrats,Biden Surge and Trump Failures Expand Electora...,,


In [13]:
# Check for duplicates

len(democrats['title'].unique())

979

In [14]:
# Remove duplicates

democrats.drop_duplicates(subset='title', keep="first", inplace = True)

In [15]:
# Remove moderator comments as it is deemed to not be representative of organic post content

democrats = democrats[democrats['distinguished']!='moderator']

In [16]:
# Drop the "distinguised" as it no longer serves any purpose

democrats.drop('distinguished',axis=1,inplace=True)

In [17]:
# Check that column has been dropped

democrats.head()

Unnamed: 0,subreddit,title,selftext
0,democrats,Hey All! I made a documentary about Joe Biden....,
1,democrats,It’s Time for a Blue Wave to Restore America.,
2,democrats,"In Scathing Letter, More Than 80-Percent of Fa...",
3,democrats,Mark Cuban endorses Biden on Hannity: He 'actu...,
4,democrats,Biden Surge and Trump Failures Expand Electora...,


In [18]:
democrats.shape

(975, 3)

### Check for null values

In [19]:
# Check that column has been dropped

democrats.isnull().sum()

subreddit      0
title          0
selftext     860
dtype: int64

<font color=blue>It observed that out of 975 entries for 'selftext' column, 860 are blank. We shall let this remain and scrape the other reddit dataset for further inspection. A decision on this feature will be made after both datasets are combined.</font>

### <font color=red>Scrape Republicans Subreddit</font>

In [20]:
# Feed the subreddit URL and scrape for 20 x 50 posts (Reddit's 1,000 post limit)

web_scraper(rep_url,'21')

https://www.reddit.com/r/republican.json
12
https://www.reddit.com/r/republican.json?after=t3_hf0qsa&limit=50
27
https://www.reddit.com/r/republican.json?after=t3_hf1aiu&limit=50
9
https://www.reddit.com/r/republican.json?after=t3_he27v5&limit=50
17
https://www.reddit.com/r/republican.json?after=t3_hdsp8t&limit=50
3
https://www.reddit.com/r/republican.json?after=t3_hdhno7&limit=50
27
https://www.reddit.com/r/republican.json?after=t3_hcmpe8&limit=50
18
https://www.reddit.com/r/republican.json?after=t3_hcl5o3&limit=50
27
https://www.reddit.com/r/republican.json?after=t3_hceodb&limit=50
24
https://www.reddit.com/r/republican.json?after=t3_hboxo8&limit=50
17
https://www.reddit.com/r/republican.json?after=t3_hat4au&limit=50
4
https://www.reddit.com/r/republican.json?after=t3_h9mohc&limit=50
15
https://www.reddit.com/r/republican.json?after=t3_h9gvnb&limit=50
10
https://www.reddit.com/r/republican.json?after=t3_h85qpd&limit=50
30
https://www.reddit.com/r/republican.json?after=t3_h7tdvi&limit

In [21]:
# Load information into a dataframe and reset post for next pull

rep_data = pd.DataFrame(posts)
posts = []

In [22]:
# Check the number of posts scraped and size up data

rep_data.shape

(992, 110)

In [23]:
# Inspect the dataframe for routine check

rep_data.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,downs,thumbnail_height,top_awarded_type,hide_score,name,quarantine,link_flair_text_color,upvote_ratio,author_flair_background_color,subreddit_type,ups,total_awards_received,media_embed,thumbnail_width,author_flair_template_id,is_original_content,user_reports,secure_media,is_reddit_media_domain,is_meta,category,secure_media_embed,link_flair_text,can_mod_post,score,approved_by,author_premium,thumbnail,edited,author_flair_css_class,author_flair_richtext,gildings,post_hint,content_categories,is_self,mod_note,created,link_flair_type,wls,removed_by_category,banned_by,author_flair_type,domain,allow_live_comments,selftext_html,likes,suggested_sort,banned_at_utc,url_overridden_by_dest,view_count,archived,no_follow,is_crosspostable,pinned,over_18,preview,all_awardings,awarders,media_only,can_gild,spoiler,locked,author_flair_text,treatment_tags,visited,removed_by,num_reports,distinguished,subreddit_id,mod_reason_by,removal_reason,link_flair_background_color,id,is_robot_indexable,report_reasons,author,discussion_type,num_comments,send_replies,whitelist_status,contest_mode,mod_reports,author_patreon_flair,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,crosspost_parent_list,crosspost_parent,link_flair_template_id
0,,Republican,,t2_4xt0mqzz,False,,0,False,New evidence shows it is time to charge Joe Bi...,[],r/Republican,False,6,,0,93.0,,True,t3_hflqaf,False,dark,0.75,,public,69,0,{},140.0,,False,[],,False,False,,{},,False,69,,False,https://b.thumbs.redditmedia.com/YWLrkXs7K0ijp...,False,,[],{},link,,False,,1593119000.0,text,6,,,text,thelibertyloft.com,False,,,,,https://thelibertyloft.com/new-evidence-shows-...,,False,False,False,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,False,False,False,,[],False,,,,t5_2qndt,,,,hflqaf,True,,TheLibertyLoft,,7,True,all_ads,False,[],False,,/r/Republican/comments/hflqaf/new_evidence_sho...,all_ads,False,https://thelibertyloft.com/new-evidence-shows-...,121315,1593090000.0,0,,False,,,
1,,Republican,,t2_1s4usext,False,,0,False,Just putting this out there.,[],r/Republican,False,6,,0,140.0,,False,t3_hf616r,False,dark,0.73,,public,2477,0,{},140.0,,False,[],,True,False,,{},,False,2477,,False,https://b.thumbs.redditmedia.com/L7h7nzx3A79W-...,False,,[],{},image,,False,,1593051000.0,text,6,,,text,i.redd.it,True,,,,,https://i.redd.it/ztk2kstyew651.jpg,,False,False,False,False,False,{'images': [{'source': {'url': 'https://previe...,[],[],False,False,False,False,,[],False,,,,t5_2qndt,,,,hf616r,True,,toothfinder,,259,True,all_ads,False,[],False,,/r/Republican/comments/hf616r/just_putting_thi...,all_ads,False,https://i.redd.it/ztk2kstyew651.jpg,121315,1593022000.0,3,,False,,,
2,,Republican,,t2_4ywyqqu1,False,,0,False,Welp...looks like EA and Battlefield joined th...,[],r/Republican,False,6,,0,140.0,,True,t3_hfmodl,False,dark,0.89,,public,45,0,{},140.0,,False,[],,True,False,,{},,False,45,,False,https://b.thumbs.redditmedia.com/ZvqI-0S0x1ILu...,False,,[],{},image,,False,,1593123000.0,text,6,,,text,i.redd.it,False,,,,,https://i.redd.it/shl8y03rc2751.jpg,,False,False,False,False,False,{'images': [{'source': {'url': 'https://previe...,[],[],False,False,False,False,,[],False,,,,t5_2qndt,,,,hfmodl,True,,hotpieismyking,,18,True,all_ads,False,[],False,,/r/Republican/comments/hfmodl/welplooks_like_e...,all_ads,False,https://i.redd.it/shl8y03rc2751.jpg,121315,1593094000.0,0,,False,,,
3,,Republican,,t2_tpsej,False,,0,False,BET Founder says 'black people laugh at white ...,[],r/Republican,False,6,,0,73.0,,True,t3_hfmbjo,False,dark,0.97,,public,30,0,{},140.0,,False,[],,False,False,,{},,False,30,,True,https://b.thumbs.redditmedia.com/No4zBrB4sRgFq...,False,,[],{},link,,False,,1593121000.0,text,6,,,text,justthenews.com,False,,,,,https://justthenews.com/nation/culture/bet-fou...,,False,False,False,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,False,False,False,,[],False,,,,t5_2qndt,,,,hfmbjo,True,,Foubar,,6,True,all_ads,False,[],False,,/r/Republican/comments/hfmbjo/bet_founder_says...,all_ads,False,https://justthenews.com/nation/culture/bet-fou...,121315,1593093000.0,0,,False,,,
4,,Republican,,t2_tpsej,False,,0,False,Explosive New FBI Notes Confirm Obama Directed...,[],r/Republican,False,6,,0,97.0,,True,t3_hfm3ky,False,dark,0.82,,public,18,0,{},140.0,,False,[],,False,False,,{},,False,18,,True,https://b.thumbs.redditmedia.com/r3aXfoWiqsRrA...,False,,[],{},link,,False,,1593121000.0,text,6,,,text,thefederalist.com,False,,,,,https://thefederalist.com/2020/06/24/explosive...,,False,False,False,False,False,{'images': [{'source': {'url': 'https://extern...,[],[],False,False,False,False,,[],False,,,,t5_2qndt,,,,hfm3ky,True,,Foubar,,1,True,all_ads,False,[],False,,/r/Republican/comments/hfm3ky/explosive_new_fb...,all_ads,False,https://thefederalist.com/2020/06/24/explosive...,121315,1593092000.0,0,,False,,,


### Save and export raw uncleaned file before treatment

In [24]:
rep_data.to_csv('../republican_uncleaned.csv', index=False, encoding='utf-8-sig')

In [25]:
# Load the uncleaned file into a new dataframe for cleaning

republicans = pd.read_csv('../republican_uncleaned.csv')

In [26]:
# The ones with discussion text for analysis are mainly title, selftext and comments. (comments excluded due to volume)
# "Distinguished" will be pulled as well to identify moderator posts and see if they should be kept

cols_to_keep = ['subreddit','title','selftext','distinguished']
republicans = republicans.loc[:, cols_to_keep]

In [27]:
# Verify that all data was carried over

republicans.shape

(992, 4)

In [28]:
republicans.head()

Unnamed: 0,subreddit,title,selftext,distinguished
0,Republican,New evidence shows it is time to charge Joe Bi...,,
1,Republican,Just putting this out there.,,
2,Republican,Welp...looks like EA and Battlefield joined th...,,
3,Republican,BET Founder says 'black people laugh at white ...,,
4,Republican,Explosive New FBI Notes Confirm Obama Directed...,,


In [29]:
# Check for duplicates

len(republicans['title'].unique())

810

In [30]:
# Remove duplicates

republicans.drop_duplicates(subset='title', keep="first", inplace = True)

In [31]:
# Remove moderator comments as it is deemed to not be representative of organic post content

republicans = republicans[republicans['distinguished']!='moderator']

  res_values = method(rvalues)


In [32]:
# Drop the "distinguised" as it no longer serves any purpose

republicans.drop('distinguished',axis=1,inplace=True)

In [33]:
# Check that column has been dropped

republicans.head()

Unnamed: 0,subreddit,title,selftext
0,Republican,New evidence shows it is time to charge Joe Bi...,
1,Republican,Just putting this out there.,
2,Republican,Welp...looks like EA and Battlefield joined th...,
3,Republican,BET Founder says 'black people laugh at white ...,
4,Republican,Explosive New FBI Notes Confirm Obama Directed...,


In [34]:
republicans.shape

(810, 3)

### Check for null values

In [35]:
# Check that column has been dropped

republicans.isnull().sum()

subreddit      0
title          0
selftext     767
dtype: int64

<font color=blue>It observed that out of 810 entries for 'selftext' column, 767 are blank. We shall let this remain and a decision on this feature will be made after both datasets are combined.</font>

### Balance Post Counts

<font color=blue>The number of Democrat data points of 975 are much larger than the Republican's 810. In order get an even accuracy, data points will be dropped from Democrats set to match the number of the Republicans.</font>

In [36]:
# Drop the needed number of rows

democrats = democrats.iloc[:-165]

In [37]:
# Ensure that the number of democrat subreddit now matches republican

democrats.shape

(810, 3)

### Save each of the individual Subreddit  Data

In [39]:
democrats.to_csv('../democrats.csv', index=False, encoding='utf-8-sig')
republicans.to_csv('../republicans.csv', index=False, encoding='utf-8-sig')

### Merge both Subreddit Data Sets

In [40]:
# Combine both subreddit data

combined_df = pd.concat([democrats, republicans], ignore_index=True)

In [41]:
# Check that it has been correctly merged. Top half will belong to democrats subreddit

combined_df.head()

Unnamed: 0,subreddit,title,selftext
0,democrats,Hey All! I made a documentary about Joe Biden....,
1,democrats,It’s Time for a Blue Wave to Restore America.,
2,democrats,"In Scathing Letter, More Than 80-Percent of Fa...",
3,democrats,Mark Cuban endorses Biden on Hannity: He 'actu...,
4,democrats,Biden Surge and Trump Failures Expand Electora...,


In [42]:
# Check that it has been correctly merged. Bottom half will belong to republican subreddit

combined_df.tail()

Unnamed: 0,subreddit,title,selftext
1615,Republican,Forbes: Minneapolis Votes To Disband Police De...,
1616,Republican,"Why isn't it viral yet, PT.2",
1617,Republican,Pelosi Delivers Press Conference in Racist Garb,
1618,Republican,America Is In A Cultural Civil War,
1619,Republican,"Yahoo News: "" Why it might be time to finally ...",


In [43]:
# Check that datapoint collection are of correct size

combined_df.shape

(1620, 3)

### Clean the Merged Data

In [44]:
# Remove URL prefixes and numbers. The rest of special characters will be removed during modeling 

combined_df['title'] = combined_df['title'].str.replace(r'https://', '').replace(r'\d+', '')

In [45]:
# Do a routine check to ensure no abnormality after the clean

combined_df.head()

Unnamed: 0,subreddit,title,selftext
0,democrats,Hey All! I made a documentary about Joe Biden....,
1,democrats,It’s Time for a Blue Wave to Restore America.,
2,democrats,"In Scathing Letter, More Than 80-Percent of Fa...",
3,democrats,Mark Cuban endorses Biden on Hannity: He 'actu...,
4,democrats,Biden Surge and Trump Failures Expand Electora...,


### Evaluation of Missing Values

In [46]:
combined_df.isnull().sum()

subreddit       0
title           0
selftext     1488
dtype: int64

In [47]:
combined_df.shape

(1620, 3)

<font color=blue>From the above, we can see that only 8% of the selftext field contains text, hence a decision has been made to drop the data since there won't be much information loss. A lot of the posts contain headlines from articles. The good thing about headlines is that they usually capture the most newsworthy and salient summary of the article. Hence, the title feature alone would be deemed sufficient to carry words useful for prediction.

Therefore, the 1,620 datapoints would be deemed sufficient to generate sigficant result due to the above reason. 

</font>

In [48]:
# Drop the selftext column

combined_df.drop('selftext',axis=1,inplace=True)

### Save Merged Data

In [50]:
combined_df.to_csv('../project_data.csv', index=False, encoding='utf-8-sig')