# <span style= "color:SeaGreen">Project 3 - Web APIs & Classification</span>

## <span style= "color:SeaGreen">Introduction</span>

This notebook focuses on scraping post data from Reddit through its public JSON listing endpoints.

## <span style= "color:SeaGreen">Libraries</span>

In [1]:
# Import Library
import os
import random
import time

import numpy as np
import pandas as pd
import requests


# Display setting: let pandas render up to 200 columns before truncating
pd.options.display.max_columns = 200

In [2]:
# Optional: inject a CSS override so the notebook, menu bar and toolbar
# containers each take 80% of the page width.
from IPython.display import display, HTML

NOTEBOOK_WIDTH_CSS = '''
<style>
div#notebook-container    { width: 80%; }
div#menubar-container     { width: 80%; }
div#maintoolbar-container { width: 80%; }
</style>
'''
display(HTML(data=NOTEBOOK_WIDTH_CSS))

## <span style= "color:SeaGreen">User Defined Functions</span>

In [3]:
def extraction_api(url, subreddit, page_pull, col_2_nan):
    '''
    Function: Pull listing pages from a sub-reddit `.json` endpoint and merge the
    posts into `datasets/<subreddit>.csv`. The CSV is rewritten after every page,
    so an interrupted run keeps the pages fetched so far.

    Arguments:
    arg1 [string]: URL to pull from - must be .json
    arg2 [string]: Base name of the CSV file kept under `datasets/`
    arg3 [int]: Maximum number of pages to pull data from
    arg4 [string]: 1 column name to replace '' as np.nan

    Returns:
    [DataFrame]: The content of `datasets/<subreddit>.csv` once the run has
    stopped (all pages pulled, end of listing reached, or a non-200 response).
    An empty DataFrame is returned when nothing has ever been saved.

    Raises:
    KeyError: if `col_2_nan`, 'title', 'selftext' or 'created_utc' is not a
    column of the merged posts.
    '''
    data_dir = 'datasets'
    csv_path = data_dir + '/' + subreddit + '.csv'
    # The very first run may start without the output folder.
    os.makedirs(data_dir, exist_ok=True)

    posts = []
    after = None
    status_ok = True

    for a in range(page_pull):
        if after is None:
            current_url = url
        else:
            # Keep the URL valid even if the caller already supplied a query string.
            separator = '&' if '?' in url else '?'
            current_url = url + separator + 'after=' + after
        print(f'No.: {a +1} pull & url: {current_url}')
        # A timeout stops a stalled connection from hanging the notebook forever.
        res = requests.get(current_url, headers={'User-agent': 'Pony Inc 1.0'}, timeout=30)

        if res.status_code != 200:
            print('Status error', res.status_code)
            status_ok = False
            break

        # Pulling from Reddit
        current_dict = res.json()
        posts.extend(p['data'] for p in current_dict['data']['children'])
        after = current_dict['data']['after']

        # Saving extraction into dataframe (skipped while nothing has been fetched)
        if posts:
            prev_posts = _read_saved_posts(csv_path)

            # Fresh posts go first so that keep='first' prefers them over saved copies.
            frames = [pd.DataFrame(posts)]
            if not prev_posts.empty:
                frames.append(prev_posts)
            current_df = pd.concat(frames, ignore_index=True)

            # Blank out '' BEFORE de-duplicating: rows reloaded from the CSV hold
            # NaN where fresh rows hold '', and the two would not match otherwise.
            current_df[col_2_nan] = current_df[col_2_nan].replace('', np.nan)

            ## remove duplicates, sort by latest post and save file
            current_df = (
                current_df
                .drop_duplicates(subset=['title', 'selftext'], keep='first')
                .sort_values(by='created_utc', ascending=False)
                .reset_index(drop=True)
            )
            current_df.to_csv(csv_path, index=False)
            print(f'Len of old_list: {len(prev_posts)} & len of new_list: {len(current_df)}')

        # generate a random sleep duration to look more 'natural'
        sleep_duration = random.randint(2,5)
        print(f'Sleep Duration: {sleep_duration}')
        print('-' * 50)
        time.sleep(sleep_duration)

        # Without an `after` token the next request would fetch the first page again.
        if after is None:
            print('No further pages in the listing')
            break

    if status_ok:
        print('Extraction completed')

    # Always hand back what is on disk so the result has the same form
    # whether the loop finished or stopped early.
    return _read_saved_posts(csv_path)


def _read_saved_posts(csv_path):
    '''Return the posts saved at `csv_path`, or an empty DataFrame if there are none yet.'''
    try:
        return pd.read_csv(csv_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return pd.DataFrame()

## <span style= "color:SeaGreen">Extraction of 1st URL</span>

In [4]:
# Pull up to 30 listing pages of r/investing; blank 'selftext' values become NaN
url1 = 'https://www.reddit.com/r/investing.json'
iv = extraction_api(
    url=url1,
    subreddit='investing',
    page_pull=30,
    col_2_nan='selftext',
)

No.: 1 pull & url: https://www.reddit.com/r/investing.json
Len of old_list: 599 & len of new_list: 624
Sleep Duration: 3
--------------------------------------------------
No.: 2 pull & url: https://www.reddit.com/r/investing.json?after=t3_jgvh3c
Len of old_list: 624 & len of new_list: 648
Sleep Duration: 2
--------------------------------------------------
No.: 3 pull & url: https://www.reddit.com/r/investing.json?after=t3_jg8emr
Len of old_list: 648 & len of new_list: 672
Sleep Duration: 3
--------------------------------------------------
No.: 4 pull & url: https://www.reddit.com/r/investing.json?after=t3_jfjksz
Len of old_list: 672 & len of new_list: 696
Sleep Duration: 4
--------------------------------------------------
No.: 5 pull & url: https://www.reddit.com/r/investing.json?after=t3_jebaye
Len of old_list: 696 & len of new_list: 720
Sleep Duration: 2
--------------------------------------------------
No.: 6 pull & url: https://www.reddit.com/r/investing.json?after=t3_je0jly
L

In [8]:
# Sanity check on r/investing: repeated (title, selftext) pairs and missing selftext
iv_duplicate_count = iv.duplicated(subset=['title', 'selftext']).sum()
iv_null_count = iv['selftext'].isnull().sum()
print(f" The number of duplicated posts: {iv_duplicate_count}")
print(f" The number of null posts: {iv_null_count}")

 The number of duplicated posts: 0
 The number of null posts: 0


In [9]:
# Remove repeated posts, then posts without selftext (image-only posts)
print(f'shape before dropping: {iv.shape}')
iv = (
    iv
    .drop_duplicates(subset=['title', 'selftext'], keep='first')
    .dropna(axis=0, how='any', subset=['selftext'])
)
print(f'shape after dropping: {iv.shape}')

shape before dropping: (1090, 103)
shape after dropping: (1090, 103)


In [10]:
# Preview the first five r/investing posts
iv.head(n=5)

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,downs,top_awarded_type,hide_score,name,quarantine,link_flair_text_color,upvote_ratio,author_flair_background_color,subreddit_type,ups,total_awards_received,media_embed,author_flair_template_id,is_original_content,user_reports,secure_media,is_reddit_media_domain,is_meta,category,secure_media_embed,link_flair_text,can_mod_post,score,approved_by,author_premium,thumbnail,edited,author_flair_css_class,author_flair_richtext,gildings,content_categories,is_self,mod_note,created,link_flair_type,wls,removed_by_category,banned_by,author_flair_type,domain,allow_live_comments,selftext_html,likes,suggested_sort,banned_at_utc,view_count,archived,no_follow,is_crosspostable,pinned,over_18,all_awardings,awarders,media_only,can_gild,spoiler,locked,author_flair_text,treatment_tags,visited,removed_by,num_reports,distinguished,subreddit_id,mod_reason_by,removal_reason,link_flair_background_color,id,is_robot_indexable,report_reasons,author,discussion_type,num_comments,send_replies,whitelist_status,contest_mode,mod_reports,author_patreon_flair,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,author_cakeday
0,,investing,Technical analysis on Tesla for the week. We ...,t2_lj3n9,False,,0,False,Tesla Weekly Analysis - Week ending 10/24/2020,[],r/investing,False,6,,0,,True,t3_jh0aqu,False,dark,1.0,,public,1,0,{},,False,[],,False,False,,{},,False,1,,False,,False,,[],{},,True,,1603532000.0,text,6,,,text,self.investing,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,,,,False,False,False,False,False,[],[],False,False,False,False,,[],False,,,,t5_2qhhq,,,,jh0aqu,True,,rexmakesbeats,,1,True,all_ads,False,[],False,,/r/investing/comments/jh0aqu/tesla_weekly_anal...,all_ads,False,https://www.reddit.com/r/investing/comments/jh...,1185618,1603503000.0,0,,False,
1,,investing,"Published 18 years ago, I was hesitant this bo...",t2_53g7qwfc,False,,0,False,Book review: Investing In Biotech,[],r/investing,False,6,,0,,True,t3_jh0a9u,False,dark,1.0,,public,1,0,{},,False,[],,False,False,,{},,False,1,,False,,False,,[],{},,True,,1603532000.0,text,6,,,text,self.investing,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,,,,False,False,False,False,False,[],[],False,False,False,False,,[],False,,,,t5_2qhhq,,,,jh0a9u,True,,jdybka,,1,True,all_ads,False,[],False,,/r/investing/comments/jh0a9u/book_review_inves...,all_ads,False,https://www.reddit.com/r/investing/comments/jh...,1185618,1603503000.0,0,,False,
2,,investing,I'm willing to take some risk on all portfolio...,t2_rv3pk,False,,0,False,Need some advice for Porfollios,[],r/investing,False,6,,0,,True,t3_jh08np,False,dark,1.0,,public,1,0,{},,False,[],,False,False,,{},,False,1,,False,,False,,[],{},,True,,1603532000.0,text,6,,,text,self.investing,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,,,,False,False,False,False,False,[],[],False,False,False,False,,[],False,,,,t5_2qhhq,,,,jh08np,True,,louissanchez84,,1,True,all_ads,False,[],False,,/r/investing/comments/jh08np/need_some_advice_...,all_ads,False,https://www.reddit.com/r/investing/comments/jh...,1185618,1603503000.0,0,,False,
3,,investing,I've posted this on r/wallstreetbets a couple ...,t2_1e3atzjp,False,,0,False,New(?) investing strategy?,[],r/investing,False,6,,0,,True,t3_jgzoym,False,dark,0.4,,public,0,0,{},,False,[],,False,False,,{},,False,0,,False,,False,,[],{},,True,,1603530000.0,text,6,,,text,self.investing,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,,,,False,False,False,False,False,[],[],False,False,False,False,,[],False,,,,t5_2qhhq,,,,jgzoym,True,,ttyler1789,,4,True,all_ads,False,[],False,,/r/investing/comments/jgzoym/new_investing_str...,all_ads,False,https://www.reddit.com/r/investing/comments/jg...,1185618,1603501000.0,1,,False,
4,,investing,I also asked r/stocks but it's probably better...,t2_4uwlv6py,False,,0,False,"For those of you that invest in ""big name"" EV ...",[],r/investing,False,6,,0,,True,t3_jgz3pd,False,dark,1.0,,public,1,0,{},,False,[],,False,False,,{},,False,1,,False,,False,,[],{},,True,,1603527000.0,text,6,,,text,self.investing,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,,,,False,False,False,False,False,[],[],False,False,False,False,,[],False,,,,t5_2qhhq,,,,jgz3pd,True,,ToKeepAndToHoldForev,,9,True,all_ads,False,[],False,,/r/investing/comments/jgz3pd/for_those_of_you_...,all_ads,False,https://www.reddit.com/r/investing/comments/jg...,1185618,1603499000.0,0,,False,True


## <span style= "color:SeaGreen">Extraction of 2nd URL</span>

In [11]:
# Pull up to 30 listing pages of r/personalfinance; blank 'selftext' values become NaN
url2 = 'https://www.reddit.com/r/personalfinance.json'
pf = extraction_api(
    url=url2,
    subreddit='personal_finance',
    page_pull=30,
    col_2_nan='selftext',
)

No.: 1 pull & url: https://www.reddit.com/r/personalfinance.json
Len of old_list: 1159 & len of new_list: 1182
Sleep Duration: 2
--------------------------------------------------
No.: 2 pull & url: https://www.reddit.com/r/personalfinance.json?after=t3_jgymnd
Len of old_list: 1182 & len of new_list: 1207
Sleep Duration: 3
--------------------------------------------------
No.: 3 pull & url: https://www.reddit.com/r/personalfinance.json?after=t3_jgzke1
Len of old_list: 1207 & len of new_list: 1232
Sleep Duration: 2
--------------------------------------------------
No.: 4 pull & url: https://www.reddit.com/r/personalfinance.json?after=t3_jgxzuh
Len of old_list: 1232 & len of new_list: 1257
Sleep Duration: 2
--------------------------------------------------
No.: 5 pull & url: https://www.reddit.com/r/personalfinance.json?after=t3_jgwvef
Len of old_list: 1257 & len of new_list: 1282
Sleep Duration: 4
--------------------------------------------------
No.: 6 pull & url: https://www.reddi

In [12]:
# Sanity check on r/personalfinance: repeated (title, selftext) pairs and missing selftext
pf_duplicate_count = pf.duplicated(subset=['title', 'selftext']).sum()
pf_null_count = pf['selftext'].isnull().sum()
print(f" The number of duplicated posts: {pf_duplicate_count}")
print(f" The number of null posts: {pf_null_count}")

 The number of duplicated posts: 0
 The number of null posts: 5


In [13]:
# Remove repeated posts, then posts without selftext (image-only posts)
print(f'shape before dropping: {pf.shape}')
pf = (
    pf
    .drop_duplicates(subset=['title', 'selftext'], keep='first')
    .dropna(axis=0, how='any', subset=['selftext'])
)
print(f'shape after dropping: {pf.shape}')

shape before dropping: (1906, 107)
shape after dropping: (1901, 107)


In [14]:
# Preview the first five r/personalfinance posts
pf.head(n=5)

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,downs,top_awarded_type,hide_score,name,quarantine,link_flair_text_color,upvote_ratio,author_flair_background_color,subreddit_type,ups,total_awards_received,media_embed,author_flair_template_id,is_original_content,user_reports,secure_media,is_reddit_media_domain,is_meta,category,secure_media_embed,link_flair_text,can_mod_post,score,approved_by,author_premium,thumbnail,edited,author_flair_css_class,author_flair_richtext,gildings,content_categories,is_self,mod_note,created,link_flair_type,wls,removed_by_category,banned_by,author_flair_type,domain,allow_live_comments,selftext_html,likes,suggested_sort,banned_at_utc,view_count,archived,no_follow,is_crosspostable,pinned,over_18,all_awardings,awarders,media_only,link_flair_template_id,can_gild,spoiler,locked,author_flair_text,treatment_tags,visited,removed_by,num_reports,distinguished,subreddit_id,mod_reason_by,removal_reason,link_flair_background_color,id,is_robot_indexable,report_reasons,author,discussion_type,num_comments,send_replies,whitelist_status,contest_mode,mod_reports,author_patreon_flair,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,author_cakeday,crosspost_parent_list,url_overridden_by_dest,crosspost_parent
0,,personalfinance,\n\nHello all. I am currently interviewing for...,t2_13qifymb,False,,0,False,Negotiating pay for a job offer,[],r/personalfinance,False,6,Employment,0,,True,t3_jh0fk5,False,light,1.0,,public,1,0,{},,False,[],,False,False,,{},Employment,False,1,,False,,False,,[],{},,True,,1603533000.0,text,6,,,text,self.personalfinance,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,,,,False,False,False,False,False,[],[],False,0bdd11d2-c078-11e4-aeac-22000b3d8247,False,False,False,​,[],False,,,,t5_2qstm,,,#8233b7,jh0fk5,True,,laurabell114,,0,True,all_ads,False,[],False,dark,/r/personalfinance/comments/jh0fk5/negotiating...,all_ads,False,https://www.reddit.com/r/personalfinance/comme...,14205371,1603504000.0,0,,False,,,,
1,,personalfinance,Where do I start? I'm really new on this world...,t2_5pbbv31d,False,,0,False,How to start investing in the market stocks?,[],r/personalfinance,False,6,Investing,0,,True,t3_jh07zq,False,light,0.75,,public,2,0,{},,False,[],,False,False,,{},Investing,False,2,,False,,False,,[],{},,True,,1603532000.0,text,6,,,text,self.personalfinance,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,,,,False,True,False,False,False,[],[],False,1c57f8a6-c078-11e4-9e41-22000b39cb96,False,False,False,​,[],False,,,,t5_2qstm,,,#da333f,jh07zq,True,,The_Freeman_10,,2,True,all_ads,False,[],False,dark,/r/personalfinance/comments/jh07zq/how_to_star...,all_ads,False,https://www.reddit.com/r/personalfinance/comme...,14205371,1603503000.0,0,,False,,,,
2,,personalfinance,Hi everyone\n\nWe are in the middle of closing...,t2_75ty2fnn,False,,0,False,New Home Purchase - Mortgage Underwriting/Clos...,[],r/personalfinance,False,6,Housing,0,,True,t3_jh07p8,False,light,1.0,,public,1,0,{},,False,[],,False,False,,{},Housing,False,1,,False,,1603503377.0,,[],{},,True,,1603532000.0,text,6,,,text,self.personalfinance,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,,,,False,False,False,False,False,[],[],False,1033dbd0-c078-11e4-b0f1-22000b3d8247,False,False,False,​,[],False,,,,t5_2qstm,,,#c313d3,jh07p8,True,,TheTuxdude,,1,True,all_ads,False,[],False,dark,/r/personalfinance/comments/jh07p8/new_home_pu...,all_ads,False,https://www.reddit.com/r/personalfinance/comme...,14205371,1603503000.0,0,,False,,,,
3,,personalfinance,Hi - I made too much money (over $137k limit) ...,t2_2rb26qa2,False,,0,False,Made too much money for Roth IRA (commission j...,[],r/personalfinance,False,6,Retirement,0,,True,t3_jh004b,False,dark,1.0,,public,1,0,{},,False,[],,False,False,,{},Retirement,False,1,,False,,False,,[],{},,True,,1603531000.0,text,6,,,text,self.personalfinance,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,,,,False,False,False,False,False,[],[],False,,False,False,False,​,[],False,,,,t5_2qstm,,,,jh004b,True,,SalesGuyBurnerAcct,,2,True,all_ads,False,[],False,dark,/r/personalfinance/comments/jh004b/made_too_mu...,all_ads,False,https://www.reddit.com/r/personalfinance/comme...,14205371,1603502000.0,0,,False,,,,
4,,personalfinance,So I was thinking of getting a car that qualif...,t2_u9i67zc,False,,0,False,Plug in hybrid tax credit,[],r/personalfinance,False,6,Auto,0,,True,t3_jh000d,False,light,1.0,,public,1,0,{},,False,[],,False,False,,{},Auto,False,1,,False,,False,,[],{},,True,,1603531000.0,text,6,,,text,self.personalfinance,False,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",,,,,False,False,False,False,False,[],[],False,ea2c76fe-c077-11e4-afea-22000bb2c1d0,False,False,False,​,[],False,,,,t5_2qstm,,,#19a53f,jh000d,True,,Stryker3414,,3,True,all_ads,False,[],False,dark,/r/personalfinance/comments/jh000d/plug_in_hyb...,all_ads,False,https://www.reddit.com/r/personalfinance/comme...,14205371,1603502000.0,0,,False,,,,


## <span style= "color:SeaGreen">Combine & Save File</span>

In [15]:
# Collect each frame's column names as a set and report both shapes
iv_cols, pf_cols = set(iv.columns), set(pf.columns)
print(f'Shape of iv: {iv.shape}')
print(f'Shape of pf: {pf.shape}')

Shape of iv: (1090, 103)
Shape of pf: (1901, 107)


In [16]:
# Which columns exist only in pf? Are they relevant?
pf_cols - iv_cols

{'crosspost_parent',
 'crosspost_parent_list',
 'link_flair_template_id',
 'url_overridden_by_dest'}

In [17]:
# Columns present in both frames
retain_cols = iv_cols & pf_cols

In [18]:
# Keep only the columns both frames share. pandas 2.0 no longer accepts a set
# as an indexer, and a set has no defined order, so build a list that follows
# iv's column order and use it for both frames.
shared_cols = [col for col in iv.columns if col in retain_cols]
iv = iv[shared_cols]
pf = pf[shared_cols]

In [19]:
# Stack the two subreddits into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
print(f'Num rows in iv: {iv.shape[0]}, Num rows in pf: {pf.shape[0]}')
combined = pd.concat([iv, pf], ignore_index=True)
print(f'combined shape: {combined.shape[0]}')

Num rows in iv: 1090, Num rows in pf: 1901
combined shape: 2991


In [20]:
# Persist the combined posts without writing the row index
combined_csv_path = 'datasets/combined.csv'
combined.to_csv(combined_csv_path, index=False)