# DATA COLLECTION
The two subreddits of choice are:
- PC Gaming (https://www.reddit.com/r/pcgaming/)
- Console Gaming (https://www.reddit.com/r/consoles/)


Because the base Pushshift API caps results at 1000 posts and that limit proved difficult to bypass, PMAW (Pushshift Multithread API Wrapper) was used to request up to 10000 posts from each subreddit.

The raw data requested are stored in pcgaming.csv and consoles.csv, while the data of interest has been extracted from each set, combined, and stored in submissions.csv.

In [1]:
# pip install pmaw
## Installed PMAW API wrapper for pushshift.io

In [2]:
# Initial imports
import pandas as pd
from pmaw import PushshiftAPI

INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [None]:
# # ORIGINAL LOOP FOR MULTIPLE PUSHSHIFT REQUESTS
# # WAS UNABLE TO FIGURE OUT HOW TO GET PAST 999 POSTS CAP
# # THEREFORE USING API WRAPPER INSTEAD

# # Method to requesting submissions

# def submissions(subreddit='', size=100, loop=1, file_name='untitled'):
#     url = 'https://api.pushshift.io/reddit/search/submission'
#     data = []
#     before = None
#     count = 0
    
#     # Repeat request
#     for i in range(loop):
        
#         # Parameters defined
#         params = {
#         'subreddit': subreddit,
#         'size': size,
#         'before': before
#         } 
        
#         # Request data
#         res = requests.get(url, params)
        
#         # End process and update if request fails
#         if res.status_code != 200:
#             return "Request status not 200"
#             break

#         # Store data requests
#         data.extend(res.json()['data'])
        
#         # Store date of last post "['created_utc'].iloc[-1]"
#         # Updates 'before' parameter each request for older posts
        
#         # End loop if we've reached the end of the subreddit
#         if before == pd.DataFrame(res.json()['data'])['created_utc'].iloc[-1]:
#             print(f'Reached the end of subreddit at {len(data)} posts.')
#             break
#         else:
#             before = pd.DataFrame(res.json()['data'])['created_utc'].iloc[-1]
        
#         # Updates
#         if len(data) % 1000 == 0:
#             count += 1000
#             print(f'{count} submissions requested')
        
#         # Random pauses per request to avoid bot detection
#         random_time = random.randint(2,10)
#         time.sleep(random_time)
    
#     # Raw data converted to DataFrame and stored into CSV
#     df = pd.DataFrame(data)
#     df.to_csv(f'../data/{file_name}.csv')
    
#     # Update
#     print("Request complete.")

In [7]:
# Client for Pushshift, provided by the PMAW wrapper
api = PushshiftAPI()

# Options shared by both requests: up to 10000 posts each, with
# 'mem_safe=True' to avoid a memory error due to the large request
search_options = {'limit': 10000, 'mem_safe': True}

# Request submissions from each subreddit
pcgaming_raw = api.search_submissions(subreddit="pcgaming", **search_options)
consoles_raw = api.search_submissions(subreddit="consoles", **search_options)

# Load the raw responses into DataFrames
pcgaming, consoles = pd.DataFrame(pcgaming_raw), pd.DataFrame(consoles_raw)

# Export the raw data to CSV
pcgaming.to_csv('../data/pcgaming.csv')
consoles.to_csv('../data/consoles.csv')

INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 100.00% - Requests: 100 - Batches: 10 - Items Remaining: 124
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 102 - Batches: 11 - Items Remaining: 0
INFO:pmaw.PushshiftAPIBase:4538 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 95.00% - Requests: 20 - Batches: 3 - Items Remaining: 4241
INFO:pmaw.PushshiftAPIBase:1 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 97.37% - Requests: 38 - Batches: 6 - Items Remaining: 2835
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 97.92% - Requests: 48 - Batches: 7 - Items Remaining: 2352
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 98.28% - Requests: 58 - Batches: 8 - Items Remaining: 1847
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 98.53% - Requests: 68 - Batches: 9 - Items Remaining: 1162
INFO:pmaw.PushshiftAPIBase:Checkpoint:: Success Rate: 98.72% - Requests: 78 - Batches: 10 - Items Remaining: 6

In [19]:
# Check that every post is unique: each frame's shape is printed before and
# after dropping duplicates, so matching shapes mean no duplicate rows.
# Values are cast to str before de-duplicating (presumably because some
# columns hold list values that cannot be compared directly).

for heading, frame in (('PC GAMING:', pcgaming), ('CONSOLE GAMING', consoles)):
    print(heading)
    print(frame.shape)
    print(frame.astype(str).drop_duplicates().shape)

## Seems console gaming's subreddit only had a total of 5461 posts

PC GAMING:
(10000, 79)
(10000, 79)
CONSOLE GAMING
(5461, 98)
(5461, 98)


In [26]:
# Quick look at the first rows of the raw r/pcgaming submissions
pcgaming.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,secure_media,secure_media_embed,author_flair_template_id,suggested_sort,is_created_from_ads_ui,author_is_blocked,banned_by,author_cakeday,edited,link_flair_template_id
0,[],False,cedricxcs,,[],,text,t2_6cnvxl3f,False,False,...,,,,,,,,,,
1,[],False,ImCoolAlmost,,[],,text,t2_r09stcm,False,False,...,,,,,,,,,,
2,[],False,fastforward23,,[],,text,t2_aqhvu,False,False,...,,,,,,,,,,
3,[],False,cheese_grater5005,,[],,text,t2_774oapey,False,False,...,,,,,,,,,,
4,[],False,SuryaPandurangi,,[],,text,t2_1kxz4c7s,False,False,...,,,,,,,,,,


In [27]:
# Quick look at the first rows of the raw r/consoles submissions
consoles.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,og_title,removed_by,brand_safe,rte_mode,gilded,author_id,author_created_utc,approved_at_utc,banned_at_utc,view_count
0,[],False,PinkiPai222,,[],,text,t2_8eqfeb60,False,False,...,,,,,,,,,,
1,[],False,darren_mcweeden,,[],,text,t2_s1anl,False,False,...,,,,,,,,,,
2,[],False,Acceptable-Big-8654,,[],,text,t2_5go120a8,False,False,...,,,,,,,,,,
3,[],False,tekguy1982,,[],,text,t2_bojgf8t5,False,False,...,,,,,,,,,,
4,[],False,ccaiiden,,[],,text,t2_4wuv6ne7,False,False,...,,,,,,,,,,


In [28]:
# List the column names of the raw r/pcgaming data to pick the fields of interest
pcgaming.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_text', 'link_flair_text_color',
       'link_flair_type', 'locked', 'media_only', 'no_follow', 'num_comments',
       'num_crossposts', 'over_18', 'parent_whitelist_status', 'permalink',
       'pinned', 'pwls', 'removed_by_category', 'retrieved_on', 'score',
       'selftext', 'send_replies', 'spoiler', 'stickied', 'subreddit',
       'subreddit_id', 'subreddit_subscribers', 'subreddit_type', 'thumbnail',
       '

**Took a look at the column names and the JSON-formatted URLs (https://api.pushshift.io/reddit/search/submission?subreddit=consoles and https://api.pushshift.io/reddit/search/submission?subreddit=pcgaming) to identify the columns of interest:**
- subreddit
- title
- selftext
- url

**The title and selftext contain the most information and will be used for natural language processing.**

In [85]:
# Keep only the preferred columns: the subreddit name and the two text fields
text_columns = ['subreddit', 'title', 'selftext']
df_pc = pcgaming[text_columns]
df_cons = consoles[text_columns]

In [86]:
# Review the extracted r/pcgaming columns: print the shape, then preview the first rows
print(df_pc.shape)
df_pc.head()

(10000, 3)


Unnamed: 0,subreddit,title,selftext
0,pcgaming,Cat loves to jump on my pc,[removed]
1,pcgaming,need help with hacked epic account,[removed]
2,pcgaming,Auto HDR Preview for PC Available Today | Dire...,
3,pcgaming,Accidentally pissed on my gaming pc now im rea...,[removed]
4,pcgaming,Are Black Ops 1 and WaW Active on PC,"Hi all,\n\n Are the Classic Black Ops titles a..."


In [74]:
print(df_cons.shape)
df_cons.head()

(5461, 3)


Unnamed: 0,subreddit,title,selftext
0,consoles,I want to sell 2x Never opened PlayStation Vit...,[removed]
1,consoles,Was finally able to set some of these babies up,
2,consoles,Ps5 trade Xbox series x,
3,consoles,Bought My First Ever Playsation,
4,consoles,Xbox series s or ps4 pro,I’m deciding wether to get a xbox ss or a ps4 ...


**Although some selftext values are blank or removed, the title is still useful.**

In [82]:
# Stack the two subreddit frames into one with a fresh 0..n-1 index
frames = [df_pc, df_cons]
submissions = pd.concat(frames, ignore_index=True)

# Export the combined data without writing the index column
submissions.to_csv('../data/submissions.csv', index=False)

print(submissions.shape)

(15461, 3)
