# Retrieve discussions from the databases

In [1]:
import networkx as nx
from fa2 import ForceAtlas2  #the package should be installed before
import matplotlib.pyplot as plt
import datetime
from statistics import mean, median, quantiles
from networkx.algorithms.traversal.breadth_first_search import descendants_at_distance
import time
import requests
import glob
import pickle
import json
import zstandard
import pandas as pd

In [2]:
#zst_files: it's the list of the paths to the zst compressed files  
#subreddit_list: its a list of the subreddit names you want to scrape

In [3]:
def decompose_zstd_streaming(zst_files, subreddits_list):
    for file_no,zst_file_path in enumerate(zst_files):
        slist=[]
        df_out=pd.DataFrame()
        count=0
        chunk_count=0
        iteration=0
        program_starts = time.time()
        name_aux = zst_file_path.split('/')[-1]
        name=name_aux.split('.')[0]
        print('\nProcessing File:',name,'  ', file_no+1,'/',len(zst_files))
        with open(zst_file_path, 'rb') as fh:
            dctx = zstandard.ZstdDecompressor(max_window_size=2147483648)
            with dctx.stream_reader(fh) as reader:
                previous_line = ""
                while True:
                    now = time.time()
                    chunk = reader.read(2**24)  # 16mb chunks
                    chunk_count+=1
                    if not chunk:
                        break

                    string_data = chunk.decode('utf-8')
                    lines = string_data.split("\n")
                    for i, line in enumerate(lines[:-1]):
                        if i == 0:
                            line = previous_line + line
                        object_chunk = json.loads(line)
                        count+=1

                        if any(object_chunk['subreddit'] in s for s in subreddits_list):
                            slist.append(object_chunk)

                    if divmod(count,5000000)[0]>iteration:
                        iteration=divmod(count,5000000)[0]
                        if iteration%5==0:
                            df_out=pd.DataFrame(slist)
                            df_out.to_csv(name+"_"+str(iteration)+".csv")
                            slist=[]
                        print('')
                        print("|t:",str(datetime.timedelta(seconds=(now - program_starts))).split('.')[0],
                              '|\t|Saved Rows:',len(slist)/1000,'K',
                              '|\t|Raw Rows:',count/1000,'K',
                              '|\t|',chunk_count*16,'MB Proccesed|')

                    #if count > 800000: break
                    previous_line = lines[-1]


        df_out=pd.DataFrame(slist)
        df_out.to_csv(f'{name}.csv')

In [4]:
file_paths = glob.glob("monthly_archive/RC*.zst")

In [5]:
file_paths

['monthly_archive/RC_2019-08.zst',
 'monthly_archive/RC_2019-09.zst',
 'monthly_archive/RC_2019-07.zst',
 'monthly_archive/RC_2019-12.zst',
 'monthly_archive/RC_2019-10.zst',
 'monthly_archive/RC_2019-11.zst']

In [6]:
decompose_zstd_streaming(file_paths,["politics","PoliticalDiscussion","puppies"])


Processing File: RC_2019-08    1 / 6

|t: 0:02:09 |	|Saved Rows: 121.022 K |	|Raw Rows: 5000.093 K |	| 5936 MB Proccesed|

|t: 0:04:18 |	|Saved Rows: 183.524 K |	|Raw Rows: 10012.71 K |	| 11872 MB Proccesed|

|t: 0:06:25 |	|Saved Rows: 234.529 K |	|Raw Rows: 15007.607 K |	| 17760 MB Proccesed|

|t: 0:08:34 |	|Saved Rows: 325.03 K |	|Raw Rows: 20000.359 K |	| 23664 MB Proccesed|

|t: 0:10:44 |	|Saved Rows: 0.0 K |	|Raw Rows: 25001.839 K |	| 29616 MB Proccesed|

|t: 0:13:15 |	|Saved Rows: 79.901 K |	|Raw Rows: 30007.518 K |	| 35584 MB Proccesed|

|t: 0:15:26 |	|Saved Rows: 154.226 K |	|Raw Rows: 35005.133 K |	| 41536 MB Proccesed|

|t: 0:17:37 |	|Saved Rows: 228.565 K |	|Raw Rows: 40002.461 K |	| 47472 MB Proccesed|

|t: 0:19:45 |	|Saved Rows: 304.615 K |	|Raw Rows: 45003.85 K |	| 53408 MB Proccesed|

|t: 0:21:56 |	|Saved Rows: 0.0 K |	|Raw Rows: 50006.952 K |	| 59312 MB Proccesed|

|t: 0:24:25 |	|Saved Rows: 63.603 K |	|Raw Rows: 55005.051 K |	| 65232 MB Proccesed|

|t: 0:26:33 |	|Save


|t: 0:29:12 |	|Saved Rows: 261.285 K |	|Raw Rows: 45006.955 K |	| 55248 MB Proccesed|

|t: 0:32:28 |	|Saved Rows: 0.0 K |	|Raw Rows: 50001.012 K |	| 61392 MB Proccesed|

|t: 0:36:08 |	|Saved Rows: 87.18 K |	|Raw Rows: 55012.33 K |	| 67552 MB Proccesed|

|t: 0:39:14 |	|Saved Rows: 208.127 K |	|Raw Rows: 60011.606 K |	| 73680 MB Proccesed|

|t: 0:42:25 |	|Saved Rows: 291.366 K |	|Raw Rows: 65002.359 K |	| 79808 MB Proccesed|

|t: 0:45:34 |	|Saved Rows: 342.444 K |	|Raw Rows: 70002.191 K |	| 85920 MB Proccesed|

|t: 0:48:54 |	|Saved Rows: 0.0 K |	|Raw Rows: 75007.818 K |	| 92048 MB Proccesed|

|t: 0:52:41 |	|Saved Rows: 65.55 K |	|Raw Rows: 80008.843 K |	| 98208 MB Proccesed|

|t: 0:55:58 |	|Saved Rows: 173.547 K |	|Raw Rows: 85011.787 K |	| 104368 MB Proccesed|

|t: 0:59:17 |	|Saved Rows: 394.833 K |	|Raw Rows: 90001.053 K |	| 110496 MB Proccesed|

|t: 1:02:34 |	|Saved Rows: 515.286 K |	|Raw Rows: 95004.695 K |	| 116640 MB Proccesed|

|t: 1:05:45 |	|Saved Rows: 0.0 K |	|Raw Rows: 100003

In [15]:
df = pd.read_csv(file_paths[0])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [16]:
df

Unnamed: 0.1,Unnamed: 0,all_awardings,archived,associated_award,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_template_id,author_flair_text,...,top_awarded_type,total_awards_received,treatment_tags,author_flair_richtext,author_flair_type,author_fullname,author_patreon_flair,author_premium,author_cakeday,editable
0,0,[],False,,[deleted],,,,,,...,,0.0,[],,,,,,,
1,1,[],False,,Orwick,1.391507e+09,,,,,...,,0.0,[],[],text,t2_f47wy,False,False,,
2,2,[],False,,[deleted],,,,,,...,,0.0,[],,,,,,,
3,3,[],False,,d_j_smith,1.362153e+09,,,,,...,,0.0,[],[],text,t2_as190,False,False,,
4,4,[],False,,[deleted],,,,,,...,,0.0,[],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156806,1134737,[],False,,Timberwolve17,1.499812e+09,,,,,...,,0.0,[],[],text,t2_1q6e51,False,False,,
1156807,1134738,[],False,,feetpicsbot,1.493235e+09,,,,,...,,0.0,[],[],text,t2_17fmy5,False,False,,
1156808,1134739,[],False,,[deleted],,,,,,...,,0.0,[],,,,,,,
1156809,1134740,[],False,,glibgloby,1.540714e+09,#edeff1,california-flag,3d538282-8e71-11e6-bf2e-0e0d983a7ee7,:flag-ca: California,...,,0.0,[],"[{'a': ':flag-ca:', 'e': 'emoji', 'u': 'https:...",richtext,t2_2hsfqfpj,False,False,,


In [7]:
df.columns

Index(['Unnamed: 0', 'all_awardings', 'associated_award', 'author',
       'author_created_utc', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_fullname',
       'awarders', 'body', 'can_gild', 'can_mod_post', 'collapsed',
       'collapsed_because_crowd_control', 'collapsed_reason',
       'controversiality', 'created_utc', 'distinguished', 'edited', 'gilded',
       'gildings', 'id', 'is_submitter', 'link_id', 'locked', 'no_follow',
       'parent_id', 'permalink', 'quarantined', 'removal_reason',
       'retrieved_on', 'score', 'send_replies', 'stickied', 'subreddit',
       'subreddit_id', 'subreddit_name_prefixed', 'subreddit_type',
       'top_awarded_type', 'total_awards_received', 'treatment_tags',
       'author_flair_richtext', 'author_flair_type', 'author_patreon_flair',
       'author_premium', 'author_cakeday', 'comment_type'],
      dtype='object')

In [18]:
df_cleaned = df[["score", "author_fullname","body","created_utc","id","parent_id","subreddit", "subreddit_id"]]
df_cleaned.head()

Unnamed: 0,score,author_fullname,body,created_utc,id,parent_id,subreddit,subreddit_id
0,1,,[removed],1622505600,h04xy3e,t3_npb971,politics,t5_2cneq
1,3,t2_f47wy,A lot of states are that way. Not paying your ...,1622505600,h04xy3i,t1_h04vks7,politics,t5_2cneq
2,1,,[removed],1622505601,h04xy77,t1_h04vrep,politics,t5_2cneq
3,7,t2_as190,They haven't stopped the tide of prosecutions ...,1622505602,h04xy83,t1_h04801t,politics,t5_2cneq
4,1,,[removed],1622505606,h04xyix,t3_npgn0r,politics,t5_2cneq


In [17]:
df["subreddit"].unique()

array(['politics', 'China', 'PoliticalDiscussion', 'China_Flu', 'puppies',
       'football', 'Discussion', 'foot', 'COVID19', 'pie', 'es', 'it',
       'ca', nan, 'pup', 'oot', 'uss', 'Poli'], dtype=object)

In [50]:
df_cleaned_sub = df_cleaned[ (df_cleaned["subreddit"]=="PoliticalDiscussion") & (df_cleaned["body"]!= "[removed]") & (df_cleaned["body"]!= "[deleted]")]
df_cleaned_sub["id"]="t1_"+df_cleaned_sub["id"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_sub["id"]="t1_"+df_cleaned_sub["id"]


In [51]:
df_cleaned_sub

Unnamed: 0,score,author_fullname,body,created_utc,id,parent_id,subreddit,subreddit_id
64,3,t2_8po26,New York City has actual houses with yards in ...,1622505772,t1_h04y9gl,t1_h01ju0s,PoliticalDiscussion,t5_2sfmf
81,9,t2_66wev,"It's a fair point, but you would ideally want ...",1622505814,t1_h04yc9b,t1_h04vwlw,PoliticalDiscussion,t5_2sfmf
82,1,t2_657qrcwo,The Democrats want their power bases to play t...,1622505816,t1_h04yccr,t3_np8fjs,PoliticalDiscussion,t5_2sfmf
116,17,t2_ibsfn,"They're on the same page for the most part, th...",1622505902,t1_h04yhy8,t3_np8fjs,PoliticalDiscussion,t5_2sfmf
161,8,t2_109kok4i,"35 out 211 house members, and 6 out of 48 sena...",1622506001,t1_h04yoh8,t1_h04xoxt,PoliticalDiscussion,t5_2sfmf
...,...,...,...,...,...,...,...,...
1156635,3,t2_ad6n7px9,"I feel the last example, someone publishing wi...",1625097325,t1_h3mfchn,t1_h3m9mf4,PoliticalDiscussion,t5_2sfmf
1156694,5,t2_2mrpiafa,"donations came from out of state, easy",1625097426,t1_h3mfjri,t1_h3mckq8,PoliticalDiscussion,t5_2sfmf
1156748,1,t2_9p37t2rb,&amp;#x200B;\n\n[https://nypost.com/2021/05/26...,1625097506,t1_h3mfpjb,t1_h3me759,PoliticalDiscussion,t5_2sfmf
1156783,2,t2_4dpohuv,&gt; The reason why people bring up the circum...,1625097555,t1_h3mfszq,t1_h3g3q3g,PoliticalDiscussion,t5_2sfmf


In [30]:
file_paths = glob.glob("monthly_archive/dataRetrieved/RS_2021-06*")

In [31]:
file_paths

['monthly_archive/dataRetrieved/RS_2021-06.csv']

In [32]:
df_1 = pd.read_csv(file_paths[0])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [33]:
df_1

Unnamed: 0.1,Unnamed: 0,all_awardings,allow_live_comments,archived,author,author_created_utc,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,...,post_hint,preview,gallery_data,is_gallery,media_metadata,crosspost_parent,crosspost_parent_list,author_cakeday,collections,poll_data
0,0,[],True,False,mr_mcpoogrundle,1.494086e+09,,,[],,...,,,,,,,,,,
1,1,[],True,False,duderos,1.441566e+09,,,[],,...,link,"{'enabled': False, 'images': [{'id': 'oB4SKhJG...",,,,,,,,
2,2,[],False,False,DoomsDaySloth,1.621893e+09,,,[],,...,,,,,,,,,,
3,3,[],False,False,[deleted],,,,,,...,,,,,,,,,,
4,4,[],False,False,[deleted],,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19566,19566,[],False,False,[deleted],,,,,,...,,,,,,,,,,
19567,19567,[],False,False,checkmak01,1.558846e+09,,,[],,...,,,,,,,,,,
19568,19568,[],False,False,MattsApocalypticLife,1.607570e+09,,,[],,...,,,,,,,,,,
19569,19569,[],False,False,[deleted],,,,,,...,,,,,,,,,,


In [34]:
df_1.columns

Index(['Unnamed: 0', 'all_awardings', 'allow_live_comments', 'archived',
       'author', 'author_created_utc', 'author_flair_background_color',
       'author_flair_css_class', 'author_flair_richtext',
       'author_flair_template_id', 'author_flair_text',
       'author_flair_text_color', 'author_flair_type', 'author_fullname',
       'author_patreon_flair', 'author_premium', 'can_gild', 'category',
       'content_categories', 'contest_mode', 'created_utc', 'discussion_type',
       'distinguished', 'domain', 'edited', 'gilded', 'gildings', 'hidden',
       'hide_score', 'id', 'is_created_from_ads_ui', 'is_crosspostable',
       'is_meta', 'is_original_content', 'is_reddit_media_domain',
       'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_template_id', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media',
       'media_embed', 'media_only', 'nam

In [44]:
df_1_cleaned = df_1[["score", "author_fullname","title","name","num_comments","subreddit_subscribers","created_utc","id","subreddit", "subreddit_id"]]
df_1_cleaned.head()

Unnamed: 0,score,author_fullname,title,name,num_comments,subreddit_subscribers,created_utc,id,subreddit,subreddit_id
0,9,t2_gjiw4c,Michael Flynn agreed a Myanmar-style coup shou...,t3_npgvrm,6,7577805,1622505718,npgvrm,politics,t5_2cneq
1,1227,t2_q7szq,Cheney fires back at Flynn over coup remark,t3_npgxwa,123,7577805,1622505897,npgxwa,politics,t5_2cneq
2,1,t2_cbbr6m5u,How does everyone feel about Bidens recent com...,t3_nphimg,2,1333821,1622507654,nphimg,PoliticalDiscussion,t5_2sfmf
3,1,,"Bat Signed by Michael Flynn Sells for $8,000 a...",t3_nphkiq,1,7577805,1622507820,nphkiq,politics,t5_2cneq
4,0,,Biden shows little desire to reverse Trump's C...,t3_nphlqo,4,7577805,1622507931,nphlqo,politics,t5_2cneq


In [46]:
df_1_cleaned_sub = df_1_cleaned[ (df_1_cleaned["subreddit"]=="PoliticalDiscussion")]# & (df_1_cleaned["body"]!= "[removed]") & (df_1_cleaned["body"]!= "[deleted]")]

In [47]:
df_1_cleaned_sub

Unnamed: 0,score,author_fullname,title,name,num_comments,subreddit_subscribers,created_utc,id,subreddit,subreddit_id
2,1,t2_cbbr6m5u,How does everyone feel about Bidens recent com...,t3_nphimg,2,1333821,1622507654,nphimg,PoliticalDiscussion,t5_2sfmf
7,1,t2_98z1fuic,QAnon's Wildest Moments From Their Massively D...,t3_nphos0,1,1333821,1622508194,nphos0,PoliticalDiscussion,t5_2sfmf
8,1,t2_98z1fuic,QAnon's Wildest Moments From Their Massively D...,t3_nphpud,2,1333821,1622508292,nphpud,PoliticalDiscussion,t5_2sfmf
12,1,t2_11u13r,Where is the GOP Healthcare Plan? Conservative...,t3_nphyep,2,1333821,1622509117,nphyep,PoliticalDiscussion,t5_2sfmf
20,39,t2_122z72,Some questions for both sides on debates aroun...,t3_npiewd,172,1333821,1622510652,npiewd,PoliticalDiscussion,t5_2sfmf
...,...,...,...,...,...,...,...,...,...,...
19435,1,t2_1629yw,How Damaging will the Botched Primary Vote Cou...,t3_ob63cf,2,1362138,1625085715,ob63cf,PoliticalDiscussion,t5_2sfmf
19442,1,t2_82ucqipu,Diaries in the Loony Bin,t3_ob6bbh,2,1362138,1625086365,ob6bbh,PoliticalDiscussion,t5_2sfmf
19468,1,t2_czg6fsa7,EPSTEIN| $WHACKD | Recently launched BSC token...,t3_ob6vuo,1,1362139,1625088086,ob6vuo,PoliticalDiscussion,t5_2sfmf
19495,1,t2_3to8k94v,"If everyone paid the same percentage in taxes,...",t3_ob7p5v,1,1362140,1625090544,ob7p5v,PoliticalDiscussion,t5_2sfmf


In [57]:
df_1_cleaned_sub.name.unique().size

991

In [59]:
aux= pd.merge(df_1_cleaned_sub.name, df_cleaned_sub.parent_id, how="inner", left_on="name", right_on="parent_id")

In [61]:
aux.name.unique().size

988

In [65]:
df_cleaned_sub.head()

Unnamed: 0,score,author_fullname,body,created_utc,id,parent_id,subreddit,subreddit_id
64,3,t2_8po26,New York City has actual houses with yards in ...,1622505772,t1_h04y9gl,t1_h01ju0s,PoliticalDiscussion,t5_2sfmf
81,9,t2_66wev,"It's a fair point, but you would ideally want ...",1622505814,t1_h04yc9b,t1_h04vwlw,PoliticalDiscussion,t5_2sfmf
82,1,t2_657qrcwo,The Democrats want their power bases to play t...,1622505816,t1_h04yccr,t3_np8fjs,PoliticalDiscussion,t5_2sfmf
116,17,t2_ibsfn,"They're on the same page for the most part, th...",1622505902,t1_h04yhy8,t3_np8fjs,PoliticalDiscussion,t5_2sfmf
161,8,t2_109kok4i,"35 out 211 house members, and 6 out of 48 sena...",1622506001,t1_h04yoh8,t1_h04xoxt,PoliticalDiscussion,t5_2sfmf


In [66]:
df_1_cleaned_sub.head()

Unnamed: 0,score,author_fullname,title,name,num_comments,subreddit_subscribers,created_utc,id,subreddit,subreddit_id
2,1,t2_cbbr6m5u,How does everyone feel about Bidens recent com...,t3_nphimg,2,1333821,1622507654,nphimg,PoliticalDiscussion,t5_2sfmf
7,1,t2_98z1fuic,QAnon's Wildest Moments From Their Massively D...,t3_nphos0,1,1333821,1622508194,nphos0,PoliticalDiscussion,t5_2sfmf
8,1,t2_98z1fuic,QAnon's Wildest Moments From Their Massively D...,t3_nphpud,2,1333821,1622508292,nphpud,PoliticalDiscussion,t5_2sfmf
12,1,t2_11u13r,Where is the GOP Healthcare Plan? Conservative...,t3_nphyep,2,1333821,1622509117,nphyep,PoliticalDiscussion,t5_2sfmf
20,39,t2_122z72,Some questions for both sides on debates aroun...,t3_npiewd,172,1333821,1622510652,npiewd,PoliticalDiscussion,t5_2sfmf


In [67]:
df_merged = pd.merge(df_cleaned_sub, df_1_cleaned_sub, how="left", left_on="parent_id", right_on="name")

In [73]:
df_merged[df_merged["parent_id"].str.contains("t3_")]

Unnamed: 0,score_x,author_fullname_x,body,created_utc_x,id_x,parent_id,subreddit_x,subreddit_id_x,score_y,author_fullname_y,title,name,num_comments,subreddit_subscribers,created_utc_y,id_y,subreddit_y,subreddit_id_y
2,1,t2_657qrcwo,The Democrats want their power bases to play t...,1622505816,t1_h04yccr,t3_np8fjs,PoliticalDiscussion,t5_2sfmf,,,,,,,,,,
3,17,t2_ibsfn,"They're on the same page for the most part, th...",1622505902,t1_h04yhy8,t3_np8fjs,PoliticalDiscussion,t5_2sfmf,,,,,,,,,,
7,83,t2_sk85xg4,The Republicans couldn't get all the votes in ...,1622506174,t1_h04yzx2,t3_np8fjs,PoliticalDiscussion,t5_2sfmf,,,,,,,,,,
12,-9,t2_9k9no7g1,Bring back the draft so that the military is c...,1622506707,t1_h04zyxm,t3_np6snu,PoliticalDiscussion,t5_2sfmf,,,,,,,,,,
16,329,t2_9jted,I know that when I got my security clearance t...,1622506921,t1_h050d3o,t3_np6snu,PoliticalDiscussion,t5_2sfmf,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29422,77,t2_8ybry,"The law doesn't have many ""one weird trick!"" l...",1625092955,t1_h3m6meh,t3_oayrlj,PoliticalDiscussion,t5_2sfmf,44.0,t2_z2s6nf9,First Amendment Restrictions,t3_oayrlj,107.0,1362136.0,1.625064e+09,oayrlj,PoliticalDiscussion,t5_2sfmf
29423,0,t2_c1g5efy,"Yeah, meanwhile Americans are so stupid fighti...",1625093009,t1_h3m6qa4,t3_o8wwwi,PoliticalDiscussion,t5_2sfmf,375.0,t2_mhaif74,"How likely is a second space race, this time b...",t3_o8wwwi,324.0,1362113.0,1.624800e+09,o8wwwi,PoliticalDiscussion,t5_2sfmf
29424,33,t2_bbkpr,1. You can bet that they won't ever use actual...,1625093084,t1_h3m6vt4,t3_oarmdv,PoliticalDiscussion,t5_2sfmf,114.0,t2_capqx4tk,The New York Mayoral Race Has Been Thrown Into...,t3_oarmdv,111.0,1362135.0,1.625036e+09,oarmdv,PoliticalDiscussion,t5_2sfmf
29453,20,t2_184xcx2r,I’m actually surprised how well the NYC Board ...,1625095688,t1_h3mc382,t3_oarmdv,PoliticalDiscussion,t5_2sfmf,114.0,t2_capqx4tk,The New York Mayoral Race Has Been Thrown Into...,t3_oarmdv,111.0,1362135.0,1.625036e+09,oarmdv,PoliticalDiscussion,t5_2sfmf
