# Import

In [3]:
import pandas as pd
from psaw import PushshiftAPI
import datetime as dt
from tqdm import tqdm
import numpy as np
import warnings
import analysis_util
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
warnings.filterwarnings("ignore")
from scipy.stats import pearsonr

# Data

In [19]:
df=pd.read_csv('/home/pelle/Master_Thesis/data/processed/community_metrics.csv').sort_values('number_of_nodes',ascending=False)
# convert to int in days mean_delta_time
df['mean_delta_time']=df['mean_delta_time'].apply(lambda x: int(x[:3]))

# remove old versions of each subrredit
def remove_old_versions(df):
    df=df.sort_values('version')
    df=df.drop_duplicates(subset=['subreddit'],keep='last')
    return df

df=remove_old_versions(df)
df.head(20)

Unnamed: 0,subreddit,version,period,total_activity,number_of_nodes,number_of_edges,average_weight_of_edges,median_weight_of_edges,average_degree,median_degree,average_clustering_coefficient,mean_activity,mean_delta_time,number_of_reciprocal_edges,fraction_of_reciprocal_edges,clustering_coefficient_p_value
9,FourSentenceStories,2022-11-18 10:14:18,2021-05-04 - 2021-12-28,200,34,28,2.678571,1.0,1.647059,1.0,0.274683,5.823529,19,15,0.428571,0.023
10,Trump666,2022-11-18 10:15:23,2020-03-16 - 2022-01-01,6746,755,1628,2.652948,1.0,4.312583,2.0,0.202507,7.909934,53,1048,0.492944,0.0
11,GraphTheory,2022-11-18 10:18:32,2015-02-17 - 2021-12-26,497,247,163,1.717791,1.0,1.319838,1.0,0.015602,2.012146,59,124,0.563636,0.286
12,jazznoir,2022-11-18 10:19:14,2015-01-01 - 2022-01-01,4725,1497,1139,1.4741,1.0,1.52171,1.0,0.009824,2.985944,122,640,0.453258,0.016
13,indoorbouldering,2022-11-18 11:16:58,2017-08-23 - 2022-01-01,10532,2529,4866,1.665228,1.0,3.848161,2.0,0.061709,3.967181,112,3487,0.53196,0.0
14,DTU,2022-11-18 11:22:59,2015-05-15 - 2022-01-01,2788,689,1196,1.805184,1.0,3.471698,2.0,0.064316,4.046444,136,1014,0.600355,0.0
15,kiwi_bird,2022-11-18 11:24:00,2015-02-21 - 2021-12-27,862,302,301,1.594684,1.0,1.993377,1.0,0.047712,2.778146,65,128,0.357542,0.085


In [3]:
df.rename(columns={'mean_delta_time':'$\overline{\D t}$'},inplace=True)
df.rename(columns={'number_of_nodes':'$N$'},inplace=True)
df.rename(columns={'number_of_edges':'$E$'},inplace=True)
df.rename(columns={'total_activity': '$A$'},inplace=True)
df.rename(columns={'mean_activity': '$\overline{A}$'},inplace=True)
df.rename(columns={'mean_degree': '$\overline{D}$'},inplace=True)
df.rename(columns={'median_degree': '$D_{\mu}$'},inplace=True)
df.rename(columns={'average_clustering_coefficient': '$\overline{C}$'},inplace=True)
df.rename(columns={'average_weight_of_edges': '$\overline{W}$'},inplace=True)
df.rename(columns={'median_weight_oaverage_degreef_edges': '$W_{\mu}$'},inplace=True)
df.rename(columns={'average_degree': '$\overline{D}$'},inplace=True)
df.rename(columns={'number_of_reciprocal_edges': '$R$'},inplace=True)
df.rename(columns={'fraction_of_reciprocal_edges': '$R_{\%}$'},inplace=True)
df.rename(columns={'clustering_coefficient_p_value': '$P_{C}$'},inplace=True)
df.rename(columns={'median_weight_of_edges': '$W_{\mu}$'},inplace=True)

print(df.drop(columns=['version']).round(decimals=3).to_latex(escape=False,index=False))

\begin{tabular}{llrrrrrrrrrrrrr}
\toprule
          subreddit &                period &   $A$ &  $N$ &  $E$ &  $\overline{W}$ &  $W_{\mu}$ &  $\overline{D}$ &  $D_{\mu}$ &  $\overline{C}$ &  $\overline{A}$ &  $\overline{\D t}$ &  $R$ &  $R_{\%}$ &  $P_{C}$ \\
\midrule
   indoorbouldering & 2015-01-01-2022-01-01 & 11653 & 2529 & 4866 &           1.665 &        1.0 &           3.848 &        2.0 &           0.062 &           3.967 &                112 & 3487 &     0.532 &    0.000 \\
           jazznoir & 2015-01-01-2022-01-01 &  7077 & 1497 & 1139 &           1.474 &        1.0 &           1.522 &        1.0 &           0.010 &           2.986 &                122 &  640 &     0.453 &    0.012 \\
           Trump666 & 2015-01-01-2022-01-01 &  7102 &  755 & 1628 &           2.653 &        1.0 &           4.313 &        2.0 &           0.203 &           7.910 &                 53 & 1048 &     0.493 &    0.000 \\
                DTU & 2015-01-01-2022-01-01 &  3274 &  689 & 1196 &          

In [16]:
df_explanation=pd.DataFrame(df['subreddit']).set_index('subreddit')

# df_explanation['DTU']['xx']

# new collumns for explanation
df_explanation['explanation']=0
df_explanation['explanation']['DTU'] = "A subreddit related to the technical university of Denmark DTU"
df_explanation['explanation']['jazznoir'] = ""


df_explanation

Unnamed: 0_level_0,explanation
subreddit,Unnamed: 1_level_1
indoorbouldering,0
jazznoir,0
Trump666,0
DTU,"A research university in Denmark, with a focus..."
kiwi_bird,0
GraphTheory,0
FourSentenceStories,0


# playground

In [2]:
import pandas as pd 
import numpy as np
import os
import sys
from tqdm import tqdm
import praw

print("Reading data...")
sample = pd.read_csv('/home/pelle/Master_Thesis/data/raw/wallstreetbets/submissions_pmaw_2016-2021_wsb.csv',nrows=10)
dtypes = sample.dtypes # Get the dtypes
cols = sample.columns # Get the columns
dtype_dictionary = {} 
for c in cols:
    if str(dtypes[c]) == 'int64':
        dtype_dictionary[c] = 'float32' # Handle NANs in int columns
    else:
        dtype_dictionary[c] = str(dtypes[c])

df_posts = pd.read_csv('/home/pelle/Master_Thesis/data/raw/wallstreetbets/submissions_pmaw_2016-2021_wsb.csv',dtype=dtype_dictionary, 
                keep_default_na=False,
                na_values=['na',''],
                usecols=['id'])

print('Done loading data..' )

p = '/home/pelle/Master_Thesis/data/processed/wallstreetbets_scores/'
# create new dataframe if not already created
if not os.path.exists(p+'df_awards_post.csv'):
    print('Creating new dataframe')
    df_id=pd.DataFrame()
    df_id.to_csv(p+'df_awards_post.csv',index=False)
    len_id = 0
else: 
    df_id=pd.read_csv(p+'df_awards_post.csv')
    len_id=len(df_id)
    print('starting from: ',len_id)
    df_id = None



# Read-only instance
def get_reddit_instance():
    reddit = praw.Reddit(client_id="OlWj7Mu4aXh0eg",
                                client_secret="fIzRhpEeBYAwi8_i2hcyzoWwDnWOag",
                                user_agent="Scrapper")
    return reddit

reddit = get_reddit_instance()

def get_n_awards(id):
    submission_awards = []
    submission = reddit.submission(id=id)
    submission_awards.append(submission.all_awardings)
    # get number of awards
    return [len(x) for x in submission_awards]

print('Lets go!!!')

# Get the awards for a post and append to the dataframe. skip len_id rows
for id in tqdm(df_posts['id'][len_id:]):
    # append to csv file
    N_awards = get_n_awards(id)

    # df_awards = pd.DataFrame({'id':id,'N_awards':N_awards})
    # df_awards.to_csv(p+'df_awards_post.csv',mode='a',header=False,index=False)

Reading data...
Done loading data..
starting from:  5500
Lets go!!!


  0%|          | 2/1226326 [00:11<2041:38:11,  5.99s/it]


KeyboardInterrupt: 

In [71]:
y

0

In [20]:
from time import sleep
import pandas as pd
from tqdm import tqdm
from psaw import PushshiftAPI
import datetime as dt
import warnings
warnings.filterwarnings("ignore")


def get_periods(tstart, tend, interval):
    periods = []

    period_start = tstart
    while period_start < tend:
        period_end = min(period_start + interval, tend)
        periods.append((
            int(period_start.timestamp()), 
            int(period_end.timestamp())
            ))
        period_start = period_end

    return periods


def convert_utc_to_date(df):
    df['date'] = pd.to_datetime(df['created'],unit='s')
    return df

def download_comments(start,end,subreddit,limit):

    def data_prep_comments(subreddit, start_time, end_time, filters, limit):
        if (len(filters) == 0):
            filters = ['id', 'author', 'created_utc',
                    'body', 'subreddit','score','parent_id','post_id','total_awards_received']
                    #We set by default some usefull columns 

        comments = list(api.search_comments(
            subreddit=subreddit,    #Subreddit we want to audit
            after=start_time,       #Start date
            before=end_time,        #End date
            filter=filters,         #Column names we want to retrieve
            limit=limit,
            total_awards_received = ">0"))       #Max number of comments
        return pd.DataFrame([thing.d_ for thing in comments]) #Return dataframe for analysis
    
    print('Setting up API..')
    api = PushshiftAPI()

    periods = get_periods(start, end, dt.timedelta(days=2))

    for period in tqdm(periods):
        df_c = data_prep_comments(subreddit, start_time=period[0], end_time=period[1], filters=[], limit=limit)
        # wait N second to avoid rate limit
        sleep(2)