# Import

In [2]:
import pandas as pd
from psaw import PushshiftAPI
import datetime as dt
from tqdm import tqdm
import numpy as np
import warnings
import analysis_util
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
warnings.filterwarnings("ignore")
from scipy.stats import pearsonr

# Data

In [3]:
df=pd.read_csv('/home/pelle/Master_Thesis/data/processed/community_metrics.csv').sort_values('number_of_nodes',ascending=False)
# convert to int in days mean_delta_time
df['mean_delta_time']=df['mean_delta_time'].apply(lambda x: int(x[:3]))

# remove old versions of each subrredit
def remove_old_versions(df):
    df=df.sort_values('version')
    df=df.drop_duplicates(subset=['subreddit'],keep='last')
    return df

df=remove_old_versions(df)
df.head(20)

Unnamed: 0,subreddit,version,period,total_activity,number_of_nodes,number_of_edges,average_weight_of_edges,median_weight_of_edges,average_degree,median_degree,average_clustering_coefficient,mean_activity,mean_delta_time,number_of_reciprocal_edges,fraction_of_reciprocal_edges,clustering_coefficient_p_value
9,FourSentenceStories,2022-11-18 10:14:18,2021-05-04 - 2021-12-28,200,34,28,2.678571,1.0,1.647059,1.0,0.274683,5.823529,19,15,0.428571,0.023
10,Trump666,2022-11-18 10:15:23,2020-03-16 - 2022-01-01,6746,755,1628,2.652948,1.0,4.312583,2.0,0.202507,7.909934,53,1048,0.492944,0.0
11,GraphTheory,2022-11-18 10:18:32,2015-02-17 - 2021-12-26,497,247,163,1.717791,1.0,1.319838,1.0,0.015602,2.012146,59,124,0.563636,0.286
12,jazznoir,2022-11-18 10:19:14,2015-01-01 - 2022-01-01,4725,1497,1139,1.4741,1.0,1.52171,1.0,0.009824,2.985944,122,640,0.453258,0.016
13,indoorbouldering,2022-11-18 11:16:58,2017-08-23 - 2022-01-01,10532,2529,4866,1.665228,1.0,3.848161,2.0,0.061709,3.967181,112,3487,0.53196,0.0
14,DTU,2022-11-18 11:22:59,2015-05-15 - 2022-01-01,2788,689,1196,1.805184,1.0,3.471698,2.0,0.064316,4.046444,136,1014,0.600355,0.0
15,kiwi_bird,2022-11-18 11:24:00,2015-02-21 - 2021-12-27,862,302,301,1.594684,1.0,1.993377,1.0,0.047712,2.778146,65,128,0.357542,0.085


In [4]:
df.rename(columns={'mean_delta_time':'$\overline{\D t}$'},inplace=True)
df.rename(columns={'number_of_nodes':'$N$'},inplace=True)
df.rename(columns={'number_of_edges':'$E$'},inplace=True)
df.rename(columns={'total_activity': '$A$'},inplace=True)
df.rename(columns={'mean_activity': '$\overline{A}$'},inplace=True)
df.rename(columns={'mean_degree': '$\overline{D}$'},inplace=True)
df.rename(columns={'median_degree': '$D_{\mu}$'},inplace=True)
df.rename(columns={'average_clustering_coefficient': '$\overline{C}$'},inplace=True)
df.rename(columns={'average_weight_of_edges': '$\overline{W}$'},inplace=True)
df.rename(columns={'median_weight_oaverage_degreef_edges': '$W_{\mu}$'},inplace=True)
df.rename(columns={'average_degree': '$\overline{D}$'},inplace=True)
df.rename(columns={'number_of_reciprocal_edges': '$R$'},inplace=True)
df.rename(columns={'fraction_of_reciprocal_edges': '$R_{\%}$'},inplace=True)
df.rename(columns={'clustering_coefficient_p_value': '$P_{C}$'},inplace=True)
df.rename(columns={'median_weight_of_edges': '$W_{\mu}$'},inplace=True)

print(df.drop(columns=['version']).round(decimals=3).to_latex(escape=False,index=False))

\begin{tabular}{llrrrrrrrrrrrrr}
\toprule
          subreddit &                  period &   $A$ &  $N$ &  $E$ &  $\overline{W}$ &  $W_{\mu}$ &  $\overline{D}$ &  $D_{\mu}$ &  $\overline{C}$ &  $\overline{A}$ &  $\overline{\D t}$ &  $R$ &  $R_{\%}$ &  $P_{C}$ \\
\midrule
FourSentenceStories & 2021-05-04 - 2021-12-28 &   200 &   34 &   28 &           2.679 &        1.0 &           1.647 &        1.0 &           0.275 &           5.824 &                 19 &   15 &     0.429 &    0.023 \\
           Trump666 & 2020-03-16 - 2022-01-01 &  6746 &  755 & 1628 &           2.653 &        1.0 &           4.313 &        2.0 &           0.203 &           7.910 &                 53 & 1048 &     0.493 &    0.000 \\
        GraphTheory & 2015-02-17 - 2021-12-26 &   497 &  247 &  163 &           1.718 &        1.0 &           1.320 &        1.0 &           0.016 &           2.012 &                 59 &  124 &     0.564 &    0.286 \\
           jazznoir & 2015-01-01 - 2022-01-01 &  4725 & 1497 & 1139 &

In [16]:
df_explanation=pd.DataFrame(df['subreddit']).set_index('subreddit')

# df_explanation['DTU']['xx']

# new collumns for explanation
df_explanation['explanation']=0
df_explanation['explanation']['DTU'] = "A subreddit related to the technical university of Denmark DTU"
df_explanation['explanation']['jazznoir'] = ""


df_explanation

Unnamed: 0_level_0,explanation
subreddit,Unnamed: 1_level_1
indoorbouldering,0
jazznoir,0
Trump666,0
DTU,"A research university in Denmark, with a focus..."
kiwi_bird,0
GraphTheory,0
FourSentenceStories,0


# playground

In [5]:
import pandas as pd 
import numpy as np
import os
import sys
from tqdm import tqdm
import praw

print("Reading data...")
sample = pd.read_csv('/home/pelle/Master_Thesis/data/raw/wallstreetbets/submissions_pmaw_2016-2021_wsb.csv',nrows=10)
dtypes = sample.dtypes # Get the dtypes
cols = sample.columns # Get the columns
dtype_dictionary = {} 
for c in cols:
    if str(dtypes[c]) == 'int64':
        dtype_dictionary[c] = 'float32' # Handle NANs in int columns
    else:
        dtype_dictionary[c] = str(dtypes[c])

df_posts = pd.read_csv('/home/pelle/Master_Thesis/data/raw/wallstreetbets/submissions_pmaw_2016-2021_wsb.csv',dtype=dtype_dictionary, 
                keep_default_na=False,
                na_values=['na',''],
                usecols=['id'])

print('Done loading data..' )

len(df_posts)

Reading data...
Done loading data..


1231826

In [39]:
p = '/home/pelle/Master_Thesis/data/awards/wallstreetbets/'
files = os.listdir(p)

df_awards = pd.DataFrame()

for f in files:
    df = pd.read_csv(p+f,error_bad_lines=False,header=None, names=['id','award_count'])

    df_awards = pd.concat([df_awards,df],ignore_index=True)

print(len(df_awards))
df_awards = df_awards.drop_duplicates(subset=['id'])
print(len(df_awards))

b'Skipping line 103261: expected 2 fields, saw 3\nSkipping line 103715: expected 2 fields, saw 3\n'
b'Skipping line 105563: expected 2 fields, saw 3\nSkipping line 224290: expected 2 fields, saw 3\n'
b'Skipping line 174197: expected 2 fields, saw 3\nSkipping line 177835: expected 2 fields, saw 3\n'


970329
701367


b'Skipping line 104652: expected 2 fields, saw 3\nSkipping line 223379: expected 2 fields, saw 3\n'


In [43]:
df_awards.loc[df_awards['award_count'] > 0]

Unnamed: 0,id,award_count
167,l8tfi8,3.0
204,l8tf06,1.0
244,l8tu2m,2.0
335,l8tsv4,7.0
379,l8s1h4,2.0
...,...,...
745906,l7ge7g,2.0
852521,go07dw,1.0
968991,fsypwh,1.0
969034,fe5s7e,60.0


In [20]:
from time import sleep
import pandas as pd
from tqdm import tqdm
from psaw import PushshiftAPI
import datetime as dt
import warnings
warnings.filterwarnings("ignore")


def get_periods(tstart, tend, interval):
    periods = []

    period_start = tstart
    while period_start < tend:
        period_end = min(period_start + interval, tend)
        periods.append((
            int(period_start.timestamp()), 
            int(period_end.timestamp())
            ))
        period_start = period_end

    return periods


def convert_utc_to_date(df):
    df['date'] = pd.to_datetime(df['created'],unit='s')
    return df

def download_comments(start,end,subreddit,limit):

    def data_prep_comments(subreddit, start_time, end_time, filters, limit):
        if (len(filters) == 0):
            filters = ['id', 'author', 'created_utc','total_awards_received']
                    #We set by default some usefull columns 

        comments = list(api.search_comments(
            subreddit=subreddit,    #Subreddit we want to audit
            after=start_time,       #Start date
            before=end_time,        #End date
            filter=filters,         #Column names we want to retrieve
            limit=limit,
            total_awards_received = ">0"))       #Max number of comments
        return pd.DataFrame([thing.d_ for thing in comments]) #Return dataframe for analysis
    
    print('Setting up API..')
    api = PushshiftAPI()

    periods = get_periods(start, end, dt.timedelta(days=2))

    for period in tqdm(periods):
        df_c = data_prep_comments(subreddit, start_time=period[0], end_time=period[1], filters=[], limit=limit)
        # wait N second to avoid rate limit
        sleep(1)

In [7]:
from threading import current_thread
from time import sleep
import pandas as pd
from tqdm import tqdm
from psaw import PushshiftAPI
import datetime as dt
import warnings
warnings.filterwarnings("ignore")

def convert_utc_to_date(df):
    df['date'] = pd.to_datetime(df['date'],unit='s')
    return df

def data_prep_posts(subreddit, start_time, end_time,  limit,api):
    filters = ['id', 'author', 'created_utc','total_awards_received']              
                #We set by default some useful columns

    posts = list(api.search_submissions(
        subreddit=subreddit,   #Subreddit we want to audit
        after=start_time,      #Start date
        before=end_time,       #End date
        filter=filters,        #Column names we want to retrieve
        limit=limit))      #Max number of comments))          

    return pd.DataFrame([thing.d_ for thing in posts])


def download_posts(start,end,subreddit,folder_name,file_name,limit,check_point):
    # if file_name == None: file_name = subreddit

    # if check_point:
    #     print('Continuing from last checkpoint..')
    #     current_df = pd.read_csv("./data/raw/" + folder_name + '/' + file_name + ".csv")
    #     current_df.columns = ['author','created_utc','domain','id','n_comments','score','text','title','url','date']
    #     current_df.drop(current_df.loc[current_df['date'].apply(lambda x: isinstance(x, str))].index, inplace=True)
    #     current_df = convert_utc_to_date(current_df)
    #     start = current_df.date.max()
    # else: 
    #     print('Starting from scratch..')
    #     pd.DataFrame().to_csv("./data/raw/" + folder_name + '/' + file_name + ".csv", index=False, header=False)
    
    delta = end - start
    # print('Downloading to..: ', file_name+'.csv')
    # print('Start date: ' + str(start))
    # print('End date: ' + str(end))
    # print('Subreddit: ' + subreddit)
    
    print('Setting up API..')
    api = PushshiftAPI()

    print('Starting..') 
    for d in tqdm(range(delta.days + 1)):
        d_1=dt.timedelta(days=1)
        d_n=dt.timedelta(days=d+1)
        start_get=int((start+d_n-d_1).timestamp())
        end_get=int((start+d_n).timestamp())
        
        df=data_prep_posts(subreddit,start_get,end_get,limit,api)
        # df.to_csv("./data/raw/" + folder_name + '/' +  file_name + ".csv", mode='a', index=False, header=False)
        # wait N second to avoid rate limit
        sleep(0.2)

In [8]:
start=dt.datetime(year=2015, month=1, day=1)
end=dt.datetime(year=2015, month=1, day=2)
api = PushshiftAPI()

df=data_prep_posts('DTU',start,end,None,api)

Exception: Unable to connect to pushshift.io. Max retries exceeded.

In [80]:
from urllib import request
url = "https://www.reddit.com/r/pics/comments/zp5c42/elon_musk_hanging_out_with_jared_kushner_at_the/"

response = request.urlopen(url)
# set the correct charset below
page_source = response.read().decode('utf-8')
type(page_source)

str

In [82]:
import requests
req = requests.get(url, 'html.parser')

In [98]:
from selenium import webdriver
import bs4
import re
from time import sleep

url = "https://www.reddit.com/r/pics/comments/zp5c42/elon_musk_hanging_out_with_jared_kushner_at_the/"

driver_location = '/usr/bin/chromedriver'
binary_location = '/usr/bin/google-chrome'

options = webdriver.ChromeOptions()
options.binary_location = binary_location

driver = webdriver.Chrome(executable_path=driver_location,options=options)
driver.get(url)

innerHTML = driver.execute_script("return document.body.innerHTML")
##print(driver.page_source)

sleep(1)
root=bs4.BeautifulSoup(innerHTML,"lxml")
viewcount=root.find_all("span",attrs={'class':'short-view-count style-scope yt-view-count-renderer'})


# for span in viewcount:
#     print(span.string)

driver.quit()

TypeError: 'NoneType' object is not callable

In [107]:
import re
reg=re.findall(r"(?<=data-count=.)\d", innerHTML)
reg

[]