# Data Exploration


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import plotly.express as px
import yfinance as yf
sns.set_theme(style="darkgrid")

In [2]:
# Load the raw wallstreetbets export, then overwrite its header with the short
# column names used by the rest of this notebook (the assignment fails if the
# file does not have exactly nine columns).
df = pd.read_csv('../data/raw/wallstreetbets.csv')
df.columns = [
    'author',
    'created_utc',
    'domain',
    'id',
    'n_comments',
    'text',
    'title',
    'url',
    'date',
]

def convert_utc_to_date(df):
    """Add a ``date`` column parsed from the epoch-second ``created_utc`` column.

    The frame is modified in place (an existing ``date`` column is overwritten)
    and the same object is returned, so the result can be reassigned or chained.
    """
    parsed_timestamps = pd.to_datetime(df['created_utc'], unit='s')
    df['date'] = parsed_timestamps
    return df

# Derive calendar features from the post timestamp. The plotting cells below
# group by `date_no_time` and `date_weekday`, so `df` must NOT be re-read from
# disk after this point (a stray, mis-indented re-read used to follow here; it
# raised an IndentationError and would have dropped every derived column).
df = convert_utc_to_date(df)
df['date_no_time'] = df['date'].dt.date
# day_name() yields 'Monday' ... 'Sunday' directly, replacing the chain of
# seven integer -> name .replace() calls.
df['date_weekday'] = df['date'].dt.day_name()
# ISO week number (1-53). This column previously held the month number, which
# contradicted its name.
df['date_week'] = df['date'].dt.isocalendar().week


In [3]:
# Replace missing titles with the literal string 'None'.
# The result is assigned back to the column instead of calling
# `df.title.replace(..., inplace=True)`: that form mutates a temporary Series,
# which pandas 2.x warns about and which leaves `df` untouched once
# Copy-on-Write is enabled.
df['title'] = df['title'].fillna('None')

# Number of posts whose title is exactly '[removed]' or '[deleted]'.
x = df['title'].isin(['[removed]', '[deleted]']).sum()

In [4]:
def plot_observations_per_day(df):
    """Show an interactive histogram of the number of posts per calendar day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date_no_time`` (grouping key) and ``id``
        (its non-null count per day is used as the number of posts).
    """
    # count() returns, per day, the number of non-null values in every column.
    posts_per_day = df.groupby(['date_no_time']).count()
    fig = px.histogram(
        posts_per_day,
        x=posts_per_day.index,
        y=posts_per_day.id,
        nbins=len(posts_per_day.index),
        title='Observations per day',
        labels={'date_no_time': 'Date'},
    )
    # `labels` keys must be column names, so the former 'sum of id' key was
    # silently ignored; set the aggregated y-axis title on the layout instead.
    fig.update_layout(yaxis_title='Number of posts')
    fig.show()


plot_observations_per_day(df)

In [5]:
def plot_observations_per_day(df):
    """Show an interactive histogram of the number of posts per weekday.

    Expects ``df`` to contain ``date_weekday`` (weekday names) and ``id``;
    the non-null ``id`` count per weekday is plotted, Monday through Sunday.

    NOTE: this reuses the name of the per-date plotting function defined in
    an earlier cell and replaces it once this cell has run.
    """
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    posts_by_weekday = df.groupby(['date_weekday']).count()
    weekdays = posts_by_weekday.index
    fig = px.histogram(
        posts_by_weekday,
        x=weekdays,
        y=posts_by_weekday.id,
        nbins=len(weekdays),
        # groupby sorts the weekday names alphabetically; restore calendar order.
        category_orders={'date_weekday': weekday_order},
        title='Observations per day of the week',
        labels={'date_weekday': 'Day of the week'},
    )
    fig.show()


plot_observations_per_day(df)

In [6]:

# Lower-case the free-text columns so the ticker search is case-insensitive.
df['title'] = df['title'].str.lower()
df['text'] = df['text'].str.lower()

# One regular expression covering 'gme', '$gme' and 'gamestop' as whole words.
# The previous `' gme ' or ' $gme ' or ' gamestop '` was a Python boolean
# expression that evaluates to its first operand, so only ' gme ' was ever
# searched for (and the text search had a blank third alternative).
# - `\$` is escaped because `$` is a regex anchor.
# - The look-arounds replace the literal surrounding spaces, so mentions at the
#   start/end of a string or next to punctuation are also matched.
# - The group is non-capturing to avoid pandas' match-group warning.
GME_PATTERN = r'(?<!\w)(?:\$?gme|gamestop)(?!\w)'

# na=False: rows with a missing title/text count as "no mention" instead of
# propagating NaN into the boolean OR below.
df['GME_title'] = df['title'].str.contains(GME_PATTERN, regex=True, na=False)
df['GME_text'] = df['text'].str.contains(GME_PATTERN, regex=True, na=False)
df['GME'] = df['GME_title'] | df['GME_text']

def plot_gme(df):
    """Show an interactive histogram of GME-flagged posts over time.

    Sums the boolean ``GME`` column per ``date_no_time`` and bins the daily
    totals with one bin per seven days of data.
    """
    mentions_per_day = df.groupby(['date_no_time']).GME.sum()
    days = mentions_per_day.index
    weekly_bins = int(len(days) / 7)
    fig = px.histogram(
        mentions_per_day,
        x=days,
        y=mentions_per_day.values,
        nbins=weekly_bins,
        labels={'date_no_time': 'Date (weeks)'},
    )
    fig.show()


plot_gme(df)

In [13]:
# Load the GameStop CSV and add a parsed `date` column built from its `Date`
# column (presumably one row per trading day -- confirm against the file).
gme = pd.read_csv('../data/raw/gamestop.csv').assign(
    date=lambda frame: pd.to_datetime(frame['Date'])
)


def plot_high(df):
    """Show an interactive line chart of the ``High`` column against ``date``."""
    high_figure = px.line(df, x=df.date, y=df.High)
    high_figure.show()


plot_high(gme)

# Data 2: wallstreetbets comments and submissions (2016-2021)

In [3]:
# Comments dump; the path is named once because it is read twice below.
comments_csv = '/home/pelle/Downloads/comments_pmaw_2016-2021_wsb.csv'

# Infer column dtypes from a 10-row sample so the full read can be typed.
sample = pd.read_csv(comments_csv, nrows=10)
dtypes = sample.dtypes  # Get the dtypes
cols = sample.columns  # Get the columns

# Integer columns are read as floats so that NaNs can be represented.
# float64 is required here: float32 has only 24 bits of integer precision, so
# epoch seconds around 1.5e9 (`created_utc`) would be rounded to multiples of
# 128 seconds. float64 is exact for integers up to 2**53.
dtype_dictionary = {
    column: 'float64' if str(dtypes[column]) == 'int64' else str(dtypes[column])
    for column in cols
}
# Identifier and free-text columns are always read as strings, whatever the
# sample suggested.
for text_column in ('author', 'body', 'parent_id', 'link_id', 'id'):
    dtype_dictionary[text_column] = 'str'

# Read in 1,000,000-row chunks and concatenate them into one frame.
# Only 'na' and the empty string are treated as missing values.
df_comments_chunked = pd.read_csv(
    comments_csv,
    dtype=dtype_dictionary,
    keep_default_na=False,
    on_bad_lines='warn',
    na_values=['na', ''],
    usecols=['author', 'parent_author', 'created_utc', 'score'],
    chunksize=1000000,
)
df_comments = pd.concat(df_comments_chunked, ignore_index=True)
df_comments['created_utc'] = pd.to_datetime(df_comments['created_utc'], unit='s')

In [4]:
# Submissions dump; the path is named once because it is read twice below.
submissions_csv = '/home/pelle/Downloads/submissions_pmaw_2016-2021_wsb.csv'

# Infer column dtypes from a 10-row sample so the full read can be typed.
sample = pd.read_csv(submissions_csv, nrows=10)
dtypes = sample.dtypes  # Get the dtypes
cols = sample.columns  # Get the columns

# Integer columns are read as floats so that NaNs can be represented.
# float64 is required here: float32 has only 24 bits of integer precision, so
# epoch seconds around 1.5e9 (`created_utc`) would be rounded to multiples of
# 128 seconds. float64 is exact for integers up to 2**53.
dtype_dictionary = {
    column: 'float64' if str(dtypes[column]) == 'int64' else str(dtypes[column])
    for column in cols
}

# Only 'na' and the empty string are treated as missing values.
df_posts = pd.read_csv(
    submissions_csv,
    dtype=dtype_dictionary,
    keep_default_na=False,
    na_values=['na', ''],
    usecols=['author', 'created_utc', 'score'],
)

df_posts['created_utc'] = pd.to_datetime(df_posts['created_utc'], unit='s')

In [5]:
# Stack posts and comments into one activity table (timestamp + author).
# The original row labels of both frames are kept, so index values repeat.
df_all = pd.concat(
    (
        df_posts[['created_utc', 'author']],
        df_comments[['created_utc', 'author']],
    )
)

In [6]:
def plot_post_comment_over_time(d):
    """Plot the number of posts/comments per calendar day and print the total.

    Parameters
    ----------
    d : pandas.DataFrame
        One row per post or comment, with a datetime64 ``created_utc`` column.
        Rows with a missing timestamp are ignored.
    """
    # Truncate every timestamp to midnight and count rows per day. The former
    # `x.created_utc()` call is not a Timestamp method and raised
    # AttributeError. normalize() also keeps a DatetimeIndex, which asfreq()
    # below requires.
    daily_counts = d['created_utc'].dt.normalize().value_counts().sort_index()
    # Re-index onto a gap-free daily calendar. Days without activity get 0
    # instead of NaN so the histogram weights stay finite.
    daily_counts = daily_counts.asfreq('D', fill_value=0)

    plt.figure(figsize=(22, 5), dpi=200)
    # One bin per day, weighted by that day's count.
    plt.hist(
        daily_counts.index,
        weights=daily_counts.values,
        bins=int(len(daily_counts.values)),
        color='blue',
        edgecolor='none',
    )
    # The earlier 'wallstreetbets' title was immediately overridden by this
    # one, so only the effective title is set.
    plt.title('N posts/comments per day', fontsize=20)
    plt.xlabel('Date', fontsize=20)
    plt.ylabel('N posts/comments', fontsize=20)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()
    print('wallstreetbets', 'has', daily_counts.sum(), 'posts/comments')


plot_post_comment_over_time(df_all)

: 

: 