# Create summaries of what was scraped

The goal of this simple script is to get a summary of what was actually scraped.

This script was written by Gunnar Sjúrðarson Knudsen at TU Wien on 2022-05-27.

The data is used in connection with the courses 330.214 Project & Enterprise Financing and 194.060 Interdisciplinary Project, in collaboration with Professor Aussenegg.


## Setup

### Load required libraries

TODO: clean up this import list - quite certain most of these aren't needed.

In [1]:
# Standard
import pandas as pd
import numpy as np
import pickle

# Time Cleaning
import time
import datetime
from dateutil.relativedelta import relativedelta

# Scraping
import requests
import locale
from pandas.io.json import json_normalize
import io

from os.path import exists

# Cleansing
import pandas as pd
import locale

import os
from os import listdir
from os.path import isfile, join

from IPython.display import clear_output, display

## Count from Nasdaq

In [2]:
# Build a one-row-per-ticker summary of every scraped CSV found in
# PROCESSED_DATA_LOCATION and save it to processed_data/nasdaq_summary.csv.
# NOTE(review): the S&P 500 cell further down repeats this logic with other
# paths; a shared function taking both paths would remove the duplication.
PROCESSED_DATA_LOCATION = 'processed_data/nasdaq/'

# Summary column -> raw TradeType labels that are counted towards it.
# Some codes are matched under two spellings (presumably both occur in the
# scraped files - TODO confirm), so their counts are added together.
TRADE_TYPE_LABELS = {
    'P - Purchase (count)': ['P - Purchase'],
    'S - Sale (count)': ['S - Sale'],
    'S - Sale+OE': ['S - Sale+OE'],
    'A - Grant (count)': ['A - Grant'],
    'D - Sale to Iss (count)': ['D - Sale to Iss', 'D - Sale to issuer'],
    'G - Gift (count)': ['G - Gift'],
    'F - Tax (count)': ['F - Tax'],
    'M - Option Ex (count)': ['M - Option Ex', 'M - OptEx'],
    'X - Option Ex (count)': ['X - Option Ex', 'X - OptEx'],
    'C - Cnv Deriv (count)': ['C - Cnv Deriv', 'C - Converted deriv'],
    'W - Inherited (count)': ['W - Inherited'],
}

# Explicit column order, so the result has the same layout even when no CSV
# file is found and the record list is empty.
SUMMARY_COLUMNS = [
    'tickers',
    'trade_count',
    'min_filing_date',
    'max_filing_date',
    'min_trade_date',
    'max_trade_date',
    'n_distinct_traders',
    'n_distinct_trade_types',
    *TRADE_TYPE_LABELS,
]

records = []
for filename in os.listdir(PROCESSED_DATA_LOCATION):
    # Filter *before* reporting progress, so the running number equals the
    # number of summary rows rather than the number of directory entries.
    if not filename.endswith('.csv'):
        continue

    clear_output(wait=True)
    print(f'Reading file {len(records) + 1}: {filename}')

    dat = pd.read_csv(os.path.join(PROCESSED_DATA_LOCATION, filename), index_col=0)
    trade_types = dat['TradeType']

    record = {
        'tickers': os.path.splitext(filename)[0],  # file name without '.csv'
        'trade_count': len(dat),
        # min()/max() of an empty column yield NaN, which marks tickers
        # without any trades in the summary.
        'min_filing_date': dat['FilingDate'].min(),
        'max_filing_date': dat['FilingDate'].max(),
        'min_trade_date': dat['TradeDate'].min(),
        'max_trade_date': dat['TradeDate'].max(),
        'n_distinct_traders': dat['InsiderName'].nunique(),
        'n_distinct_trade_types': trade_types.nunique(),
    }
    # One vectorised pass per summary column instead of a Python-level
    # builtin sum() over every row for every label.
    for column, labels in TRADE_TYPE_LABELS.items():
        record[column] = int(trade_types.isin(labels).sum())

    records.append(record)

print('Done reading files. saving to csv')
res = pd.DataFrame(records, columns=SUMMARY_COLUMNS)
res.to_csv('processed_data/nasdaq_summary.csv')
print('Done saving to CSV')
display(res)

Reading file 7318: VANS.csv
Done reading files. saving to csv
Done saving to CSV


Unnamed: 0,tickers,trade_count,min_filing_date,max_filing_date,min_trade_date,max_trade_date,n_distinct_traders,n_distinct_trade_types,P - Purchase (count),S - Sale (count),S - Sale+OE,A - Grant (count),D - Sale to Iss (count),G - Gift (count),F - Tax (count),M - Option Ex (count),X - Option Ex (count),C - Cnv Deriv (count),W - Inherited (count)
0,ORN,217,2008-10-02 16:30:57,2022-05-23 17:22:52,2008-09-08,2022-05-19,16,7,23,26,12,126,0,0,18,7,5,0,0
1,HOUR,0,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0
2,PCT,43,2021-05-26 16:41:22,2022-05-13 06:05:22,2021-05-24,2022-05-11,16,5,8,1,0,26,0,3,5,0,0,0,0
3,SSTRF,0,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0
4,CTBC,0,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7313,EDEN,41,2003-10-30 12:37:40,2009-06-08 16:58:49,2003-10-28,2009-06-05,11,4,33,6,1,1,0,0,0,0,0,0,0
7314,HWEL,0,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0
7315,WTNY,395,2003-08-12 09:45:43,2011-03-03 16:10:30,2003-08-11,2011-03-01,35,7,74,6,36,101,0,38,71,69,0,0,0
7316,TLRK,5,2003-11-03 19:09:27,2004-02-19 18:43:31,2003-11-03,2004-02-18,2,3,1,2,0,0,0,2,0,0,0,0,0


## Look at S&P500

In [4]:
# Build a one-row-per-ticker summary of every scraped CSV found in
# PROCESSED_DATA_LOCATION and save it to processed_data/sop500_summary.csv.
# NOTE(review): this repeats the logic of the Nasdaq cell above with other
# paths; a shared function taking both paths would remove the duplication.
PROCESSED_DATA_LOCATION = 'processed_data/sop500/'

# Summary column -> raw TradeType labels that are counted towards it.
# Some codes are matched under two spellings (presumably both occur in the
# scraped files - TODO confirm), so their counts are added together.
TRADE_TYPE_LABELS = {
    'P - Purchase (count)': ['P - Purchase'],
    'S - Sale (count)': ['S - Sale'],
    'S - Sale+OE': ['S - Sale+OE'],
    'A - Grant (count)': ['A - Grant'],
    'D - Sale to Iss (count)': ['D - Sale to Iss', 'D - Sale to issuer'],
    'G - Gift (count)': ['G - Gift'],
    'F - Tax (count)': ['F - Tax'],
    'M - Option Ex (count)': ['M - Option Ex', 'M - OptEx'],
    'X - Option Ex (count)': ['X - Option Ex', 'X - OptEx'],
    'C - Cnv Deriv (count)': ['C - Cnv Deriv', 'C - Converted deriv'],
    'W - Inherited (count)': ['W - Inherited'],
}

# Explicit column order, so the result has the same layout even when no CSV
# file is found and the record list is empty.
SUMMARY_COLUMNS = [
    'tickers',
    'trade_count',
    'min_filing_date',
    'max_filing_date',
    'min_trade_date',
    'max_trade_date',
    'n_distinct_traders',
    'n_distinct_trade_types',
    *TRADE_TYPE_LABELS,
]

records = []
for filename in os.listdir(PROCESSED_DATA_LOCATION):
    # Filter *before* reporting progress, so the running number equals the
    # number of summary rows rather than the number of directory entries.
    if not filename.endswith('.csv'):
        continue

    clear_output(wait=True)
    print(f'Reading file {len(records) + 1}: {filename}')

    dat = pd.read_csv(os.path.join(PROCESSED_DATA_LOCATION, filename), index_col=0)
    trade_types = dat['TradeType']

    record = {
        'tickers': os.path.splitext(filename)[0],  # file name without '.csv'
        'trade_count': len(dat),
        # min()/max() of an empty column yield NaN, which marks tickers
        # without any trades in the summary.
        'min_filing_date': dat['FilingDate'].min(),
        'max_filing_date': dat['FilingDate'].max(),
        'min_trade_date': dat['TradeDate'].min(),
        'max_trade_date': dat['TradeDate'].max(),
        'n_distinct_traders': dat['InsiderName'].nunique(),
        'n_distinct_trade_types': trade_types.nunique(),
    }
    # One vectorised pass per summary column instead of a Python-level
    # builtin sum() over every row for every label.
    for column, labels in TRADE_TYPE_LABELS.items():
        record[column] = int(trade_types.isin(labels).sum())

    records.append(record)

print('Done reading files. saving to csv')
res = pd.DataFrame(records, columns=SUMMARY_COLUMNS)

res.to_csv('processed_data/sop500_summary.csv')
print('Done saving to CSV')
display(res)

Reading file 957: RCL.csv
Done reading files. saving to csv
Done saving to CSV


Unnamed: 0,tickers,trade_count,min_filing_date,max_filing_date,min_trade_date,max_trade_date,n_distinct_traders,n_distinct_trade_types,P - Purchase (count),S - Sale (count),S - Sale+OE,A - Grant (count),D - Sale to Iss (count),G - Gift (count),F - Tax (count),M - Option Ex (count),X - Option Ex (count),C - Cnv Deriv (count),W - Inherited (count)
0,MFE,310,2003-12-23 17:28:23,2011-02-25 17:32:50,2002-02-21,2011-02-24,27,8,2,16,74,77,1,5,61,74,0,0,0
1,CCL,1648,2003-08-01 10:46:45,2022-04-12 11:43:46,2000-02-24,2022-04-08,56,11,17,1017,65,332,5,48,83,76,1,3,1
2,EOP,215,2003-08-07 13:36:27,2007-01-24 15:41:34,2002-06-24,2007-01-22,25,8,0,9,20,80,54,3,31,17,0,1,0
3,STR,1072,2003-07-30 18:07:56,2016-09-14 17:30:29,2003-07-29,2016-09-13,45,8,10,19,228,236,4,25,302,248,0,0,0
4,FLT,980,2010-12-16 09:03:48,2022-03-30 22:00:12,2010-12-14,2022-03-28,43,9,11,257,198,212,4,9,61,221,0,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952,SLM,1087,2003-08-21 15:01:42,2022-04-22 17:02:08,2003-08-19,2022-04-20,59,9,55,90,59,389,7,45,342,99,1,0,0
953,UNM,1233,2003-12-12 08:32:45,2022-05-13 18:01:56,2003-12-10,2022-05-11,53,9,9,167,39,604,15,28,307,61,0,0,3
954,SRCL,850,2003-08-06 10:12:54,2022-05-03 16:15:09,2003-08-05,2022-05-02,46,8,27,77,164,9,0,57,153,358,0,5,0
955,AAPL,1585,2003-09-29 20:48:06,2022-05-13 18:31:28,2003-09-26,2022-05-06,44,7,6,206,347,53,0,81,267,625,0,0,0
