In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter, MultipleLocator

from scripts import utils

In [None]:
# Switch to the working directory; the relative "data/..." paths used when loading
# the parquet files resolve against it.
# The target is relative, so a plain os.chdir() would move two levels up *again*
# every time this cell is re-run. Remember the directory the kernel started in
# (first run only) and always resolve the target from there, which makes the
# cell idempotent while behaving identically on a fresh kernel.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "../../work/ktarlind"))

In [None]:
# Universe filter settings used by the yearly statistics loop.
# Per-date quantile of "market_cap_usd" a row has to exceed to be kept,
# for the NA files and for the global files respectively.
min_market_cap_percentile_na, min_market_cap_percentile_global = 0.6, 0.65
# Value of the "volume_usd_5" column a row has to exceed to be kept.
volume_usd_5_min = 1_000_000

In [None]:
# Empty per-year result tables; the yearly loop adds one row per year.
# Every statistic is stored twice: under its plain name for the NA files and
# with a "_g" suffix for the global files.
_mc_stat_names = ["avg_mc", "bot_5_pct_mc", "avg_mc_cap", "min_mc_cap", "bot_5_pct_mc_cap"]
_n_stat_names = ["num", "num_cap_100", "num_cap_50", "num_cap_20"]

data_mc_stats = pd.DataFrame(
    data=[],
    columns=_mc_stat_names + [f"{name}_g" for name in _mc_stat_names],
)
data_n_stats = pd.DataFrame(
    data=[],
    columns=_n_stat_names + [f"{name}_g" for name in _n_stat_names],
)

In [None]:
def _yearly_universe_stats(frame, min_mc_percentile, min_volume_usd):
    """Summarise one yearly data set before and after the universe filters.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the columns "date", "gvkey", "market_cap_usd" and
        "volume_usd_5".
    min_mc_percentile : float
        Per-date quantile of "market_cap_usd" that a row must exceed to be kept.
    min_volume_usd : float
        Value of "volume_usd_5" that a row must exceed to be kept.

    Returns
    -------
    mc_stats : list
        [median cap, 5% quantile of cap, median cap after filtering,
        median over dates of the smallest filtered cap, 5% quantile of the
        filtered cap] -- the column order of ``data_mc_stats``.
    n_stats : list of int
        [gvkeys with >= 50 rows before filtering, gvkeys with >= 100, >= 50
        and >= 20 rows after filtering] -- the column order of ``data_n_stats``.
    capped : pd.DataFrame
        The rows of ``frame`` that pass both filters, with a fresh RangeIndex.
    """
    market_cap = frame["market_cap_usd"]

    # Broadcast each date's market-cap cutoff back onto its rows and filter with
    # a boolean mask. This selects the same rows as
    # groupby("date").apply(lambda x: x[x[cap] > x[cap].quantile(q)]) but does not
    # rely on apply() seeing the grouping column, which pandas 2.2 deprecates.
    daily_cutoff = frame.groupby("date")["market_cap_usd"].transform(
        lambda caps: caps.quantile(min_mc_percentile)
    )
    keep = (market_cap > daily_cutoff) & (frame["volume_usd_5"] > min_volume_usd)
    capped = frame[keep].reset_index(drop=True)
    capped_cap = capped["market_cap_usd"]

    # Rows per gvkey; computed once and reused for every threshold.
    rows_per_key = frame["gvkey"].value_counts()
    capped_rows_per_key = capped["gvkey"].value_counts()

    # NOTE(review): the 5% quantiles are taken over all rows of the year. The
    # previous code appended .mean() to the scalar result, which was a no-op;
    # if a mean of *daily* 5% quantiles was intended, group by "date" first.
    mc_stats = [
        market_cap.median(),
        market_cap.quantile(0.05),
        capped_cap.median(),
        capped.groupby("date")["market_cap_usd"].min().median(),
        capped_cap.quantile(0.05),
    ]
    n_stats = [
        int((rows_per_key >= 50).sum()),
        int((capped_rows_per_key >= 100).sum()),
        int((capped_rows_per_key >= 50).sum()),
        int((capped_rows_per_key >= 20).sum()),
    ]
    return mc_stats, n_stats, capped


# One row per year in data_mc_stats / data_n_stats: NA columns first, then the
# "_g" (global file) columns.
for year in range(2001, 2024):
    na_data = pd.read_parquet(f'data/na_data_{year}only_processed.parquet', engine='pyarrow')
    global_data = pd.read_parquet(f'data/global_data_{year}only_processed.parquet', engine='pyarrow')

    na_mc_stats, na_n_stats, na_data_capped = _yearly_universe_stats(
        na_data, min_market_cap_percentile_na, volume_usd_5_min
    )
    # Fixes two defects of the hand-copied global branch: bot_5_pct_mc_g was
    # computed from na_data, and bot_5_pct_mc_cap_g was never assigned although
    # it is stored below (NameError on a fresh kernel).
    global_mc_stats, global_n_stats, global_data_capped = _yearly_universe_stats(
        global_data, min_market_cap_percentile_global, volume_usd_5_min
    )

    data_mc_stats.loc[year] = na_mc_stats + global_mc_stats
    data_n_stats.loc[year] = na_n_stats + global_n_stats
    


In [None]:
# Combined security counts: NA count plus "_g" count, for each minimum-row
# threshold used in the yearly loop.
for _min_rows in (100, 50, 20):
    _na_count = data_n_stats[f"num_cap_{_min_rows}"]
    _global_count = data_n_stats[f"num_cap_{_min_rows}_g"]
    data_n_stats[f"num_cap_tot_{_min_rows}"] = _na_count + _global_count

In [None]:
# Yearly market-cap statistics of the filtered data: median cap and the median
# of the per-date minimum cap, for the NA and the "_g" columns.
mc_plot = data_mc_stats[["avg_mc_cap", "avg_mc_cap_g", "min_mc_cap", "min_mc_cap_g"]].plot(figsize=(18, 8), logy=False)

# Legend labels in the same order as the plotted columns.
mc_labels = [
    "Median, NA data",
    "Median, ROW data",
    "Median of daily min, NA data",
    "Median of daily min, ROW data",
]
for line, label in zip(mc_plot.lines, mc_labels):
    line.set(label=label, linewidth=3)

# Format the y ticks through a formatter rather than freezing label strings with
# set_yticklabels(): fixed labels are not tied to tick positions, so they go
# stale as soon as the tick locations change, and matplotlib warns about it.
mc_plot.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: str(int(value / 1000000)) + " M USD")
)

mc_plot.xaxis.set_minor_locator(MultipleLocator(1))
# Explicitly switch the grid on; a bare grid() call only toggles its state.
mc_plot.grid(True)
mc_plot.set_xlabel("Year", fontsize=16)
mc_plot.set_ylabel("Market Cap", fontsize=16)
mc_plot.tick_params(axis='both', which='major', labelsize=14)
mc_plot.legend(fontsize=16)
mc_plot.set_xlim(2000, 2024)
#mc_plot.figure.savefig("figures/MarketCapData.pdf", dpi=1000, bbox_inches='tight')

In [None]:
# Yearly number of securities that pass the filters, by minimum number of rows
# per year. Each entry pairs a data_n_stats column with its legend label; the
# *_50 variants exist in the table but are deliberately left out of the figure.
_count_series = [
    ("num_cap_tot_100", "Global data (min. 100 days)"),
    ("num_cap_tot_20", "Global data (min. 20 days)"),
    ("num_cap_100_g", "ROW data (min. 100 days)"),
    ("num_cap_20_g", "ROW data (min. 20 days)"),
    ("num_cap_100", "NA data (min. 100 days)"),
    ("num_cap_20", "NA data (min. 20 days)"),
]
_count_columns = [column for column, _ in _count_series]

mc_plot = data_n_stats[_count_columns].plot(figsize=(18, 8), logy=False)

# Lines are created in column order, so index i matches _count_series[i].
for _idx, (_, _legend_text) in enumerate(_count_series):
    mc_plot.lines[_idx].set(label=_legend_text, linewidth=3)

# Axis cosmetics: yearly minor ticks, grid, labels, legend and fixed y range.
mc_plot.xaxis.set_minor_locator(MultipleLocator(1))
plt.grid()
mc_plot.set_xlabel("Year", fontsize=16)
mc_plot.set_ylabel("Unique securities", fontsize=16)
mc_plot.tick_params(axis='both', which='major', labelsize=14)
mc_plot.legend(fontsize=16)
mc_plot.set_ylim(1000, 14000)
#mc_plot.figure.savefig("figures/MarketCapDataN.pdf", dpi=1000, bbox_inches='tight')