# TODO

## Data Sections
- Exp-to-Non exp productivity ratio matrix
- Domestic Owned Exporter / Non-Exporter KS [All Ps]
- Exporter / Non-Exporter KS for all sizes [All Ps]
- All Firms Exp / nExp KS [Comparison of Ps]


## Hypotheses
- Exporter >= Non-Exporter [All categories]
- Smaller firms are more sensitive to export status


In [33]:
import pandas as pd
import numpy as np
from scipy.stats import kstest

import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
fmt_df = pd.DataFrame(
    index=[
        "p10",
        "p25",
        "p50",
        "p75",
        "p90",
    ]
)
df = pd.read_csv("data.csv", encoding="utf-8", low_memory=False)
df.columns = df.columns.str.strip()

In [3]:
df.loc[df['Dimension'] == '2. Sizeband']

Unnamed: 0,Year,Metric,Dimension,Category,Status,Source,Mean,SD,p10,p25,p50,p75,p90,N
60,2011,GVA per worker,2. Sizeband,Large (250+),Exporter,ABS,96500,307500,18500,37000,64500,107000,176500,2150
61,2012,GVA per worker,2. Sizeband,Large (250+),Exporter,ABS,97500,255500,19000,37000,64500,107500,182000,2250
62,2013,GVA per worker,2. Sizeband,Large (250+),Exporter,ABS,96000,216500,20000,39000,67000,109500,178500,2400
63,2014,GVA per worker,2. Sizeband,Large (250+),Exporter,ABS,99000,207000,22500,40500,70000,114000,183500,2500
64,2015,GVA per worker,2. Sizeband,Large (250+),Exporter,ABS,97500,217000,24000,41000,68000,110500,184500,2600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2018,GVA per worker,2. Sizeband,Small (10-49),Non-Exporter,ABS-TiG,42500,100000,7000,14500,29000,54000,86000,140000
296,2019,GVA per worker,2. Sizeband,Small (10-49),Non-Exporter,ABS-TiG,43000,120500,6000,14000,30500,55000,89500,139300
297,2020,GVA per worker,2. Sizeband,Small (10-49),Non-Exporter,ABS-TiG,39500,144500,2000,8500,25000,50000,85500,144850
298,2021,GVA per worker,2. Sizeband,Small (10-49),Non-Exporter,ABS-TiG,44500,302000,3000,11000,28000,57500,97500,140700


In [19]:
kstest(
    df.loc[df['Status'] == 'Exporter', 'Mean'],
    df.loc[df['Status'] == 'Non-Exporter', 'Mean']
)

KstestResult(statistic=np.float64(0.6377777777777778), pvalue=np.float64(2.3803919609674216e-172), statistic_location=' 56,000 ', statistic_sign=np.int8(-1))

In [32]:
Qs = ["p10", "p25", "p50", "p75", "p90"]
sizes = ['Small (10-49)', 'Medium (50-249)', 'Large (250+)']
status = ["Exporter", "Non-Exporter"]

exp_nexp_df = fmt_df.copy()
exp_nexp_df['Small (10-49)'] = np.nan
exp_nexp_df['Medium (50-249)'] = np.nan
exp_nexp_df['Large (250+)'] = np.nan

for i, q in enumerate(Qs):
    for size in sizes:
        samp1 = df.loc[(df["Status"] == "Exporter") & (df['Category'] == size), q]
        samp2 = df.loc[(df["Status"] == "Non-Exporter") & (df['Category'] == size), q]
        exp_nexp_df.loc[q, size] = kstest(samp1, samp2).pvalue
print("Exporter vs Non-Exporter KS test per sizeband [PVAL]")
exp_nexp_df.round(6)

Exporter vs Non-Exporter KS test per sizeband [PVAL]


Unnamed: 0,Small (10-49),Medium (50-249),Large (250+)
p10,0.0,0.0,6e-06
p25,0.0,0.0,1e-06
p50,0.0,0.0,1e-06
p75,0.000293,2.4e-05,0.0009
p90,0.0,6e-06,0.0009


In [None]:
sizes = ['Small (10-49)', 'Medium (50-249)', 'Large (250+)']

sizecomp_exp_df = pd.DataFrame(
    columns=sizes,
    index=sizes
)

samp = df.loc[df['Status'] == 'Exporter']
for i in sizes:
    for j in sizes:
        samp1 = samp.loc[samp['Category'] == i, 'p50']
        samp2 = samp.loc[samp['Category'] == j, 'p50']
        sizecomp_exp_df.loc[i, j] = kstest(samp1, samp2).pvalue

print("Exporter Sizeband Comparison KS test [PVAL]")
sizecomp_exp_df.round(6)

Exporter Sizeband Comparison KS test [PVAL]


Unnamed: 0,Small (10-49),Medium (50-249),Large (250+)
Small (10-49),1.0,0.392945,1e-06
Medium (50-249),0.392945,1.0,0.000293
Large (250+),1e-06,0.000293,1.0


In [15]:
sizecomp_nexp_df = pd.DataFrame(
    columns=sizes,
    index=sizes
)

samp = df.loc[df['Status'] == 'Non-Exporter']
for i in sizes:
    for j in sizes:
        samp1 = samp.loc[samp['Category'] == i, 'p50']
        samp2 = samp.loc[samp['Category'] == j, 'p50']
        sizecomp_nexp_df.loc[i, j] = kstest(samp1, samp2).pvalue

print("Non-Exporter Sizeband Comparison KS test [PVAL]")
sizecomp_nexp_df.round(6)

Non-Exporter Sizeband Comparison KS test [PVAL]


Unnamed: 0,Small (10-49),Medium (50-249),Large (250+)
Small (10-49),1.0,1e-06,0.070888
Medium (50-249),1e-06,1.0,0.03458
Large (250+),0.070888,0.03458,1.0


In [28]:
Q_comp_df = pd.DataFrame(
    columns=['Foreign-Owned', 'Domestic-Owned'],
    index=Qs
)
for i in Qs:
    for j in Q_comp_df.columns:
        samp1 = df.loc[(df['Category'] == j) & (df['Status'] == 'Exporter'), i]
        samp2 = df.loc[(df['Category'] == j) & (df['Status'] == 'Non-Exporter'), i]
        Q_comp_df.loc[i, j] = kstest(samp1, samp2).pvalue

print("Exporter vs Non-Exporter KS test per ownership type [PVAL]")
Q_comp_df.round(6)

Exporter vs Non-Exporter KS test per ownership type [PVAL]


Unnamed: 0,Foreign-Owned,Domestic-Owned
p10,6e-06,1e-06
p25,0.0,0.0
p50,0.0,0.0
p75,0.594071,0.0
p90,0.594071,1e-06
