In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.linear_model import OLS
from linearmodels import PanelOLS
from linearmodels.iv.model import IV2SLS
from linearmodels.panel.results import compare

# Root folder that holds the spark/, gtd/ and instrument/ inputs and the
# testing/ output used below. Set the DIPLOMA_DATA_DIR environment variable to
# point the notebook at another machine's copy; the fallback is the original
# local path, so existing runs behave exactly as before.
data_dir = os.environ.get("DIPLOMA_DATA_DIR", "/Users/mac/Desktop/Study/Diploma/data")

In [2]:
def return_unique(x):
    """Return how many distinct values the Series ``x`` contains.

    A missing value is counted as one distinct value, because ``unique()``
    keeps it in its result.
    """
    distinct_values = x.unique()
    return len(distinct_values)

In [3]:
# --- Input locations -------------------------------------------------------
spark_path = os.path.join(data_dir, "spark/nxt_spark_data.parquet")
gtd_path = os.path.join(data_dir, "gtd/gtd_processed")
iv_path = os.path.join(data_dir, "instrument/iv.parquet")

# --- Firm-level table ------------------------------------------------------
spark_df = pd.read_parquet(spark_path)

# Drop rows whose industry code is the literal string 'nan' or 'None'.
spark_df = spark_df.loc[~spark_df["okved_four"].isin(['nan', 'None'])]

# --- Instrument ------------------------------------------------------------
# instrument = weight_c * tariff per row; delta_t is the change in tariff
# between consecutive rows (ordered by year) of each (okved_four, product)
# series. This replaces a Python loop over ~43k groups with one vectorised
# groupby.
iv_group_keys = ["okved_four", "product"]
iv_df = (
    pd.read_parquet(iv_path)
    .assign(instrument=lambda x: x["weight_c"] * x["tariff"])
    # Iterating over a groupby skips rows with a missing key, so the loop
    # silently discarded them; drop them explicitly to keep the same rows.
    .dropna(subset=iv_group_keys)
    .sort_values(by=iv_group_keys + ["year"])
    .assign(delta_t=lambda x: x.groupby(iv_group_keys)["tariff"].diff(1))
)
# NOTE(review): delta_t is not used by the aggregation below; it only feeds an
# alternative instrument (weight * delta_t) that is currently disabled.

# Collapse to one instrument value per industry-year.
iv_df = iv_df.groupby(["okved_four", "year"])["instrument"].sum().reset_index()

# --- Customs tables --------------------------------------------------------
# Sorted so the concatenation order does not depend on the filesystem.
tables = sorted(os.listdir(gtd_path))

gtd_frames = []
for table in tables:
    table_df = pd.read_parquet(os.path.join(gtd_path, table))
    # NOTE(review): INN > 100 presumably removes placeholder tax IDs - confirm.
    table_df = (
        table_df.loc[table_df["INN"] > 100]
        .groupby(["INN", "year"]).agg({"code": return_unique})
        .reset_index().rename(columns={"code": "num_countries"})
    )
    gtd_frames.append(table_df)

gtd_df = pd.concat(gtd_frames)

print(len(spark_df), len(gtd_df), len(iv_df))

# Left join: firms without a customs record keep a missing num_countries.
df = spark_df.merge(
    gtd_df,
    left_on=["INN", "Year"],
    right_on=["INN", "year"],
    how="left"
).drop(columns=["year"])

# Inner join: only industry-years that have an instrument value survive.
df = df.merge(
    iv_df,
    left_on=["okved_four", "Year"],
    right_on=["okved_four", "year"],
    how="inner"
).drop(columns=["year"])

100%|██████████| 43094/43094 [00:09<00:00, 4622.23it/s]


2335164 139420 2397


In [4]:
# Build the analysis sample `data` from the merged frame `df`: derive the
# balance-sheet ratios, apply the sample filters, and construct the export
# history variables (num_countries_prev, countries_diff, exposure).

# Every ratio below divides by assets, so keep strictly positive assets only.
filter_cond = (df.assets > 0.)

data = df.loc[filter_cond]\
        .sort_values(by=['INN', 'Year'])\
        .assign(
            short_leverage=lambda x: x.short_debt / x.assets, 
            long_leverage=lambda x: x.long_debt / x.assets, 
            leverage=lambda x: x.debt / x.assets, 
            log_assets=lambda x: np.log(x.assets),
            tangibility=lambda x: x.tang_assets / x.assets, 
            profitability=lambda x: x.revenue / x.assets
        )

# Sample restrictions: non-negative short/long leverage, total leverage of at
# most 1, positive revenue, and assets inside [2100, 871880628).
# NOTE(review): the two asset bounds are undocumented magic numbers,
# presumably outlier cut-offs - confirm their origin.
filter_cond = (
    (data['short_leverage'] >= 0.) &
    (data['long_leverage'] >= 0.) &
    (data['leverage'] <= 1.) &
    (data.revenue > 0.0) &
    (data.assets >= 2100) &
    (data.assets < 871880628)
)

data = data.loc[filter_cond]

# Calendar grid 2004..2009 inclusive (np.arange excludes the upper bound).
years = pd.DataFrame(np.arange(2004, 2010), columns=['Year'])

# For every firm that has at least one non-missing num_countries, lay its rows
# onto the full year grid (right merge), treat years without a record as zero
# destinations, and take the previous grid row as the lagged value. assign()
# evaluates its keywords in order, so the shift sees the zero-filled column.
# Rows created only by the grid have a missing INN and are dropped afterwards.
# NOTE(review): the lag is "previous year" only if each firm has at most one
# row per year - confirm (INN, Year) is unique at this point.
export_data = []
for _, item_df in tqdm(data.loc[~data.num_countries.isnull()].groupby('INN')):
    item_df = item_df.merge(years, on='Year', how='right')\
                .assign(
                    num_countries=lambda x: x.num_countries.fillna(0.0), 
                    num_countries_prev=lambda x: x.num_countries.shift(1),
                ).dropna(subset=['INN'])

    export_data.append(item_df)

# Change in the number of destinations relative to the previous grid row.
export_data = pd.concat(export_data)\
        .assign(
            countries_diff=lambda x: x.num_countries - x.num_countries_prev,
        )[['INN', 'Year', 'num_countries_prev', 'countries_diff']]

# Attach the export history to every firm-year. Firm-years without a match get
# zeros, and exposure is 1 when the number of destinations increased.
data = data.merge(export_data, on=['INN', 'Year'], how='left')\
        .assign(
            num_countries_prev=lambda x: x.num_countries_prev.fillna(0.0),
            num_countries_prev_log=lambda x: np.log(1 + x.num_countries_prev),
            countries_diff=lambda x: x.countries_diff.fillna(0.0), 
            exposure=lambda x: 1 * (x.countries_diff > 0.0)
        )

# Keep 2006 onwards.
data = data[data.Year > 2005]

# Rescale the instrument by 1/100.
data = data.assign(instrument=lambda x: x.instrument / 100.)

data.head()

100%|██████████| 17203/17203 [00:19<00:00, 900.95it/s]


Unnamed: 0,INN,OKVED,Year,tang_assets,assets,short_debt,revenue,opex,profit,long_debt,...,short_leverage,long_leverage,leverage,log_assets,tangibility,profitability,num_countries_prev,countries_diff,num_countries_prev_log,exposure
1,100001826,68.20.2,2006,3384000.0,14965000.0,3161000.0,55631000.0,941000.0,4626000.0,851000.0,...,0.211226,0.056866,0.268092,16.521225,0.226128,3.717407,0.0,0.0,0.0,0
2,100001826,68.20.2,2007,3121000.0,16417000.0,1806000.0,50909000.0,245000.0,3846000.0,706000.0,...,0.110008,0.043004,0.153012,16.613828,0.190108,3.100993,0.0,0.0,0.0,0
3,100001826,68.20.2,2008,4022000.0,19207000.0,1144000.0,56290000.0,118000.0,5799000.0,504000.0,...,0.059562,0.02624,0.085802,16.770785,0.209403,2.930702,0.0,0.0,0.0,0
5,100002227,96.01,2006,1403000.0,5251000.0,417000.0,4992000.0,125000.0,236000.0,0.0,...,0.079413,0.0,0.079413,15.473929,0.267187,0.950676,0.0,0.0,0.0,0
6,100002227,96.01,2007,867000.0,6941000.0,968000.0,5292000.0,108000.0,304000.0,1350000.0,...,0.139461,0.194496,0.333958,15.752956,0.12491,0.762426,0.0,0.0,0.0,0


In [5]:
# Pairwise correlation between the instrument and the number of destinations.
data.loc[:, ["instrument", "num_countries"]].corr()

Unnamed: 0,instrument,num_countries
instrument,1.0,0.038579
num_countries,0.038579,1.0


In [6]:
# Distribution of the instrument within each exposure group.
data.groupby("exposure")["instrument"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
exposure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,910915.0,1.433636,1.222289,0.0,0.479479,1.24294,2.014483,7.015656
1,13698.0,2.000912,1.317196,0.0,1.138201,1.684669,2.622908,7.015656


In [7]:
# Number of firm-year rows (non-missing INN) in each exposure group.
data.groupby("exposure")["INN"].count()

exposure
0    910915
1     13698
Name: INN, dtype: int64

In [8]:
# Distribution of leverage within each exposure group.
data.groupby("exposure")["leverage"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
exposure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,910915.0,0.569541,0.334114,0.0,0.261313,0.620438,0.892916,1.0
1,13698.0,0.669934,0.290279,0.0,0.453615,0.756282,0.927326,1.0


In [11]:
# First stage: regress the exposure dummy on the instrument and the firm
# controls with year effects and heteroskedasticity-robust errors.
target, reg = "exposure", "instrument"
controls = ['log_assets', 'tangibility', 'profitability', 'num_countries_prev_log']
cols = [reg, *controls]

# Complete cases only. "leverage" is carried along so that the outcome
# equation can be estimated on exactly the same rows. Every selected column is
# part of the check, so a plain dropna() is equivalent to listing them.
to_study = data.set_index(['INN', 'Year'])[[*cols, target, "leverage"]].dropna()

exog = sm.add_constant(to_study[cols])
model = PanelOLS(dependent=to_study[target], exog=exog, time_effects=True)
result = model.fit(cov_type='robust')

result.summary

0,1,2,3
Dep. Variable:,exposure,R-squared:,0.0698
Estimator:,PanelOLS,R-squared (Between):,0.1268
No. Observations:,923798,R-squared (Within):,-0.1069
Date:,"Mon, May 05 2025",R-squared (Overall):,0.0698
Time:,18:34:48,Log-likelihood,6.747e+05
Cov. Estimator:,Robust,,
,,F-statistic:,1.387e+04
Entities:,466482,P-value,0.0000
Avg Obs:,1.9804,Distribution:,"F(5,923790)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-0.0652,0.0010,-65.485,0.0000,-0.0671,-0.0632
instrument,0.0037,0.0001,32.489,0.0000,0.0035,0.0039
log_assets,0.0040,5.943e-05,67.609,0.0000,0.0039,0.0041
tangibility,0.0118,0.0004,31.310,0.0000,0.0111,0.0126
profitability,4.225e-08,2.86e-08,1.4770,0.1397,-1.382e-08,9.831e-08
num_countries_prev_log,0.1755,0.0029,60.572,0.0000,0.1699,0.1812


In [10]:
# Manual second stage: replace the instrument with the first-stage fitted
# exposure (exp_pred) and regress leverage on it plus the controls, with firm
# and year effects.
#
# The outputs get their own names so that the first-stage `exog` and `result`
# stay intact. Overwriting them made the cell fail when run a second time: the
# "instrument" column was already gone and `result` was no longer the first
# stage.
#
# NOTE(review): standard errors from a hand-rolled second stage do not account
# for exp_pred being estimated, and the first stage has no firm effects while
# this stage does. Treat the inference as indicative and prefer a dedicated IV
# estimator for reported results.
second_stage_exog = exog.assign(exp_pred=result.predict(exog)).drop(columns=["instrument"])
model = PanelOLS(to_study["leverage"], second_stage_exog, entity_effects=True, time_effects=True)
second_stage_result = model.fit(cov_type='robust')

second_stage_result.summary

0,1,2,3
Dep. Variable:,leverage,R-squared:,0.1508
Estimator:,PanelOLS,R-squared (Between):,-0.1267
No. Observations:,923798,R-squared (Within):,0.1312
Date:,"Mon, May 05 2025",R-squared (Overall):,-0.1297
Time:,18:05:21,Log-likelihood,8.781e+05
Cov. Estimator:,Robust,,
,,F-statistic:,1.625e+04
Entities:,466482,P-value,0.0000
Avg Obs:,1.9804,Distribution:,"F(5,457309)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-0.9673,0.0306,-31.587,0.0000,-1.0273,-0.9073
log_assets,0.0962,0.0020,48.112,0.0000,0.0923,0.1001
tangibility,0.0504,0.0074,6.8127,0.0000,0.0359,0.0649
profitability,-1.033e-06,8.565e-07,-1.2063,0.2277,-2.712e-06,6.456e-07
num_countries_prev_log,-0.2771,0.0801,-3.4606,0.0005,-0.4340,-0.1201
exp_pred,1.5070,0.4527,3.3289,0.0009,0.6197,2.3943


In [11]:
# Preview the first five rows of the analysis sample.
data.head(n=5)

Unnamed: 0,INN,OKVED,Year,tang_assets,assets,short_debt,revenue,opex,profit,long_debt,...,short_leverage,long_leverage,leverage,log_assets,tangibility,profitability,num_countries_prev,countries_diff,num_countries_prev_log,exposure
1,100001826,68.20.2,2006,3384000.0,14965000.0,3161000.0,55631000.0,941000.0,4626000.0,851000.0,...,0.211226,0.056866,0.268092,16.521225,0.226128,3.717407,0.0,0.0,0.0,0
2,100001826,68.20.2,2007,3121000.0,16417000.0,1806000.0,50909000.0,245000.0,3846000.0,706000.0,...,0.110008,0.043004,0.153012,16.613828,0.190108,3.100993,0.0,0.0,0.0,0
3,100001826,68.20.2,2008,4022000.0,19207000.0,1144000.0,56290000.0,118000.0,5799000.0,504000.0,...,0.059562,0.02624,0.085802,16.770785,0.209403,2.930702,0.0,0.0,0.0,0
5,100002227,96.01,2006,1403000.0,5251000.0,417000.0,4992000.0,125000.0,236000.0,0.0,...,0.079413,0.0,0.079413,15.473929,0.267187,0.950676,0.0,0.0,0.0,0
6,100002227,96.01,2007,867000.0,6941000.0,968000.0,5292000.0,108000.0,304000.0,1350000.0,...,0.139461,0.194496,0.333958,15.752956,0.12491,0.762426,0.0,0.0,0.0,0


In [13]:
# Pooled 2SLS: leverage on exposure, with exposure instrumented by the
# instrument, plus the firm controls and a constant; robust covariance.
# NOTE(review): the reported fit has a strongly negative R-squared and an
# exposure coefficient far outside the [0, 1] range of leverage - check
# instrument strength before relying on these estimates.
target, reg = "leverage", "exposure"
controls = ['log_assets', 'tangibility', 'profitability', 'num_countries_prev_log']
instrument = "instrument"

cols = [reg, *controls, instrument]

# Complete cases over every column used by the model. All selected columns are
# checked, so a plain dropna() is equivalent to listing them.
to_study = data.set_index(['INN', 'Year'])[[*cols, target]].dropna()

exog = sm.add_constant(to_study[controls])
endog = to_study[reg]
# From here on `instrument` holds the one-column frame of instrument values.
instrument = to_study[["instrument"]]

model = IV2SLS(dependent=to_study[target], exog=exog, endog=endog, instruments=instrument)
result = model.fit(cov_type='robust')

result.summary

0,1,2,3
Dep. Variable:,leverage,R-squared:,-4.8710
Estimator:,IV-2SLS,Adj. R-squared:,-4.8710
No. Observations:,923798,F-statistic:,2.181e+04
Date:,"Mon, May 05 2025",P-value (F-stat),0.0000
Time:,18:35:37,Distribution:,chi2(5)
Cov. Estimator:,robust,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,0.0280,0.0141,1.9853,0.0471,0.0004,0.0556
log_assets,0.0168,0.0009,18.506,0.0000,0.0150,0.0186
tangibility,0.2662,0.0037,71.843,0.0000,0.2589,0.2734
profitability,-1.55e-06,9.011e-07,-1.7197,0.0855,-3.316e-06,2.165e-07
num_countries_prev_log,-1.1934,0.0415,-28.748,0.0000,-1.2748,-1.1121
exposure,6.4241,0.2078,30.910,0.0000,6.0167,6.8314


In [14]:
# Save the data for Stata.
#
# The columns are listed explicitly instead of being assembled from the `cols`
# variable left behind by the IV cell. That list already contains "exposure",
# so prepending "exposure" again selected the column twice and the exported
# CSV ended up with two identical "exposure" columns.
export_columns = [
    "INN", "Year", "okved_four", "exporting", "exposure",
    "log_assets", "tangibility", "profitability", "num_countries_prev_log",
    "instrument", "leverage", "short_leverage", "long_leverage",
]

# exporting = 1 when the firm-year has a positive number of destinations
# (a missing num_countries compares as False and therefore yields 0).
to_study = (
    data.assign(exporting=lambda x: 1 * (x["num_countries"] > 0))
    .loc[:, export_columns]
    .dropna()
    .drop_duplicates(subset=["INN", "Year"])
)

# Replace the tax ID with a sequential firm_id, numbered from 0 in order of
# first appearance.
inns = to_study.INN.unique()

inns = pd.DataFrame(inns, columns=["INN"])\
        .reset_index().rename(columns={"index": "firm_id"})

to_study = to_study.merge(inns, on=["INN"], how="inner")

colss = ['firm_id', 'Year', 'okved_four', 'exporting', 'exposure', 'log_assets',
       'tangibility', 'profitability', 'num_countries_prev_log', 'instrument',
       'leverage', 'short_leverage', 'long_leverage']

stata_csv_path = os.path.join(data_dir, "testing/cur_spark_only_v1_test.csv")
# Create the output folder on a fresh checkout so the write does not fail.
os.makedirs(os.path.dirname(stata_csv_path), exist_ok=True)
to_study[colss].to_csv(stata_csv_path, index=False)

In [18]:
# Display the full path of the CSV written for Stata.
os.path.join(data_dir, "testing/cur_spark_only_v1_test.csv")

'/Users/mac/Desktop/Study/Diploma/data/testing/cur_spark_only_v1_test.csv'

In [17]:
# Pairwise correlation between the instrument and the exporting dummy in the
# exported sample.
to_study.loc[:, ["instrument", "exporting"]].corr()

Unnamed: 0,instrument,exporting
instrument,1.0,-0.020019
exporting,-0.020019,1.0


In [None]:
# Scratch cell (never executed): rebuild the row-level instrument table with a
# per-(okved_four, product) tariff difference.
# NOTE(review): this uses the "weight" column, whereas the load cell uses
# "weight_c" - confirm which one is intended.
# NOTE(review): `result` is reused for the list of frames and overwrites the
# regression result held under the same name.
iv_path = os.path.join(data_dir, "instrument/iv.parquet")

iv_df = pd.read_parquet(iv_path).assign(
    instrument=lambda x: x["weight"] * x["tariff"],
    okved_four=lambda x: x["okved_four"].astype(float),
)

# One frame per (okved_four, product) series, ordered by year, with delta_t as
# the difference between consecutive tariff values.
result = [
    series_df.sort_values(by=["year"]).assign(delta_t=lambda x: x.tariff.diff(1))
    for _, series_df in tqdm(iv_df.groupby(["okved_four", "product"]))
]

iv_df = pd.concat(result)