In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from functools import partial

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.linear_model import OLS
from linearmodels import PanelOLS
from linearmodels.iv.model import IV2SLS
from linearmodels.panel.results import compare

data_dir = "/Users/mac/Desktop/Study/Diploma/data"

Основная подготовка данных проделана в cur_spark_ruslana_v2.ipynb!!!

# Filtering Countries

In [2]:
def dummy_country(x, code):
    return 1 * np.any(x == code)

def return_unique(x):
    return len(x.unique())

In [3]:
PARTNERS = 25
gtd_path = os.path.join(data_dir, "gtd/gtd_processed")
WITS_path = os.path.join(data_dir, "countries", "WITS_codes.xlsx")

df = pd.read_parquet(os.path.join(gtd_path, "gtd2005.parquet"))
df.columns = [item.lower() for item in df.columns]

WITS_df = pd.read_excel(WITS_path).drop(columns="ISO3")

In [4]:
top_countries = df.groupby("code").agg({"inn": return_unique})\
                .sort_values(by="inn", ascending=False).iloc[:PARTNERS]\
                .merge(WITS_df, on="code").loc[:,["code", "country"]]

In [5]:
years= [2005, 2006, 2007, 2008, 2009]

result = top_countries
for year in tqdm(years):
    item_df = pd.read_parquet(os.path.join(gtd_path, f"gtd{year}.parquet"))
    item_df.columns = [item.lower() for item in item_df.columns]
    
    item_df = item_df.groupby("code").agg({"inn": return_unique})\
            .rename(columns={"inn": "partners_{}".format(year)})
    
    result = result.merge(item_df, on="code", how="inner")

result

100%|██████████| 5/5 [00:00<00:00,  8.13it/s]


Unnamed: 0,code,country,partners_2005,partners_2006,partners_2007,partners_2008,partners_2009
0,398,Kazakhstan,8999,9519,10016,9968,11018
1,804,Ukraine,5902,6006,5871,5915,5468
2,156,China,2937,2735,2541,2195,2694
3,276,Germany,2625,2625,2723,2696,2984
4,440,Lithuania,2465,2382,2185,1890,1689
5,860,Uzbekistan,2428,2417,2744,2784,3143
6,428,Latvia,2351,2215,2176,1842,1620
7,233,Estonia,1958,1849,1628,1128,1125
8,31,Azerbaijan,1877,1879,2137,2258,2417
9,246,Finland,1738,1592,1396,1269,1393


In [6]:
top_cty_list = top_countries.code.unique()
funcs = [
    (f"{code}", partial(dummy_country, code=code))
    for code in top_cty_list
]

result = []
for year in tqdm(years):
    item_df = pd.read_parquet(os.path.join(gtd_path, f"gtd{year}.parquet"))
    item_df.columns = [item.lower() for item in item_df.columns]
    
    item_df = item_df.groupby("inn").agg({"code": funcs})\
                .assign(year=year)

    item_df.columns = ["_".join(item).rstrip("_") for item in item_df]
    
    result.append(item_df)

result = pd.concat(result).reset_index()

100%|██████████| 5/5 [01:27<00:00, 17.59s/it]


In [7]:
result.head()

Unnamed: 0,inn,code_398,code_804,code_156,code_276,code_440,code_860,code_428,code_233,code_31,...,code_792,code_392,code_51,code_762,code_528,code_826,code_410,code_100,code_250,year
0,0,1,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,2005
1,1,1,1,0,1,0,1,0,1,1,...,0,0,0,1,0,0,0,0,0,2005
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2005
3,104003856,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2005
4,105000304,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2005


In [8]:
result.to_parquet(os.path.join(data_dir, "gtd/customs_advanced_v2.parquet"), index=False)

# Data Preparation

In [27]:
def return_unique(x):
    return len(x.unique())

top_cty_list = [398, 804, 156, 276, 440, 860, 428, 233,  31, 246, 498, 840, 616,
       417, 268, 380, 792, 392,  51, 762, 528, 826, 410, 100, 250]

In [28]:
spark_path = os.path.join(data_dir, "spark/nxt_spark_data.parquet")
ruslana_path = os.path.join(data_dir, "ruslana/ruslana.parquet")
gtd_path = os.path.join(data_dir, "gtd/gtd_processed")
iv_path = os.path.join(data_dir, "instrument/iv_advanced.parquet")
gtd_advanced_path = os.path.join(data_dir, "gtd/customs_advanced_v2.parquet")

os.listdir(gtd_path)
spark_df = pd.read_parquet(spark_path)

spark_df.columns = [item.lower() for item in spark_df.columns]
            
spark_df = spark_df.loc[~spark_df["okved_four"].isin(['nan', 'None'])]
iv_df = pd.read_parquet(iv_path)

tables = os.listdir(gtd_path)
gtd_df = []
for table in tables:
    df = pd.read_parquet(os.path.join(gtd_path, table))
    df = df.loc[df["INN"] > 100]\
            .groupby(["INN", "year"]).agg({"code": return_unique})\
            .reset_index().rename(columns={"code": "num_countries"})
    gtd_df.append(df)
    
gtd_df = pd.concat(gtd_df)
gtd_df.columns = [item.lower() for item in gtd_df.columns]

ruslana_df = pd.read_parquet(ruslana_path)

print(len(spark_df), len(gtd_df), len(ruslana_df), len(iv_df))

df = spark_df.merge(ruslana_df, on=["inn", "year"], how="inner")
print(len(df))
df = df.merge(gtd_df, on=["inn", "year"], how="left")
df = df.merge(iv_df, on=["okved_four", "year"], how="inner")
print(len(df))

df = df.drop_duplicates(["inn", "year"])

print(len(df))

2335164 139420 3946725 2373
1858619
1785827
1305256


In [29]:
filter_cond = (df.assets > 0.)

data = df.loc[filter_cond]\
        .sort_values(by=["inn", "year"])\
        .assign(
            short_leverage=lambda x: x.short_debt / x.assets, 
            long_leverage=lambda x: x.long_debt / x.assets, 
            leverage=lambda x: x.debt / x.assets, 
            log_assets=lambda x: np.log(x.assets),
            tangibility=lambda x: x.tang_assets / x.assets, 
            profitability=lambda x: x.revenue / x.assets
        )

print("Assets more then 0: {}".format(len(data)))

filter_cond = (
    (data["short_leverage"] >= 0.) &
    (data["long_leverage"] >= 0.) &
    (data["leverage"] <= 1.) &
    (data.revenue > 0.0) &
    (data.assets >= 2100) &
    (data.assets < 871880628) &
    (data.empl >= 5.0)
)

data = data.loc[filter_cond]
print("Employees no less than 5: {}".format(len(data)))

years = pd.DataFrame(np.arange(2004, 2010), columns=["year"])

export_data = []
for _, item_df in tqdm(data.loc[~data.num_countries.isnull()].groupby("inn")):
    item_df = item_df.merge(years, on="year", how="right")\
                .assign(
                    num_countries=lambda x: x.num_countries.fillna(0.0), 
                    num_countries_prev=lambda x: x.num_countries.shift(1),
                ).dropna(subset=["inn"])

    export_data.append(item_df)

export_data = pd.concat(export_data)\
        .assign(
            countries_diff=lambda x: x.num_countries - x.num_countries_prev,
        )[["inn", "year", "num_countries_prev", "countries_diff"]]

data = data.merge(export_data, on=["inn", "year"], how="left")\
        .assign(
            num_countries_prev=lambda x: x.num_countries_prev.fillna(0.0),
            num_countries_prev_log=lambda x: np.log(1 + x.num_countries_prev),
        )

data = data[data.year > 2005]

print(len(data))

customs_advanced = pd.read_parquet(gtd_advanced_path)
data = data.merge(customs_advanced, on=["inn", "year"], how="left")\
            .fillna(0)

data.head()

Assets more then 0: 1300603
Employees no less than 5: 812136


100%|██████████| 14239/14239 [00:15<00:00, 893.64it/s]


632176


Unnamed: 0,inn,okved,year,tang_assets,assets,short_debt,revenue,opex,profit,long_debt,...,code_380,code_792,code_392,code_51,code_762,code_528,code_826,code_410,code_100,code_250
0,101000021,47.73,2008,308000.0,318000.0,95000.0,1943000.0,0.0,76000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,101000078,74.2,2006,896000.0,1131000.0,1000.0,2568000.0,0.0,314000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,101000078,74.2,2007,1146000.0,1618000.0,1000.0,3971000.0,0.0,542000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,101000078,74.2,2008,1492000.0,2022000.0,1000.0,6211000.0,0.0,487000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,101000430,42.11,2006,4218000.0,9369000.0,6157000.0,18020000.0,232000.0,-2232000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Сохраним данные для Stata
TO_DROP = [
    "inn",
    "tang_assets",
    "assets",
    "short_debt",
    "revenue",
    "opex",
    "profit",
    "long_debt",
    "debt",
    "empl",
    "num_countries",
    "num_countries_prev",
    "countries_diff",
    "okved"
]
to_study = data.assign(exporting=lambda x: 1 * (x["num_countries"] > 0))\
    .dropna()

inns = to_study["inn"].unique()

inns = pd.DataFrame(inns, columns=["inn"])\
        .reset_index().rename(columns={"index": "firm_id"})

to_study = to_study.merge(inns, on=["inn"], how="inner")\
            .drop(columns=TO_DROP)
print(len(to_study))

to_study.to_csv(os.path.join(data_dir, "testing/cur_spark_ruslana_advanced_v2_test.csv"), index=False)

632176


# Регрессии

In [31]:
top_cty_list = [398, 804, 156, 276, 440, 860, 428, 233,  31, 246, 498, 840, 616,
       417, 268, 380, 792, 392,  51, 762, 528, 826, 410, 100, 250]

In [32]:
data_path = os.path.join(data_dir, "testing/cur_spark_ruslana_advanced_v2_test.csv")

data = pd.read_csv(data_path)

singletons = data.groupby("firm_id").agg({"year": "count"})
singletons = singletons[singletons["year"] > 1].index

data = data[data["firm_id"].isin(singletons)]

In [33]:
target = "code_398"
reg = "iv_398"
controls = ['log_assets', 'tangibility', 'profitability', 'num_countries_prev_log']

cols = [reg] + controls
to_study = data.set_index(['firm_id', 'year'])[cols + [target, "leverage"]].dropna(subset=cols + [target, "leverage"])

exog = sm.add_constant(to_study[cols])
model = PanelOLS(to_study[target], exog, entity_effects=True)
result = model.fit(cov_type='robust')

result.summary

0,1,2,3
Dep. Variable:,code_398,R-squared:,0.0244
Estimator:,PanelOLS,R-squared (Between):,0.2276
No. Observations:,482987,R-squared (Within):,0.0244
Date:,"Sat, May 10 2025",R-squared (Overall):,0.1931
Time:,11:50:52,Log-likelihood,7.152e+05
Cov. Estimator:,Robust,,
,,F-statistic:,1494.2
Entities:,184200,P-value,0.0000
Avg Obs:,2.6221,Distribution:,"F(5,298782)"
Min Obs:,2.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,0.0044,0.0034,1.2826,0.1997,-0.0023,0.0111
iv_398,-0.0031,0.0049,-0.6292,0.5292,-0.0127,0.0065
log_assets,0.0005,0.0002,2.2822,0.0225,6.71e-05,0.0009
tangibility,0.0017,0.0012,1.3923,0.1638,-0.0007,0.0041
profitability,9.109e-08,4.835e-08,1.8841,0.0595,-3.667e-09,1.859e-07
num_countries_prev_log,0.1228,0.0067,18.195,0.0000,0.1096,0.1360


In [34]:
data.groupby("year")["code_398"].sum()

year
2006    2698.0
2007    2863.0
2008    2524.0
Name: code_398, dtype: float64