In [6]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

# Root folder holding every input/output data set used by this notebook.
# The location can be overridden without editing the notebook by setting the
# DIPLOMA_DATA_DIR environment variable; the original author's local path is
# kept as the fallback so existing runs behave exactly as before.
data_dir = os.environ.get("DIPLOMA_DATA_DIR", "/Users/mac/Desktop/Study/Diploma/data")

# Data Preparation

In [2]:
def return_unique(x):
    """Return how many distinct values the Series ``x`` holds.

    Missing values are not excluded: ``unique()`` reports NaN as a value of
    its own, so a group containing NaN is counted one higher than
    ``x.nunique()`` would give.
    """
    distinct_values = x.unique()
    return len(distinct_values)

In [22]:
# Input locations, all resolved against the notebook-wide `data_dir`.
spark_path = os.path.join(data_dir, "spark/nxt_spark_data.parquet")
ruslana_path = os.path.join(data_dir, "ruslana/ruslana.parquet")
gtd_path = os.path.join(data_dir, "gtd/gtd_processed")
iv_path = os.path.join(data_dir, "instrument/iv.parquet")

# SPARK table: lower-case the column names and drop rows whose `okved_four`
# is the literal string 'nan' or 'None'.
spark_df = pd.read_parquet(spark_path)
spark_df.columns = [item.lower() for item in spark_df.columns]
spark_df = spark_df.loc[~spark_df["okved_four"].isin(['nan', 'None'])]

# Instrument: sum of weight_c * tariff within each (okved_four, year) cell.
iv_df = pd.read_parquet(iv_path)\
        .assign(instrument=lambda x: x["weight_c"] * x["tariff"])
iv_df = iv_df.groupby(["okved_four", "year"]).instrument.sum().reset_index()

# GTD tables: number of distinct `code` values per (INN, year)
# (presumably country codes, given the `num_countries` name — TODO confirm).
#
# Each file is first reduced to its unique (INN, year, code) triples and the
# count is taken only AFTER concatenation. Counting per file and then
# concatenating would emit one row per file for a firm-year that appears in
# several files; those duplicate keys multiply rows in the left merge below
# and `drop_duplicates` would then keep an arbitrary partial count.
# The listing is sorted so the result does not depend on directory order.
tables = sorted(os.listdir(gtd_path))
gtd_parts = []
for table in tables:
    table_df = pd.read_parquet(os.path.join(gtd_path, table))
    gtd_parts.append(
        table_df.loc[table_df["INN"] > 100, ["INN", "year", "code"]]
        .drop_duplicates()
    )

gtd_df = pd.concat(gtd_parts)\
        .groupby(["INN", "year"]).agg({"code": return_unique})\
        .reset_index().rename(columns={"code": "num_countries"})
gtd_df.columns = [item.lower() for item in gtd_df.columns]

ruslana_df = pd.read_parquet(ruslana_path)

print(len(spark_df), len(gtd_df), len(ruslana_df), len(iv_df))

# Firm-year panel: SPARK x Ruslana (inner), plus GTD counts where available
# (left, so firm-years without a GTD match keep a missing `num_countries`),
# restricted to industry-years that have an instrument value (inner).
df = spark_df.merge(ruslana_df, on=["inn", "year"], how="inner")
print(len(df))
df = df.merge(gtd_df, on=["inn", "year"], how="left")
df = df.merge(iv_df, on=["okved_four", "year"], how="inner")
print(len(df))

# Keep a single row per firm-year (the first one encountered).
df = df.drop_duplicates(["inn", "year"])

print(len(df))

2335164 139420 3946725 2397
1858619
1798165
1314231


In [27]:
# Sample-restriction thresholds, named so that the printed labels are built
# from the values actually applied.
# NOTE(review): the provenance/units of the asset bounds are not visible in
# this notebook — confirm before changing them.
MIN_ASSETS = 2100
MAX_ASSETS = 871880628
MIN_EMPLOYEES = 3.0

filter_cond = (df.assets > 0.)

# Ratios are scaled by total assets, hence the strictly-positive-assets filter
# applied first.
data = df.loc[filter_cond]\
        .sort_values(by=["inn", "year"])\
        .assign(
            short_leverage=lambda x: x.short_debt / x.assets, 
            long_leverage=lambda x: x.long_debt / x.assets, 
            leverage=lambda x: x.debt / x.assets, 
            log_assets=lambda x: np.log(x.assets),
            tangibility=lambda x: x.tang_assets / x.assets, 
            profitability=lambda x: x.revenue / x.assets
        )

print("Assets more then 0: {}".format(len(data)))

filter_cond = (
    (data["short_leverage"] >= 0.) &
    (data["long_leverage"] >= 0.) &
    (data["leverage"] <= 1.) &
    (data.revenue > 0.0) &
    (data.assets >= MIN_ASSETS) &
    (data.assets < MAX_ASSETS) &
    (data.empl >= MIN_EMPLOYEES)
)

data = data.loc[filter_cond]
# The label is derived from the threshold; it previously read "5" while the
# filter used 3.
print("Employees no less than {:g}: {}".format(MIN_EMPLOYEES, len(data)))

# Calendar grid used to turn "previous observed row" into "previous year".
years = pd.DataFrame(np.arange(2004, 2010), columns=["year"])

# For every firm with at least one non-missing `num_countries`, lay its counts
# on the full year grid (missing years count as 0) and take the one-year lag.
export_data = []
for firm_inn, item_df in tqdm(data.loc[~data.num_countries.isnull()].groupby("inn")):
    # The right merge keeps one row per grid year, in grid order, so shift(1)
    # is the previous calendar year. The zero-filled rows are kept and stamped
    # with the firm's INN: a year in which the firm is in `data` but has no
    # recorded exports must still receive last year's count when merged back,
    # instead of being treated as "never exported".
    item_df = item_df[["year", "num_countries"]]\
                .merge(years, on="year", how="right")\
                .assign(
                    inn=firm_inn,
                    num_countries=lambda x: x.num_countries.fillna(0.0), 
                    num_countries_prev=lambda x: x.num_countries.shift(1),
                )

    export_data.append(item_df)

export_data = pd.concat(export_data)\
        .assign(
            countries_diff=lambda x: x.num_countries - x.num_countries_prev,
        )[["inn", "year", "num_countries_prev", "countries_diff"]]

# Firms with no export history at all (and the first grid year, which has no
# lag) end up with missing values here and are set to 0.
data = data.merge(export_data, on=["inn", "year"], how="left")\
        .assign(
            num_countries_prev=lambda x: x.num_countries_prev.fillna(0.0),
            num_countries_prev_log=lambda x: np.log(1 + x.num_countries_prev),
            countries_diff=lambda x: x.countries_diff.fillna(0.0), 
            # 1 when the count rose relative to the previous year, else 0.
            exposure=lambda x: 1 * (x.countries_diff > 0.0)
        )

# Keep years after 2005 and rescale the instrument by 1/100.
data = data[data.year > 2005]\
        .assign(instrument=lambda x: x.instrument / 100.)

print(len(data))

data.head()

Assets more then 0: 1309533
Employees no less than 5: 898991


100%|██████████| 14591/14591 [00:16<00:00, 896.93it/s]


700601


Unnamed: 0,inn,okved,year,tang_assets,assets,short_debt,revenue,opex,profit,long_debt,...,short_leverage,long_leverage,leverage,log_assets,tangibility,profitability,num_countries_prev,countries_diff,num_countries_prev_log,exposure
0,101000021,47.73,2008,308000.0,318000.0,95000.0,1943000.0,,76000.0,0.0,...,0.298742,0.0,0.298742,12.669807,0.968553,6.110063,0.0,0.0,0.0,0
2,101000078,74.2,2006,896000.0,1131000.0,1000.0,2568000.0,,314000.0,0.0,...,0.000884,0.0,0.000884,13.938613,0.792219,2.270557,0.0,0.0,0.0,0
3,101000078,74.2,2007,1146000.0,1618000.0,1000.0,3971000.0,,542000.0,0.0,...,0.000618,0.0,0.000618,14.296701,0.708282,2.454265,0.0,0.0,0.0,0
4,101000078,74.2,2008,1492000.0,2022000.0,1000.0,6211000.0,,487000.0,0.0,...,0.000495,0.0,0.000495,14.519598,0.737883,3.071711,0.0,0.0,0.0,0
6,101000430,42.11,2006,4218000.0,9369000.0,6157000.0,18020000.0,232000.0,-2232000.0,0.0,...,0.657167,0.0,0.657167,16.052917,0.450208,1.923364,0.0,0.0,0.0,0


In [28]:
# Save the data for Stata
# Columns written to the CSV (after the generated `firm_id`); rows missing any
# of them, or `inn`, are dropped before export.
COLS = [
    "year",
    "okved_four",
    "exporting",
    "exposure",
    "log_assets",
    "tangibility",
    "profitability",
    "num_countries_prev_log",
    "instrument",
    "leverage",
    "short_leverage",
    "long_leverage"
]

# `exporting` is 1 when `num_countries` is positive and 0 otherwise
# (a missing count compares as not-greater-than-zero, hence 0).
to_study = data.assign(exporting=lambda x: 1 * (x["num_countries"] > 0))
to_study = to_study.dropna(subset=["inn"] + COLS)

# Dense firm identifier: position of each INN in order of first appearance.
inns = (
    pd.DataFrame({"inn": to_study["inn"].unique()})
    .rename_axis("firm_id")
    .reset_index()
)

to_study = to_study.merge(inns, on=["inn"], how="inner")
print(len(to_study))

out_path = os.path.join(data_dir, "testing/cur_spark_ruslana_v1_test.csv")
to_study[["firm_id"] + COLS].to_csv(out_path, index=False)

700150
