In [183]:
import os
import sys
from functools import partial

import numpy as np
import pandas as pd

# Project root: data paths are resolved under it, and it is put on sys.path so the
# local `py_scripts` package import that follows can be found.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
root_dir = "/Users/mac/Desktop/Study/Diploma"
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

from py_scripts.process_countries import prepare_table

In [73]:
def _add_share(frame, keys, name):
    """Return `frame` with column `name` = value / (sum of value within each `keys` group).

    A group whose total value is 0 yields NaN (0 / 0), the same as dividing by a
    merged-in group total would. Rows are neither dropped nor reordered, so this
    matches an inner merge on `keys` only while the key columns contain no NaN.
    """
    group_total = frame.groupby(keys)["value"].transform("sum")
    return frame.assign(**{name: frame["value"] / group_total})


# Single source of truth for the year: used for the customs file name and the SPARK filter.
YEAR = 2005

spark_path = os.path.join(root_dir, "data/spark", "cur_spark_data.parquet")
customs_path = os.path.join(root_dir, "data/gtd/gtd_processed", f"gtd{YEAR}.parquet")

spark_cols = ["INN", "okved_four", "Year"]

spark_df = pd.read_parquet(spark_path)
customs_df = pd.read_parquet(customs_path)

# Keep this year's SPARK rows, dropping those whose okved_four is the literal
# string 'nan' or 'None'; only the columns needed for the join are retained.
spark_df = spark_df.loc[
    (spark_df["Year"] == YEAR) & ~spark_df["okved_four"].isin(["nan", "None"]),
    spark_cols,
]

# Customs rows without a product code cannot be attributed to a product.
customs_df = customs_df.loc[customs_df["product"].notna()]

print(len(spark_df), len(customs_df))

# NOTE(review): if SPARK holds several rows per INN for this year, the join repeats
# customs rows and inflates the sums below — confirm INN is unique per year.
df = pd.merge(customs_df, spark_df, on=["INN"], how="inner")

# Total value per (okved_four, product, code); groupby drops NaN keys by default,
# so the key columns are non-null from here on.
df = df.groupby(["okved_four", "product", "code"]).agg({"value": "sum"}).reset_index()

# weight:   row value as a share of the okved_four total.
# weight_c: row value as a share of the (okved_four, code) total.
df = _add_share(df, ["okved_four"], "weight")
df = _add_share(df, ["okved_four", "code"], "weight_c")

613184 1758505


In [212]:
# Display the resolved path of the customs parquet file.
customs_path

'/Users/mac/Desktop/Study/Diploma/data/gtd/gtd_processed/gtd2005.parquet'

In [217]:
# Display the (okved_four, product, code) value table with its weight / weight_c shares.
df

Unnamed: 0,okved_four,product,code,value,weight,weight_c
0,01.11,10110.0,804,5770.0,0.000277,0.020314
1,01.11,10190.0,804,31101.0,0.001495,0.109495
2,01.11,10210.0,496,7881.0,0.000379,0.953193
3,01.11,10410.0,496,387.0,0.000019,0.046807
4,01.11,30212.0,156,2750.0,0.000132,0.008326
...,...,...,...,...,...,...
102559,96.09,700992.0,428,2572.0,0.022189,1.000000
102560,96.09,720449.0,156,38112.0,0.328798,0.352429
102561,96.09,730210.0,156,43669.0,0.376739,0.403815
102562,96.09,732690.0,156,8655.0,0.074668,0.080034


In [77]:
# Rows whose okved_four is "28.92", ordered by ascending value.
df.loc[df["okved_four"] == "28.92"].sort_values("value")

Unnamed: 0,okved_four,product,code,value,weight,weight_c
37095,28.92,870891.0,348,0.0,0.000000,0.000000
35791,28.92,681510.0,348,0.0,0.000000,0.000000
35832,28.92,730799.0,348,0.0,0.000000,0.000000
36640,28.92,848140.0,348,0.0,0.000000,0.000000
36894,28.92,851150.0,348,0.0,0.000000,0.000000
...,...,...,...,...,...,...
36609,28.92,847990.0,804,7992741.0,0.029786,0.152473
36375,28.92,842911.0,398,10249108.0,0.038195,0.170749
36455,28.92,843120.0,826,12143075.0,0.045253,0.996955
36495,28.92,843149.0,250,15346315.0,0.057190,0.994004


In [197]:
def return_unique(x):
    """Count the distinct values in `x`; missing values are counted, not dropped."""
    return x.nunique(dropna=False)

# Number of top-ranked codes kept in `best_countries`.
N_TOP_COUNTRIES = 25

WITS_path = os.path.join(root_dir, "data/countries", "WITS_codes.xlsx")
RUS_path = os.path.join(root_dir, "data/countries", "rus_countries.csv")
codes = prepare_table(WITS_path, RUS_path)

# Number of distinct INN values per `code`, largest first.
table = (
    customs_df.groupby("code")
    .agg({"INN": return_unique})
    .reset_index()
    .sort_values(by="INN", ascending=False)
)

# Take the top codes BEFORE attaching the lookup: here every code appears exactly
# once, whereas a lookup table with repeated codes would duplicate rows in the
# merge and put the same code into the list more than once.
best_countries = table["code"].head(N_TOP_COUNTRIES).tolist()

# Left join keeps every ranked code even when the lookup has no entry for it.
table = table.merge(codes, on="code", how="left")

In [205]:
def dummy_country(x, code=398):
    """Return 1 if any element of `x` equals `code`, otherwise 0."""
    matches = x == code
    return 1 * np.any(matches)

# One (column label, aggregator) pair per code in `best_countries`; each aggregator
# is `dummy_country` with `code` bound, so it returns 1/0 for "this group has the code".
funcs = [(f"{code}", partial(dummy_country, code=code)) for code in best_countries]

# Aggregating {"code": funcs} per INN yields two-level columns ("code", "<label>").
result = customs_df.groupby("INN").agg({"code": funcs})

# Flatten the two levels into single names of the form "code_<label>".
result.columns = ["_".join(item) for item in result.columns]

# Turn INN back into a regular column.
result = result.reset_index()

In [207]:
# Display the per-INN frame of 0/1 indicator columns (one `code_<code>` column per top code).
result

Unnamed: 0,INN,code_398,code_804,code_156,code_276,code_440,code_860,code_428,code_233,code_31,...,code_380,code_792,code_392,code_51,code_762,code_528,code_826,code_410,code_100,code_250
0,0,1,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1,1,1,0,1,0,1,0,1,1,...,0,0,0,0,1,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,104003856,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,105000304,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28161,861500896274,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28162,861503688924,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28163,862000073193,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28164,891300277300,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# NOTE(review): `result` is rebound here from the per-INN indicator DataFrame built
# earlier to a plain list, so that frame is no longer reachable under this name.
result = []
years = [2005, 2006, 2007, 2008, 2009]

# NOTE(review): every pass appends the same `df` (built above from the 2005 customs
# and SPARK data); only the `current_year` tag changes. Presumably a placeholder
# until per-year data is loaded — confirm before relying on the output.
for year in years:
    result.append(df.assign(current_year=year))