In [1]:
import pandas as pd
import pyarrow
from sklearn.model_selection import train_test_split
from pathlib import Path

In [None]:
# Load a single chunk of the source dataset.
# The file name has to be edited by hand for every chunk that is processed.
df = pd.read_parquet(path='dataset/ruslawod30000.parquet', engine="pyarrow")

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/ruslawod300000.parquet'

In [None]:
# Split "classifierByIPS" at the first '$': the part before it becomes
# classifier_code, the remainder becomes classifier_name.
df[['classifier_code', 'classifier_name']] = (
    df['classifierByIPS']
    .str.split(pat='$', n=1, expand=True)
)

In [None]:
# Show how many rows carry each distinct full classifier code.
df['classifier_code'].value_counts()

In [None]:
# Full classifier codes vary too much to serve as labels, so the label is the
# leading "NNN.NNN" prefix of the code (its first two three-digit groups).
# Codes that do not start with that pattern get a missing value.
df['classifier_level2'] = df['classifier_code'].str.extract(
    r'^(\d{3}\.\d{3})', expand=False
)

# Inspect the resulting label distribution
df['classifier_level2'].value_counts()


classifier_level2
010.140    4031
010.070    1199
210.010     658
210.020     499
020.010     420
Name: count, dtype: int64

In [None]:
# Keep only the level-2 label column (together with the frame's index) for
# this chunk and write it to its own file; the file name is set per chunk.
classifier_code = df.loc[:, ['classifier_level2']]
classifier_code.to_parquet(path='dataset/classifier_code_281233.parquet')

In [None]:
# Combine every per-chunk label file into a single frame.
DATA_DIR = Path("dataset/")
PATTERN = "classifier_code_*.parquet"


def _chunk_sort_key(path):
    """Order chunk files by the number at the end of their file name.

    Files whose name does not end in ``_<digits>`` are placed after the
    numbered ones and ordered by name.
    """
    suffix = path.stem.rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), path.name)
    return (1, 0, path.name)


# Path.glob yields files in a filesystem-dependent order, and
# ignore_index=True below numbers the rows by concatenation order. Sorting the
# files makes the resulting index reproducible between runs and machines.
# NOTE(review): assumes the numeric suffix reflects the chunk's position in
# the source dataset - confirm against how the chunk files were named.
chunk_files = sorted(DATA_DIR.glob(PATTERN), key=_chunk_sort_key)
if not chunk_files:
    # Same exception type pd.concat would raise on an empty list, but with a
    # message that points at the actual cause.
    raise ValueError(f"No files matching {PATTERN!r} found in {DATA_DIR}")

# A comprehension keeps the per-file frames out of the notebook namespace, so
# the `df` used by the chunk-processing cells is not overwritten here.
dfs = [pd.read_parquet(f, engine="pyarrow") for f in chunk_files]
df_full = pd.concat(dfs, ignore_index=True)

In [13]:
# Print a summary of the combined frame: index, columns, non-null counts, dtypes.
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 227290 entries, 0 to 281232
Data columns (total 1 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   classifier_level2  227290 non-null  object
dtypes: object(1)
memory usage: 3.5+ MB


In [None]:
# Rows per level-2 class over the combined data (199 distinct classes overall)
df_full['classifier_level2'].value_counts()

classifier_level2
010.140    101063
010.070     30507
210.010     16376
210.020     12151
020.010     11105
            ...  
040.020         1
060.000         1
090.110         1
110.080         1
070.090         1
Name: count, Length: 199, dtype: int64

In [None]:
# Keep only the rows that actually have a level-2 label.
df_full = df_full[df_full['classifier_level2'].notna()]

In [None]:
# Sampling configuration
MIN_CLASS_COUNT = 2   # classes seen fewer times than this are dropped
SAMPLE_SIZE = 50000   # target number of rows in the working sample
SEED = 42             # fixed seed so the sample is reproducible

# A class that occurs only once is treated as an outlier; it also could not be
# used for stratification, which needs at least two rows per class.
code_counts = df_full['classifier_level2'].value_counts()
valid_codes = code_counts[code_counts >= MIN_CLASS_COUNT].index
df_filtered = df_full[df_full['classifier_level2'].isin(valid_codes)]

# Work on a smaller sample to keep later steps fast. Stratifying by
# classifier_level2 keeps the class proportions of the filtered data.
if len(df_filtered) > SAMPLE_SIZE:
    df_sampled, _ = train_test_split(
        df_filtered,
        train_size=SAMPLE_SIZE,
        stratify=df_filtered['classifier_level2'],
        random_state=SEED,
    )
else:
    # train_test_split rejects a train_size that is not smaller than the
    # number of rows (e.g. when only some chunks have been processed so far),
    # so in that case the whole filtered frame is used as the sample.
    df_sampled = df_filtered.copy()

Финальный размер выборки: (50000, 1)
Распределение классов:
classifier_level2
010.140    0.44468
010.070    0.13424
210.010    0.07206
210.020    0.05346
020.010    0.04886
Name: proportion, dtype: float64


In [None]:
# Report the sample's shape and the share of each class within it.
print(
    f"Финальный размер выборки: {df_sampled.shape}",
    f"Распределение классов:\n{df_sampled['classifier_level2'].value_counts(normalize=True)}",
    sep="\n",
)

Финальный размер выборки: (50000, 1)
Распределение классов:
classifier_level2
010.140    0.44468
010.070    0.13424
210.010    0.07206
210.020    0.05346
020.010    0.04886
            ...   
080.090    0.00002
200.000    0.00002
160.020    0.00002
070.050    0.00002
120.070    0.00002
Name: proportion, Length: 169, dtype: float64


In [None]:
# Write the sampled rows (index plus level-2 code) to disk.
df_sampled.to_parquet(path='dataset/sample_50k.parquet')