In [5]:
import pandas as pd
import pyarrow
from sklearn.model_selection import train_test_split
from pathlib import Path
import gc

In [7]:
# Per-chunk pass: reduce every chunk file to its level-2 classifier code and
# write that single column to its own small parquet file.
for n in range(30000, 270001, 30000):  # numeric suffixes of the chunk files
    print(f'Processing chunk ruslawod{n}.parquet ...')

    # Only `classifierByIPS` is needed below, so load just that column instead
    # of the whole chunk. pandas asks pyarrow to honour the stored pandas
    # metadata, so the chunk's saved index is still restored.
    df = pd.read_parquet(
            f'ruslawod{n}.parquet',
            engine="pyarrow",
            columns=['classifierByIPS']
        )

    # `classifierByIPS` is split at the first '$'; the left part is the code.
    # Taking element 0 of the split (instead of expanding into two columns)
    # also works for a chunk in which no value contains '$'.
    codes = df['classifierByIPS'].str.split('$', n=1).str[0].rename('classifier_code')
    print(codes.value_counts())

    # Full codes vary too much, so classes are defined by the first two
    # three-digit groups of the code; codes that do not match become NaN.
    level2 = codes.str.extract(r'^(\d{3}\.\d{3})', expand=False).rename('classifier_level2')

    # Check distribution
    print(level2.value_counts())

    # keep the chunk's index together with classifier_level2
    level2.to_frame().to_parquet(f'classifier_code_{n}.parquet')

    # free the chunk before the next iteration
    del df
    del codes
    del level2
    gc.collect()

Processing chunk ruslawod30000.parquet ...
classifier_code
010.140.020.010.000    5575
010.140.030.010.000    4058
210.020.010.010.000     874
010.140.040.045.040     528
210.020.010.020.000     458
                       ... 
030.150.150.000.000       1
010.090.050.090.084       1
210.010.010.200.154       1
020.010.040.100.102       1
010.090.050.010.195       1
Name: count, Length: 1641, dtype: int64
classifier_level2
010.140    10832
010.070     3231
210.010     1668
210.020     1346
020.010     1233
           ...  
040.060        1
200.060        1
200.190        1
050.020        1
100.140        1
Name: count, Length: 139, dtype: int64
Processing chunk ruslawod60000.parquet ...
classifier_code
010.140.020.010.000    5660
010.140.030.010.000    4062
210.020.010.010.000     815
010.140.040.045.040     484
210.020.010.020.000     451
                       ... 
020.010.040.060.050       1
210.010.000.000.000       1
020.010.040.100.055       1
210.010.010.200.040       1
160.030.05

In [8]:
# load the last chunk, which the range() loop over 30000..270000 does not cover
df = pd.read_parquet('ruslawod281233.parquet', engine="pyarrow")

In [9]:
# classifierByIPS is split at the first '$':
# left part -> classifier_code, right part -> classifier_name
code_and_name = df['classifierByIPS'].str.split('$', n=1, expand=True)
df[['classifier_code', 'classifier_name']] = code_and_name

In [10]:
# frequency of every full classifier code in this chunk
df.classifier_code.value_counts()

classifier_code
010.140.020.010.000    2124
010.140.030.010.000    1515
210.020.010.010.000     309
210.020.010.020.000     185
010.140.040.045.040     158
                       ... 
060.020.180.060.000       1
010.090.040.900.113       1
020.010.040.100.139       1
090.010.130.020.000       1
010.090.040.020.260       1
Name: count, Length: 1061, dtype: int64

In [11]:
# Full codes are too fine-grained, so classes are defined by the first two
# three-digit groups of the code (e.g. "010.140").
level2_codes = df['classifier_code'].str.extract(r'^(\d{3}\.\d{3})')
df['classifier_level2'] = level2_codes

# distribution of the coarser classes
df.classifier_level2.value_counts()


classifier_level2
010.140    4031
010.070    1199
210.010     658
210.020     499
020.010     420
           ... 
160.030       1
010.130       1
090.080       1
150.070       1
150.010       1
Name: count, Length: 107, dtype: int64

In [12]:
# persist this chunk reduced to its index plus the classifier_level2 column
classifier_code = df.loc[:, ['classifier_level2']]
classifier_code.to_parquet(path='classifier_code_281233.parquet')

In [20]:
# Concatenate the per-chunk classifier_code_*.parquet files into one dataset.
DATA_DIR = Path(".")
PATTERN = "classifier_code_*.parquet"


def _chunk_order(path):
    """Sort key for chunk files: numeric suffix first (30000 before 120000), any other name after."""
    suffix = path.stem.rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), "")
    return (1, 0, path.name)


# Path.glob() yields files in no guaranteed order, and ignore_index=True below
# replaces every chunk's own index with a position in the concatenation.
# Reading the chunks in numeric order makes those positions reproducible.
# NOTE(review): this presumes the numeric suffix reflects the chunk's position
# in the original dataset - confirm against how the chunks were produced.
chunk_paths = sorted(DATA_DIR.glob(PATTERN), key=_chunk_order)
if not chunk_paths:
    # fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"no files matching {PATTERN!r} in {DATA_DIR.resolve()}")

# a comprehension keeps the loop variable local, so the global `df` is not overwritten
dfs = [pd.read_parquet(chunk_path, engine="pyarrow") for chunk_path in chunk_paths]
df_full = pd.concat(dfs, ignore_index=True)

In [21]:
# summarise the concatenated frame: number of rows, dtype and non-null count per column
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 281233 entries, 0 to 281232
Data columns (total 1 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   classifier_level2  227290 non-null  object
dtypes: object(1)
memory usage: 2.1+ MB


In [22]:
# class frequencies over all chunks; 199 distinct level-2 codes overall
df_full['classifier_level2'].value_counts()

classifier_level2
010.140    101063
010.070     30507
210.010     16376
210.020     12151
020.010     11105
            ...  
040.020         1
060.000         1
090.110         1
110.080         1
070.090         1
Name: count, Length: 199, dtype: int64

In [23]:
# (rows, columns) of the concatenated frame, rows without a level-2 code still included
df_full.shape

(281233, 1)

In [24]:
# keep only the rows that actually have a level-2 code
has_level2 = df_full['classifier_level2'].notna()
df_full = df_full[has_level2]

In [25]:
# (rows, columns) once rows with a missing classifier_level2 have been removed
df_full.shape

(227290, 1)

In [26]:
# Codes that occur only once are treated as outliers and removed.
code_counts = df_full.classifier_level2.value_counts()
valid_codes = code_counts.loc[code_counts.ge(2)].index
is_valid_row = df_full.classifier_level2.isin(valid_codes)
df_filtered = df_full.loc[is_valid_row]

# Reduce to 50k rows so the dataset is quick to work with; stratifying on
# classifier_level2 keeps the class distribution of the filtered data.
df_sampled, _ = train_test_split(
    df_filtered,
    random_state=42,
    stratify=df_filtered.classifier_level2,
    train_size=50000,
)

In [27]:
# report the sample's shape, then the relative frequency of every class
print(
    f"Финальный размер выборки: {df_sampled.shape}",
    f"Распределение классов:\n{df_sampled['classifier_level2'].value_counts(normalize=True)}",
    sep="\n",
)

Финальный размер выборки: (50000, 1)
Распределение классов:
classifier_level2
010.140    0.44468
010.070    0.13424
210.010    0.07206
210.020    0.05346
020.010    0.04886
            ...   
080.090    0.00002
200.000    0.00002
160.020    0.00002
070.050    0.00002
120.070    0.00002
Name: proportion, Length: 169, dtype: float64


In [30]:
# sanity check: is any cell of the sample missing?
df_sampled.isna().to_numpy().any()

np.False_

In [32]:
# write the 50k-row sample (its index labels plus classifier_level2) to disk
df_sampled.to_parquet(path='sample_50k.parquet')