In [1]:
import ast
import json
import os
import re

import pandas as pd

from constants import *

In [2]:
# Build the second-level category lookup and store it as JSON.
# Outer keys are the positions of FIRST_LEVEL's values; each inner dict maps
# the position of an element inside that value to the element itself.
# (json.dump serializes the integer keys as strings.)
# presumably each FIRST_LEVEL value is a list of second-level category names — TODO confirm
res = {
    group_idx: {item_idx: item for item_idx, item in enumerate(group)}
    for group_idx, group in enumerate(FIRST_LEVEL.values())
}

with open('zeroshot/data/zeroshot_topics_2.json', mode='w', encoding='utf-8') as f:
    json.dump(res, f, indent=4, ensure_ascii=False)

## Создание json

In [12]:
def clean_text(text):
    """Normalize a raw text cell into a single-spaced string.

    Parameters
    ----------
    text : str or scalar
        Raw cell value. Missing values (``None``, ``NaN``, ``pd.NA``) are
        accepted. Other non-string scalars are converted with ``str()``
        instead of failing inside ``re.sub``.

    Returns
    -------
    str
        ``""`` for missing input; otherwise the text with every
        ``Unknown node type: <word>`` marker removed and every run of
        whitespace collapsed to one space (leading/trailing whitespace
        stripped).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # e.g. a purely numeric cell: keep its textual form rather than crash
        text = str(text)
    text = re.sub(r'Unknown node type: \w+', '', text)
    # split() with no arguments splits on any whitespace run and drops empties,
    # so this also cleans up the gap left by a removed marker
    return ' '.join(text.split())

def parse_keywords(kw_str):
    """Convert a serialized keywords cell into a list.

    Parameters
    ----------
    kw_str : str, list, tuple or scalar
        Usually the string form of a Python list (``"['a', 'b']"``).

    Returns
    -------
    list
        * ``[]`` for missing values and empty / whitespace-only strings;
        * the parsed list when the string is a list literal (a tuple literal
          is converted to a list, a set literal to a list sorted by ``str``);
        * ``[kw_str]`` when the string is not a Python literal, or is a
          literal of another kind (number, dict, quoted string, ...), so the
          result is always a list;
        * a list copy when the input already is a list or tuple;
        * ``[str(kw_str)]`` for any other non-string scalar.
    """
    # Already a sequence: pd.isna() would return an array here, whose truth
    # value is ambiguous, so handle it before the missing-value check.
    if isinstance(kw_str, (list, tuple)):
        return list(kw_str)
    if pd.isna(kw_str):
        return []
    if not isinstance(kw_str, str):
        return [str(kw_str)]
    if kw_str.strip() == "":
        return []
    try:
        parsed = ast.literal_eval(kw_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (plain text): keep it as a single keyword.
        return [kw_str]
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    if isinstance(parsed, set):
        # sets are unordered; sort for a reproducible, JSON-serializable result
        return sorted(parsed, key=str)
    # A literal that is not a collection (e.g. "2020" -> int): treat the raw
    # string as one keyword instead of leaking a non-list value.
    return [kw_str]
    
def prepare_json(df, filename, root, columns_rename=COLUMNS_RENAME):
    """Export selected columns of ``df`` as a JSON array of records.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain every key of ``columns_rename``.
    filename : str
        Output file name without the ``.json`` extension.
    root : str
        Prefix placed directly in front of ``filename`` (plain string
        concatenation), so a directory must include its trailing separator.
    columns_rename : dict
        Mapping ``source column -> output key``; only these columns are
        exported. After renaming, the frame must contain ``'abstract'`` and
        ``'keywords'`` columns.

    Returns
    -------
    str
        Status message naming the written file.

    Notes
    -----
    * The target directory is created when it does not exist yet.
    * ``NaN`` values remaining in any exported column are written as JSON
      ``null``; a bare ``NaN`` token is not valid JSON.
    """
    df_json = df[list(columns_rename.keys())].rename(columns=columns_rename)
    df_json['abstract'] = df_json['abstract'].apply(clean_text)
    df_json['keywords'] = df_json['keywords'].apply(parse_keywords)

    # `value != value` is true only for NaN
    records = [
        {
            key: (None if isinstance(value, float) and value != value else value)
            for key, value in record.items()
        }
        for record in df_json.to_dict(orient='records')
    ]
    print(f'Размер {root}{filename} df: {df_json.shape[0]}')

    out_path = f'{root}{filename}.json'
    out_dir = os.path.dirname(out_path)
    if out_dir:  # empty when writing into the current working directory
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)

    return f'{root}{filename}.json успешно сохранен'

In [13]:
# Load the cleaned dataset and split it on the 'first_OECD' column:
# rows equal to 'нет' go to the train export, all other rows to the test export.
# presumably 'нет' ("no") marks rows without a first-level OECD label — TODO confirm
df = pd.read_csv('zeroshot/data/final_clean.csv')

df_train = df.loc[df['first_OECD'].eq('нет')].copy()
df_test = df.loc[df['first_OECD'].ne('нет')].copy()

print(prepare_json(df_train, root='zeroshot/data/train/', filename='train'))
print(prepare_json(df_test, root='zeroshot/data/test/', filename='test'))

Размер zeroshot/data/train/train df: 2859
zeroshot/data/train/train.json успешно сохранен
Размер zeroshot/data/test/test df: 1306
zeroshot/data/test/test.json успешно сохранен


In [None]:
# Per-category export: for every key of FIRST_LEVEL (numbered by position)
# write train/<i>.json and test/<i>.json.
#   train: rows predicted as this category ('first_OECD_pred') whose 'OECD' is 'нет'
#   test:  rows whose 'first_OECD' equals this category
# NOTE(review): the recorded output shows 0 rows for every train split. Check
# whether 'first_OECD_pred' is populated in final_clean.csv and whether the
# 'OECD' filter should be 'first_OECD', as in the split cell above — TODO confirm.
df = pd.read_csv('zeroshot/data/final_clean.csv')
for i, k in enumerate(FIRST_LEVEL):
    df_train = df.loc[df['first_OECD_pred'].eq(k) & df['OECD'].eq('нет')].copy()
    df_test = df.loc[df['first_OECD'].eq(k)].copy()

    print(prepare_json(df_train, root='zeroshot/data/train/', filename=str(i)))
    print(prepare_json(df_test, root='zeroshot/data/test/', filename=str(i)))

Размер zeroshot/data/train/0 df: 0
zeroshot/data/train/0.json успешно сохранен
Размер zeroshot/data/test/0 df: 222
zeroshot/data/test/0.json успешно сохранен
Размер zeroshot/data/train/1 df: 0
zeroshot/data/train/1.json успешно сохранен
Размер zeroshot/data/test/1 df: 79
zeroshot/data/test/1.json успешно сохранен
Размер zeroshot/data/train/2 df: 0
zeroshot/data/train/2.json успешно сохранен
Размер zeroshot/data/test/2 df: 52
zeroshot/data/test/2.json успешно сохранен
Размер zeroshot/data/train/3 df: 0
zeroshot/data/train/3.json успешно сохранен
Размер zeroshot/data/test/3 df: 9
zeroshot/data/test/3.json успешно сохранен
Размер zeroshot/data/train/4 df: 0
zeroshot/data/train/4.json успешно сохранен
Размер zeroshot/data/test/4 df: 840
zeroshot/data/test/4.json успешно сохранен
Размер zeroshot/data/train/5 df: 0
zeroshot/data/train/5.json успешно сохранен
Размер zeroshot/data/test/5 df: 104
zeroshot/data/test/5.json успешно сохранен
