In [2]:
import pandas as pd
import json
import ast
import re
from constants import * 

In [None]:
# Build the file with second-level categories: every entry of
# FIRST_LEVEL.values() is stored under its position, and its own items are
# in turn keyed by their position inside that entry.
res = {
    i: {j: value for j, value in enumerate(values)}
    for i, values in enumerate(FIRST_LEVEL.values())
}

with open('zeroshot/data/zeroshot_topics_2.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII text readable in the file.
    f.write(json.dumps(res, ensure_ascii=False, indent=4))

## Создание json

In [4]:
# Text cleanup
def clean_text(text):
    """Return `text` without parser artifacts and with normalized whitespace.

    Values that `pd.isna` reports as missing become an empty string.
    Otherwise every "Unknown node type: <word>" marker is removed and each
    run of whitespace is collapsed to a single space (leading and trailing
    whitespace is dropped).
    """
    if not pd.isna(text):
        without_markers = re.sub(r'Unknown node type: \w+', '', text)
        # split() with no arguments discards all whitespace runs, so the join
        # both collapses inner gaps and trims the ends.
        return ' '.join(without_markers.split())
    return ""

# Convert a keywords cell into a list
def parse_keywords(kw_str):
    """Parse a keywords cell into a list of keywords.

    Args:
        kw_str: Usually the string form of a Python list literal
            (e.g. "['A', 'B']"). Missing values, blank strings, already
            parsed lists/tuples, non-string scalars and free text are
            handled as well.

    Returns:
        list: Always a list.
            * missing or blank input -> []
            * list/tuple input or list/tuple literal -> its items
            * quoted string literal -> [that string]
            * anything that is not a usable literal -> [kw_str] (the raw
              text kept as a single keyword)
    """
    # Already-parsed sequences: pd.isna() returns an array for them, which
    # cannot be used in a boolean check, so handle them first.
    if isinstance(kw_str, (list, tuple)):
        return list(kw_str)
    if pd.isna(kw_str):
        return []
    if not isinstance(kw_str, str):
        # Non-string scalar (e.g. a number): keep it as one keyword.
        return [str(kw_str)]
    if kw_str.strip() == "":
        return []

    try:
        parsed = ast.literal_eval(kw_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal: treat the raw text as a single keyword.
        return [kw_str]

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    if isinstance(parsed, str):
        return [parsed]
    # A literal of another type (number, dict, None, ...) is not a keyword
    # list; fall back to the raw text so the result is still a list.
    return [kw_str]
    
# Build and save a correctly structured JSON file
def prepare_json(df, filename, root, columns_rename=COLUMNS_RENAME):
    """Export selected columns of `df` as a JSON list of records.

    The columns named by the keys of `columns_rename` are selected and
    renamed to the corresponding values. After renaming, the 'abstract'
    column is cleaned with `clean_text` and the 'keywords' column is parsed
    with `parse_keywords`, so both columns must exist after the rename.
    The records are written to f'{root}{filename}.json' as UTF-8 with
    non-ASCII characters kept as-is.

    Args:
        df: Source DataFrame containing every key of `columns_rename`.
        filename: Output file name without the '.json' extension.
        root: Output path prefix; it is concatenated with `filename`
            directly, so it must end with a path separator.
        columns_rename: Mapping of source column name -> output field name.

    Returns:
        str: A status message naming the written file.

    Raises:
        KeyError: If `df` lacks one of the columns in `columns_rename`.
    """
    from pathlib import Path  # local import keeps this block self-contained

    df_json = df[list(columns_rename.keys())].rename(columns=columns_rename)
    df_json['abstract'] = df_json['abstract'].apply(clean_text)
    df_json['keywords'] = df_json['keywords'].apply(parse_keywords)

    records = df_json.to_dict(orient='records')
    print(f'Размер {root}{filename} df: {df_json.shape[0]}')

    out_path = f'{root}{filename}.json'
    # Create missing output directories so the export does not fail with
    # FileNotFoundError when the target folder has not been created yet.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)

    return f'{out_path} успешно сохранен'

### 1 level

In [4]:
# Level 1: split the cleaned dataset on the value of `first_OECD`.
df = pd.read_csv('zeroshot/data/final_clean.csv')

# Rows whose first_OECD equals 'нет' go to train; all other rows go to test.
no_label = df['first_OECD'].eq('нет')
df_train = df.loc[no_label].copy()
df_test = df.loc[~no_label].copy()

print(prepare_json(df_train, filename='train', root='zeroshot/data/train/'))
print(prepare_json(df_test, filename='test', root='zeroshot/data/test/'))

Размер zeroshot/data/train/train df: 2858
zeroshot/data/train/train.json успешно сохранен
Размер zeroshot/data/test/test df: 1303
zeroshot/data/test/test.json успешно сохранен


In [14]:
# Load instruction_full.csv and print its column summary
# (column names, non-null counts, dtypes, memory usage).
# NOTE: this rebinds the notebook-level name `df`, which other cells use for
# the frame read from final_clean.csv.
df = pd.read_csv('zeroshot/data/instruction_full.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   eLIBRARY ID          1303 non-null   int64 
 1   title_en             1303 non-null   object
 2   abstract_en          1303 non-null   object
 3   key_words_en         1303 non-null   object
 4   OECD                 1303 non-null   object
 5   first_OECD           1303 non-null   object
 6   predicted_classes    1303 non-null   object
 7   first_OECD_pred      1303 non-null   object
 8   first_sorted_preds   268 non-null    object
 9   OECD_pred            1196 non-null   object
 10  sorted_preds         199 non-null    object
 11  OECD_pred_2          1257 non-null   object
 12  secondary_OECD_pred  1303 non-null   object
dtypes: int64(1), object(12)
memory usage: 132.5+ KB


### 2 level

In [16]:
# Level 2: write one test file per first-level category. Each file is named
# after the category's position in FIRST_LEVEL.
df = pd.read_csv('zeroshot/data/final_clean.csv')
for i, k in enumerate(FIRST_LEVEL):
    # NOTE: the per-category train export is disabled. The disabled code
    # selected rows with secondary_OECD_pred == k and OECD == 'нет' and wrote
    # them to zeroshot/data/train/ under the same file name.
    df_test = df.loc[df['first_OECD'].eq(k)].copy()
    print(prepare_json(df_test, filename=str(i), root='zeroshot/data/test/'))

Размер zeroshot/data/test/0 df: 220
zeroshot/data/test/0.json успешно сохранен
Размер zeroshot/data/test/1 df: 78
zeroshot/data/test/1.json успешно сохранен
Размер zeroshot/data/test/2 df: 52
zeroshot/data/test/2.json успешно сохранен
Размер zeroshot/data/test/3 df: 9
zeroshot/data/test/3.json успешно сохранен
Размер zeroshot/data/test/4 df: 840
zeroshot/data/test/4.json успешно сохранен
Размер zeroshot/data/test/5 df: 104
zeroshot/data/test/5.json успешно сохранен


In [26]:
# Spot-check a single article by its eLIBRARY ID.
# pandas is already imported as `pd` in the notebook's first (imports) cell,
# so it is not re-imported here.
df = pd.read_csv('zeroshot/data/final_clean.csv')
df[df['eLIBRARY ID'] == 54167782]

Unnamed: 0,Ссылка на статью,DOI,eLIBRARY ID,article_type,year,language,title_en,key_words_en,abstract_en,authors_metadata_en,...,authors_metadata_ru,OECD,Название журнала,Издательство,ISSN,eISSN,Цитирования,first_OECD,text,num_tokens
1183,https://elibrary.ru/item.asp?id=54167782,10.3390/en16062652,54167782,статья в журнале - научная статья,2023,английский,ESTIMATION OF TAX EXPENDITURES STIMULATING THE...,"['TAX EXPENDITURES', 'TAX INCENTIVES', 'ENERGY...",The energy crisis caused by global structural ...,"[{'Author': 'TYURINA YU.', 'Institution': 'Fin...",...,,Computer and information sciences,ENERGIES,,1996-1073,1996-1073,7.0,Natural Sciences,ESTIMATION OF TAX EXPENDITURES STIMULATING THE...,463


In [28]:
# Spot-check another article by its eLIBRARY ID (uses the `df` already loaded).
df.loc[df['eLIBRARY ID'].eq(80969379)]

Unnamed: 0,Ссылка на статью,DOI,eLIBRARY ID,article_type,year,language,title_en,key_words_en,abstract_en,authors_metadata_en,...,authors_metadata_ru,OECD,Название журнала,Издательство,ISSN,eISSN,Цитирования,first_OECD,text,num_tokens
218,https://elibrary.ru/item.asp?id=80969379,,80969379,статья в журнале - научная статья,2025,английский,ANALYSIS OF GRAIN PRODUCTION IN THE REPUBLIC O...,"['AGRICULTURAL ECONOMICS', 'PROCESSING INDUSTR...",The relevance of this study is due to the grow...,"[{'Author': 'TSYPIN A.', 'Institution': 'Finan...",...,,Economics and business,"SCIENTIFIC PAPERS. SERIES: MANAGEMENT, ECONOMI...",,2284-7995,2285-3952,0.0,Social Sciences,ANALYSIS OF GRAIN PRODUCTION IN THE REPUBLIC O...,394
