In [56]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import math
import time
import random
import requests
import json
import ast

In [57]:
# Load the raw model table; the path is relative to the notebook's working directory.
df = pd.read_csv('../ai_model_data/raw_data/Model_06_15.csv')

In [61]:
# Show every model id in the table (long output: one entry per row of df).
df['id'].tolist()

['albert/albert-base-v1',
 'albert/albert-base-v2',
 'albert/albert-large-v1',
 'albert/albert-large-v2',
 'albert/albert-xlarge-v1',
 'albert/albert-xlarge-v2',
 'albert/albert-xxlarge-v1',
 'albert/albert-xxlarge-v2',
 'google-bert/bert-base-cased-finetuned-mrpc',
 'google-bert/bert-base-cased',
 'google-bert/bert-base-chinese',
 'google-bert/bert-base-german-cased',
 'google-bert/bert-base-german-dbmdz-cased',
 'google-bert/bert-base-german-dbmdz-uncased',
 'google-bert/bert-base-multilingual-cased',
 'google-bert/bert-base-multilingual-uncased',
 'google-bert/bert-base-uncased',
 'google-bert/bert-large-cased-whole-word-masking-finetuned-squad',
 'google-bert/bert-large-cased-whole-word-masking',
 'google-bert/bert-large-cased',
 'google-bert/bert-large-uncased-whole-word-masking-finetuned-squad',
 'google-bert/bert-large-uncased-whole-word-masking',
 'google-bert/bert-large-uncased',
 'almanach/camembert-base',
 'Salesforce/ctrl',
 'distilbert/distilbert-base-cased-distilled-squad

In [62]:
# Model repository to process, written as "<owner>/<name>".
search_name = "google-bert/bert-base-cased"
# Local folder name: the part after the owner prefix.
folder_name = search_name.split('/')[1]
# Row label of that model in df; .item() raises unless exactly one row matches.
index = df.index[df['id'].eq(search_name)].item()

### 문서 수집

In [63]:
# Fetch the model's Hub page; raise_for_status() turns an HTTP error status into an exception.
url = f'https://huggingface.co/{search_name}'
response = requests.get(url, timeout=2)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# exist_ok=True: re-running the cell does not fail when the folder is already there.
os.makedirs(f"./models/{folder_name}", exist_ok=True)
model_card_path = os.path.join('models', folder_name, 'page.txt')

if soup:
    # Persist the parsed document (as re-serialised by BeautifulSoup) for the later offline steps.
    with open(model_card_path, 'w', encoding='utf-8') as page_file:
        page_file.write(str(soup))

### ISO code 수집

In [64]:
# Collect the language codes listed in the second column of the Hub's languages table.
url = 'https://huggingface.co/languages'
# Without a timeout requests can wait indefinitely; without raise_for_status() an
# error page would be parsed silently and yield an empty or garbage code list.
response = requests.get(url, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# Kept for parity with the original cell; nothing visible in this notebook reads it.
table_card = soup.find_all('table')

language_iso_code = []
rows = soup.find_all('tr')

# rows[0] is skipped as the header row.
for table_row in tqdm(rows[1:], desc="Processing rows"):
    cells = table_row.find_all('td')
    # Rows with fewer than two cells have no code column; skip them instead of
    # failing with IndexError.
    if len(cells) < 2:
        continue
    iso_code = cells[1].text.strip()
    # An empty cell is not a usable code.
    if iso_code:
        language_iso_code.append(iso_code)

Processing rows: 100%|██████████| 4680/4680 [00:00<00:00, 43454.86it/s]


In [65]:
# Free-form library names used to recognise library tags.
# NOTE(review): presumably harvested from library_name values on the Hub, which would
# explain the duplicates in different casing and the comma-joined entries — confirm the source.
# Entries are lower-cased and '-' is replaced by ' ' before matching
# (see normalized_libraries_list1).
libraries_list1 = [
    "transformers", "sentence-transformers", "keras", "adapter-transformers", "speechbrain",
    "espnet", "spacy", "fasttext", "asteroid", "generic", "ml-agents", "stable-baselines3",
    "pytorch", "flair", "allennlp", "fairseq", "k2", "tensorflowtts", "fastai", "opennmt",
    "timm", "sklearn", "paddlenlp", "superb", "doctr", "mindspore", "pytorch-ie", "pyannote-audio",
    "pysentimiento", "stanza", "PyTorch", "nemo", "txtai", "fastpitch", "pytorch-lightning",
    "PyTorch Lightning", "diffusers", "sample-factory", "k2-sherpa", "JoeyNMT", "FastAI",
    "stable-diffusion", "tfhub", "pythae", "open_clip", "mlconsole", "cleanrl", "setfit",
    "span-marker", "Doc-UFCN", "fastMONAI", "EveryDream", "kenlm", "BERT", "paddle_nlp",
    "rl-algo-impls", "PyLaia", "lucidrains/gated-state-spaces-pytorch", "TTS", "yolov5",
    "onnx", "yolov6detect", "ultralytics", "skrl", "yolor", "xpmir", "tensorflow", "peft",
    "scvi-tools", "hezar", "ppdiffusers", "mbrl-lib", "aster", "gpt-neox", "archai", "deep-rl-course",
    "Spacy", "ggml", "ctranslate2", "pcdet", "KerasCV Stable Diffusion in Diffusers", "tf",
    "tdc", "transformers.js", "PaddleNLP", "Transformers", "parking-env", "bertopic", "zeroshot_classifier",
    "ctransformers", "KerasNLP", "yolo", "safetensors", "safe-rlhf", "torch", "minetest-baselines",
    "audiocraft", "Core ML", "easydel", "wildlife-datasets", "monai", "fairseq2", "mlc", "Pytorch",
    "output-small", "braindecode", "coqui", "optimum_neuronx", "yasep", "bitsandbytes, transformers, peft, accelerate, bitsandbytes, datasets, deepspeed, trl",
    "htrflow_core", "xmen", "CTranslate2", "muzero-general", "pufferlib", "pytorch_geometric",
    "scikit-learn", "textgen", "peft - PEFT 0.5.0", "mlx", "DanishFungi", "asteroid1111111111111111111111111111111",
    "pruna-engine", "llama.cpp", "seamless_communication", "Spacy Explosion", "femr", "https://github.com/monet-joe/Piano-Classification",
    "clmbr", "xtuner", "fire-and-smoke", "TraceBERT", "Megatron-LM", "mlx-llm", "RAGatouille",
    "clinicadl", "unity-sentis", "ExLlamaV2", "tokenizers", "aim", "Bunkatopics",
    "transformers, Unsloth, Peft, trl, accelerate, bitsandbytes", "Flax", "ml-aim", "hierarchy-transformers",
    "Keras", "peft,sfttrainer", "whisperkit", "nanotron", "trl", "LLaVA", "transformers, peft, torch",
    "OpenVINO", "transformers, peft", "fasttext, bert", "tensorflow, keras", "metavoice", "gguf",
    "https://github.com/ICE-PIXIU", "colbert-ai", "gliner", "sglang", "non", "mlx-image",
    "GGUF", "gemma.cpp", "gemma_torch", "keras-nlp", "jax", "atommic", "mobileclip",
    "transformers(OpenNMT)", "v-jepa", "Transformers PHP", "whisper", "grok", "clot",
    "transformers, pe", "keras3", "UniFormer", "ml-4m", "fla", "UniFormerV2",
    "UniDepth", "voicecraft", "michelangelo", "openlm", "recurrentgemma", "tflite",
    "ditto", "PyTorchModelHubMixin-template", "Haystack", "elm", "litgpt, transformers",
    "dust3r", "rasa", "BiRefNet", "en-tts", "my-meta-test", "Gen AI", "zho-tts",
    "transformers accelerate bitsandbytes", "pyannote", "llama-cpp", "torchvision",
    "timesfm", "torchtune", "big_vision", "llamafile", "light-embed", "transformers, PyTorch",
    "edsnlp", "ultralyticsplus", "unsloth", "gptq", "chat_tts", "delphi", "burial_mounds",
    "fugent", "Nvidia Nemo", "segmentation-models-pytorch", "executorch", "multimolecule"
]

# Second list of library names, written with display-style casing.
# NOTE(review): looks like the library filter labels shown on the Hub — confirm the source.
# Passed through create_case_insensitive_list to build normalized_libraries_list2.
libraries_list2 = [
    "PyTorch", "TensorFlow", "JAX", "Transformers", "Safetensors", "TensorBoard", "PEFT",
    "Diffusers", "GGUF", "stable-baselines3", "ONNX", "ml-agents", "sentence-transformers",
    "Keras", "Adapters", "setfit", "timm", "sample-factory", "Flair", "Transformers.js",
    "MLX", "spaCy", "fastai", "ESPnet", "Core ML", "OpenVINO", "NeMo", "Joblib", "Rust",
    "BERTopic", "TF Lite", "fastText", "OpenCLIP", "Scikit-learn", "PaddlePaddle", "speechbrain",
    "Fairseq", "Graphcore", "Asteroid", "AllenNLP", "Stanza", "llamafile", "SpanMarker",
    "paddlenlp", "Habana", "pyannote.audio", "pythae", "unity-sentis"
]

# Build a collection holding both the all-lowercase and the first-letter-capitalised
# spelling of every library name.
def create_case_insensitive_list(libraries):
    """Return a set with ``name.lower()`` and ``name.capitalize()`` for each name.

    :param libraries: iterable of library name strings
    :return: set of strings (despite the function name, the result is a set)
    """
    return {
        variant
        for name in libraries
        for variant in (name.lower(), name.capitalize())
    }

# Tags are looked up as tag.lower().replace('-', ' '), so each lookup set must hold
# keys in exactly that form.
normalized_libraries_list1 = set(lib.lower().replace('-', ' ') for lib in libraries_list1)
# create_case_insensitive_list keeps dashes, so a hyphenated list2 entry could never
# equal a dash-normalised lookup key. Add the dash-normalised spelling as well; the
# result is a superset of the previous set, so nothing that matched before stops matching.
normalized_libraries_list2 = create_case_insensitive_list(libraries_list2) | {
    lib.lower().replace('-', ' ') for lib in libraries_list2
}

# Create the output columns, initialised to empty strings.
# NOTE(review): df['library_name'] = '' also blanks any library_name values already
# present in the CSV — confirm that recomputing them from tags is intended.
df['library_name'] = ''
df['language'] = ''

# Select the row to process.
row = df.loc[index]

In [66]:
row

id                                     google-bert/bert-base-cased
author                                                 google-bert
created_at                               2022-03-02 23:29:04+00:00
last_modified                            2024-02-19 11:02:26+00:00
private                                                      False
gated                                                        False
disabled                                                       NaN
downloads                                                  4819762
likes                                                          231
library_name                                                      
tags             ['transformers', 'pytorch', 'tf', 'jax', 'safe...
pipeline_tag                                             fill-mask
arxiv                                             arxiv:1810.04805
dataset                                          dataset:wikipedia
region                                                   regio

In [67]:
# Split the selected row's tags into libraries, languages and remaining tags.
row = df.loc[index]

tags = row['tags']

# A string value is the repr of a list; parse it back into a real list.
if isinstance(tags, str):
    tags = ast.literal_eval(tags)

# A tag is a library when its normalised form is in either library set.
libraries = [
    tag
    for tag in tags
    if any(
        tag.lower().replace('-', ' ') in known_libraries
        for known_libraries in (normalized_libraries_list1, normalized_libraries_list2)
    )
]
df.at[index, 'library_name'] = ', '.join(libraries)

# Keep only the non-library tags and store them back as a string.
tags = [tag for tag in tags if tag not in libraries]
df.at[index, 'tags'] = str(tags)

# Tags that are language codes go to the 'language' column.
languages = [tag for tag in tags if tag in language_iso_code]
df.at[index, 'language'] = ', '.join(languages)

# Keep only the non-language tags and store them back as a string.
tags = [tag for tag in tags if tag not in languages]
df.at[index, 'tags'] = str(tags)

# Read the stored string back as a list for the de-duplication step.
tags = ast.literal_eval(df.at[index, 'tags'])

# Columns whose string values must not be repeated inside 'tags'.
compare_columns = [
    'id',
    'author',
    'created_at',
    'last_modified',
    'private',
    'gated',
    'disabled',
    'downloads',
    'likes',
    'library_name',
    'pipeline_tag',
    'arxiv',
    'dataset',
    'region',
]

def remove_duplicates(tags, compare_columns, row):
    """Return ``tags`` without entries that repeat a string value of ``row``.

    Every occurrence of a matching tag is dropped (the previous ``list.remove``
    dropped only the first one), and the input list is left untouched instead of
    being modified in place.

    :param tags: list of tag values
    :param compare_columns: keys of ``row`` whose values are compared against the tags
    :param row: mapping (dict or pandas Series) indexed by the names in ``compare_columns``
    :return: new list with the remaining tags in their original order
    :raises KeyError: if a name in ``compare_columns`` is missing from ``row``
    """
    # Only string values can equal a tag; numbers, booleans and NaN are ignored.
    # A tuple keeps the comparison equality-based, so unhashable tag values are fine.
    redundant_values = tuple(
        row[column] for column in compare_columns if isinstance(row[column], str)
    )
    return [tag for tag in tags if tag not in redundant_values]

# Drop tags that merely repeat a string value of this row's other columns.
# NOTE(review): `row` was read before library_name/language were written to df, so
# those two fields are presumably still empty in it — confirm this is acceptable.
tags = remove_duplicates(tags, compare_columns, row)

# Store the cleaned tag list back into df as its string representation.
df.at[index, 'tags'] = str(tags)


In [68]:
# Take the processed row out of df; the JSON export further down reads from loc_df.
loc_df = df.loc[index]

In [69]:
loc_df

id                                     google-bert/bert-base-cased
author                                                 google-bert
created_at                               2022-03-02 23:29:04+00:00
last_modified                            2024-02-19 11:02:26+00:00
private                                                      False
gated                                                        False
disabled                                                       NaN
downloads                                                  4819762
likes                                                          231
library_name     transformers, pytorch, tf, jax, safetensors, bert
tags             ['exbert', 'autotrain_compatible', 'endpoints_...
pipeline_tag                                             fill-mask
arxiv                                             arxiv:1810.04805
dataset                                          dataset:wikipedia
region                                                   regio

In [18]:
# Repeat of the single-row library/language split.
# NOTE(review): this cell does not assign `row`; it uses whatever `row` is currently
# bound in the kernel.
tags = row['tags']

# A string value is the repr of a list; parse it back into a real list.
if isinstance(tags, str):
    tags = ast.literal_eval(tags)

# A tag is a library when its normalised form is in either library set.
libraries = [
    tag
    for tag in tags
    if any(
        tag.lower().replace('-', ' ') in known_libraries
        for known_libraries in (normalized_libraries_list1, normalized_libraries_list2)
    )
]
df.at[index, 'library_name'] = ', '.join(libraries)

# Keep only the non-library tags and store them back as a string.
tags = [tag for tag in tags if tag not in libraries]
df.at[index, 'tags'] = str(tags)

# Tags that are language codes go to the 'language' column.
languages = [tag for tag in tags if tag in language_iso_code]
df.at[index, 'language'] = ', '.join(languages)

# Keep only the non-language tags and store them back as a string.
tags = [tag for tag in tags if tag not in languages]
df.at[index, 'tags'] = str(tags)


In [None]:
# Membership is tested once per tag per row; a set avoids scanning the full code list each time.
language_code_set = set(language_iso_code)

# Pass 1: move library tags from 'tags' into 'library_name'.
for ind, row in tqdm(df.iterrows(), desc="Processing libraries", total=len(df)):
    raw_tags = row['tags']
    # Accept both the string form and an already-parsed list, so the cell can be re-run.
    tags = ast.literal_eval(raw_tags) if isinstance(raw_tags, str) else list(raw_tags)
    libraries = [
        tag for tag in tags
        if tag.lower().replace('-', ' ') in normalized_libraries_list1
        or tag.lower().replace('-', ' ') in normalized_libraries_list2
    ]
    df.at[ind, 'library_name'] = ', '.join(libraries)
    # Write straight to df: the `row` yielded by iterrows is a copy, so assigning
    # into it (as the previous version did) never reached the frame.
    df.at[ind, 'tags'] = str([tag for tag in tags if tag not in libraries])

# Pass 2: move language-code tags from 'tags' into 'language'.
for ind, row in tqdm(df.iterrows(), desc="Processing languages", total=len(df)):
    raw_tags = row['tags']
    tags = ast.literal_eval(raw_tags) if isinstance(raw_tags, str) else list(raw_tags)
    languages = [tag for tag in tags if tag in language_code_set]
    df.at[ind, 'language'] = ', '.join(languages)
    df.at[ind, 'tags'] = str([tag for tag in tags if tag not in languages])

In [None]:
# Convert the 'tags' column from string form to real lists.
# Values that are already lists are passed through unchanged: ast.literal_eval only
# accepts strings (or AST nodes), so re-running this cell used to raise ValueError.
df['tags'] = df['tags'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Columns whose string values must not be repeated inside 'tags'.
compare_columns = [
    'id', 'author', 'created_at', 'last_modified', 'private', 'gated', 'disabled', 'downloads', 'likes','library_name', 'pipeline_tag', 'arxiv', 'dataset', 'region'
]

def remove_duplicates(tags, compare_columns, row):
    """Return ``tags`` without entries that repeat a string value of ``row``.

    Every occurrence of a matching tag is dropped (the previous ``list.remove``
    dropped only the first one), and the input list is left untouched instead of
    being modified in place.

    :param tags: list of tag values
    :param compare_columns: keys of ``row`` whose values are compared against the tags
    :param row: mapping (dict or pandas Series) indexed by the names in ``compare_columns``
    :return: new list with the remaining tags in their original order
    :raises KeyError: if a name in ``compare_columns`` is missing from ``row``
    """
    # Only string values can equal a tag; numbers, booleans and NaN are ignored.
    # A tuple keeps the comparison equality-based, so unhashable tag values are fine.
    redundant_values = tuple(
        row[column] for column in compare_columns if isinstance(row[column], str)
    )
    return [tag for tag in tags if tag not in redundant_values]

# Register tqdm's progress_apply on pandas objects so the row-wise pass shows a progress bar.
tqdm.pandas(desc="처리 중")

# Row by row, drop tags that repeat a string value of one of the compare_columns.
df['tags'] = df.progress_apply(
    lambda record: remove_duplicates(record['tags'], compare_columns, record),
    axis=1,
)

In [29]:
loc_df['id']

'albert/albert-base-v1'

In [70]:
# Short model name: the part of the id after the "<owner>/" prefix; used in output paths below.
model_name = loc_df['id'].split('/')[1]

In [71]:
loc_df

id                                     google-bert/bert-base-cased
author                                                 google-bert
created_at                               2022-03-02 23:29:04+00:00
last_modified                            2024-02-19 11:02:26+00:00
private                                                      False
gated                                                        False
disabled                                                       NaN
downloads                                                  4819762
likes                                                          231
library_name     transformers, pytorch, tf, jax, safetensors, bert
tags             ['exbert', 'autotrain_compatible', 'endpoints_...
pipeline_tag                                             fill-mask
arxiv                                             arxiv:1810.04805
dataset                                          dataset:wikipedia
region                                                   regio

In [72]:
# Export the processed row as models/<model_name>/Abstract.json.
model_name = loc_df['id'].split('/')[1]
row_dict = loc_df.to_dict()

# Replace float NaN with None so the file contains null rather than the
# non-standard NaN token that json.dump would otherwise emit.
row_dict.update({
    column: None
    for column, cell in row_dict.items()
    if isinstance(cell, float) and np.isnan(cell)
})

# ensure_ascii=False keeps non-ASCII text readable in the file.
with open(f'./models/{model_name}/Abstract.json', 'w', encoding='utf-8') as json_file:
    json.dump(row_dict, json_file, ensure_ascii=False, indent=4)

### 컨텐츠 수집

In [73]:
# Load the page HTML saved by the download step.
file_path = os.path.join('models', folder_name, 'page.txt')

with open(file_path, 'r', encoding='utf-8') as page_file:
    html_content = page_file.read()

In [74]:
# Parse the saved page HTML.
soup = BeautifulSoup(html_content, 'html.parser')
# Locate the model-card container; find() returns None when no matching <div> exists.
model_card = soup.find('div', class_='model-card-content')

In [75]:
# Save the model-card markup; when model_card is None the file holds the text "None".
section_file_path = os.path.join('models', folder_name, 'Contents.txt')

with open(section_file_path, 'w', encoding='utf-8') as contents_file:
    contents_file.write(str(model_card))

### 사이드 수집

In [76]:
# Select the side panel by its full class string. The match depends on the page's
# current utility classes, so this yields None if the markup differs.
side_section = soup.find('section', class_="pt-8 border-gray-100 md:col-span-5 pt-6 md:pb-24 md:pl-6 md:border-l order-first md:order-none")

In [77]:
# Save the side-panel markup; when side_section is None the file holds the text "None".
section_file_path = os.path.join('models', folder_name, 'Side.txt')

with open(section_file_path, 'w', encoding='utf-8') as side_file:
    side_file.write(str(side_section))

### 사이드 딕셔너리(데이터세트, 어댑터, 파인튜닝, 병합, 양자화)

In [78]:
# Re-load the saved side panel and parse it; `soup` now refers to the side panel only.
file_path = os.path.join('models', model_name, 'Side.txt')

with open(file_path, 'r', encoding='utf-8') as side_file:
    side_content = side_file.read()

soup = BeautifulSoup(side_content, 'html.parser')

def get_model_info(selector):
    """Return ``(model_count, link)`` for the first link whose href contains ``selector``.

    Reads the module-level ``soup`` (the parsed side panel).

    :param selector: substring expected inside the link's ``href``, e.g. "base_model:adapter"
    :return: ``(count, href)``; ``(0, "")`` when no such link exists
    :raises ValueError: if the first word of the link text is not an integer
    """
    element = soup.select_one(f'a[href*="{selector}"]')
    if element is None:
        return 0, ""

    # The link text is expected to start with the count, e.g. "42 models".
    words = element.text.split()
    if words:
        # Tolerate thousands separators such as "1,234".
        model_count = int(words[0].replace(',', ''))
    else:
        # Empty link text: report zero instead of failing with IndexError.
        model_count = 0
    return model_count, element['href']

# Dataset names shown in the side panel (all matching headings are collected).
dataset_elements = soup.select('h4.text-md.truncate.font-mono')
dataset_names = list(map(lambda heading: heading.text.strip(), dataset_elements))

# Count and link for each kind of derived model.
adapters_count, adapters_link = get_model_info("base_model:adapter")
finetune_count, finetune_link = get_model_info("base_model:finetune")
merge_count, merge_link = get_model_info("base_model:merge")
quantized_count, quantized_link = get_model_info("base_model:quantized")

# Summary of the side panel for the selected model.
data = {
    "Model Name": model_name,
    "Adapter Model Count": adapters_count,
    "Adapter Link": adapters_link,
    "Finetune Model Count": finetune_count,
    "Finetune Link": finetune_link,
    "Merge Model Count": merge_count,
    "Merge Link": merge_link,
    "Quantized Model Count": quantized_count,
    "Quantized Link": quantized_link,
    "Dataset Names": dataset_names,
}

In [79]:
# If the dataset list arrives as its string representation, turn it back into a list.
# ast.literal_eval accepts only Python literals, so text that originated from a
# scraped page can never execute code the way eval() could.
if isinstance(dataset_names, str):
    dataset_names = ast.literal_eval(dataset_names)

In [80]:
# Write the dataset names to models/<model_name>/derived/dataset/<model_name>.json.
output_directory = f'./models/{model_name}/derived/dataset'
os.makedirs(output_directory, exist_ok=True)

# NOTE: model_name is used as the file name unchanged; nothing here strips
# characters such as '/' that are invalid in file names.
safe_file_name = f"{model_name}.json"
file_path = os.path.join(output_directory, safe_file_name)

# Payload stored in the file.
dataset = {'dataset': dataset_names}

with open(file_path, 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(dataset, indent=4, ensure_ascii=False))

In [81]:
# Make sure the per-model "derived" directory exists (no error when it already does).
os.makedirs(f'./models/{model_name}/derived', exist_ok=True)

In [82]:
# Make sure the "derived/adapter" directory exists (no error when it already does).
os.makedirs(f'./models/{model_name}/derived/adapter', exist_ok=True)

In [83]:
def scrape_huggingface_models(data, model_name_list, page_size=30, timeout=2, max_retries=3):
    """
    Download the Hugging Face listing pages of models derived from a base model
    (adapters, finetunes, merges, quantized) and save each page's result section to
    ``./models/<Model Name>/derived/<type>/<Model Name>_<page>page.txt``.

    A failed request is retried up to ``max_retries`` times before the page is
    reported as an error; previously a single timeout lost the page for good.

    :param data: dict holding 'Model Name' and the '<Type> Model Count' entries
        (the '<Type> Link' entries are not read; the URL is built from ``model_name_list``)
    :param model_name_list: base model id inserted into the listing URL,
        e.g. "google-bert/bert-base-cased"
    :param page_size: number of models assumed per listing page, used to derive the page count
    :param timeout: per-request timeout in seconds
    :param max_retries: maximum number of attempts per page
    :return: None; results are written to disk and failures are printed
    """
    model_types = {
        "Adapter": ("Adapter Model Count", "Adapter Link"),
        "Finetune": ("Finetune Model Count", "Finetune Link"),
        "Merge": ("Merge Model Count", "Merge Link"),
        "Quantized": ("Quantized Model Count", "Quantized Link")
    }

    file_name = data['Model Name']

    for model_type, (count_key, _link_key) in model_types.items():
        page_number = math.ceil(data[count_key] / page_size)
        output_dir = f'./models/{file_name}/derived/{model_type.lower()}/'

        for page in tqdm(range(page_number), desc=f"Processing pages for {file_name} ({model_type})"):
            base_url = f"https://huggingface.co/models?other=base_model:{model_type.lower()}:{model_name_list}&p={page}&sort=trending"

            for attempt in range(1, max_retries + 1):
                try:
                    response = requests.get(base_url, timeout=timeout)
                    response.raise_for_status()

                    soup = BeautifulSoup(response.text, 'html.parser')
                    section_content = soup.find(
                        'section', class_='pt-8 border-gray-100 col-span-full lg:col-span-6 xl:col-span-7 pb-12'
                    )

                    # Pages without the expected section are skipped, as before.
                    if section_content:
                        os.makedirs(output_dir, exist_ok=True)

                        with open(f'{output_dir}{file_name}_{page}page.txt', 'w', encoding='utf-8') as file:
                            file.write(str(section_content))
                    break

                except requests.RequestException as e:
                    # Network or HTTP failure: retry, and report only after the last attempt.
                    if attempt == max_retries:
                        print(f"Error processing {file_name} ({model_type}) on page {page}: {e}")
                    else:
                        time.sleep(attempt)  # wait a little longer before each retry

                except Exception as e:
                    # Any other failure (parsing, file write) stays best-effort:
                    # report it and move on to the next page.
                    print(f"Error processing {file_name} ({model_type}) on page {page}: {e}")
                    break

            # Random pause between pages to avoid hammering the server.
            time.sleep(random.uniform(0.5, 1.5))

In [84]:
# Download the derived-model listing pages for the selected model; `data` supplies the per-type counts.
scrape_huggingface_models(data, search_name)

Processing pages for bert-base-cased (Adapter): 100%|██████████| 2/2 [00:03<00:00,  1.93s/it]
Processing pages for bert-base-cased (Finetune):  43%|████▎     | 31/72 [01:02<01:22,  2.02s/it]

Error processing bert-base-cased (Finetune) on page 31: HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=2)


Processing pages for bert-base-cased (Finetune): 100%|██████████| 72/72 [02:29<00:00,  2.07s/it]
Processing pages for bert-base-cased (Merge): 0it [00:00, ?it/s]
Processing pages for bert-base-cased (Quantized): 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]


In [85]:
# Sub-folders of models/<model_name>/derived/ that hold the downloaded listing pages.
output_dirs = ['adapter', 'finetune', 'merge', 'quantized']

# Listing folder -> folder that receives the extracted JSON.
model_folder = {
    'adapter': 'adapter_instance',
    'finetune': 'finetune_instance',
    'merge': 'merge_instance',
    'quantized': 'quantized_instance'
}

for output_dir in output_dirs:
    folder_path = os.path.join('models', model_name, 'derived', output_dir)
    print(f"Processing directory: {folder_path}")

    # Nothing was downloaded for this type.
    if not os.path.exists(folder_path):
        print(f"Skipping: {folder_path} (Directory not found)")
        continue

    directory = folder_path

    # File-name prefix -> page numbers found on disk.
    file_pattern_data = {}

    for filename in os.listdir(directory):
        # Expected shape: "<prefix>_<N>page.txt". A name without '_' cannot be split
        # into two parts and would break the unpacking below.
        if not filename.endswith('page.txt') or '_' not in filename:
            continue

        model_name_part, page_part = filename.rsplit('_', 1)
        page_num_part = page_part.replace('page.txt', '')

        if page_num_part.isdigit():
            file_pattern_data.setdefault(model_name_part, []).append(int(page_num_part))

    # The loop variable is deliberately NOT called `model_name`: that would overwrite
    # the notebook-level `model_name` used for folder_path above and by later cells.
    for scraped_name, page_numbers in file_pattern_data.items():
        max_page_number = max(page_numbers)

        model_info = {
            output_dir: []
        }

        for page_num in range(0, max_page_number + 1):
            file_path = os.path.join(directory, f'{scraped_name}_{page_num}page.txt')

            # A gap in the page sequence means that page was never saved.
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue

            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            soup = BeautifulSoup(content, 'html.parser')

            # One <article> per listed model.
            articles = soup.find_all('article', class_='overview-card-wrapper group/repo')

            for article in articles:
                model_name_tag = article.find('h4', class_='text-md truncate font-mono text-black dark:group-hover/repo:text-yellow-500 group-hover/repo:text-indigo-600 text-smd')
                # None is recorded when the heading is missing, as before.
                model_name_extracted = model_name_tag.get_text(strip=True) if model_name_tag else None

                model_info[output_dir].append(model_name_extracted)

        # Write the collected names once, after all pages were read. The file used to
        # be rewritten after every page, with the same final content.
        target_folder = model_folder.get(output_dir)
        if target_folder:
            target_dir = f'./models/{scraped_name}/derived/{target_folder}'
            os.makedirs(target_dir, exist_ok=True)

            json_output_path = os.path.join(target_dir, f'{scraped_name}.json')
            with open(json_output_path, "w", encoding="utf-8") as json_file:
                json.dump(model_info, json_file, indent=4)
        else:
            print(f"Folder not found for {output_dir}")

Processing directory: models\bert-base-cased\derived\adapter
Processing directory: models\bert-base-cased\derived\finetune
File not found: models\bert-base-cased\derived\finetune\bert-base-cased_31page.txt
Processing directory: models\bert-base-cased\derived\merge
Skipping: models\bert-base-cased\derived\merge (Directory not found)
Processing directory: models\bert-base-cased\derived\quantized


In [86]:
def merge_model_data(base_folder, instance_folders):
    """Merge the top-level keys of every ``*.json`` file found in the given sub-folders.

    Folders are visited in the order given and files in sorted name order, so when
    two files define the same key the winner is deterministic (the later one).
    ``os.listdir`` alone returns entries in arbitrary order.

    :param base_folder: directory that contains the instance folders
    :param instance_folders: sub-folder names to read; missing folders are skipped
    :return: dict combining the contents of all JSON files read
    """
    data = {}

    for folder in instance_folders:
        instance_path = os.path.join(base_folder, folder)
        if not os.path.exists(instance_path):
            continue

        json_files = sorted(f for f in os.listdir(instance_path) if f.endswith('.json'))

        for file in json_files:
            file_path = os.path.join(instance_path, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                file_data = json.load(f)
            data.update(file_data)

    return data

# Combine the per-type JSON files written under models/<model_name>/derived into one dict.
base_folder = f'./models/{model_name}/derived'
# Folders are read in this order; a later folder overwrites keys set by an earlier one.
instance_folders = ['adapter_instance', 'finetune_instance', 'quantized_instance', 'merge_instance', 'dataset']
model_side_data = merge_model_data(base_folder, instance_folders)

In [87]:
model_side_data

{'adapter': ['identrics/wasper_propaganda_detection_en',
  'tomhodemon/query_encoder_lora_v1',
  'tomhodemon/passage_encoder_lora_v1',
  'tomhodemon/fever-query_encoder-lora-bsz16-77588-gradacc1',
  'tomhodemon/fever_passage_encoder-lora-bsz16-77588-gradacc1',
  'MaximTitarenkoUIT/reward_test_on_filtered_data_lora',
  'MonkeyDdonut/aiVshuman_bert_2epochs',
  'sumittyagi25/test_trainer',
  'Rastael/bert-base-cased-sequence-classification',
  'stanpony/ml_medical_diagnosis',
  'EliasKD/LoRA-imdb-seq-cls',
  'EliasKD/LoRA-my-dataset-seq-cls',
  'MaggieZhang/test_trainer',
  'TransferGraph/bert-base-cased-finetuned-lora-tweet_eval_irony',
  'TransferGraph/bert-base-cased-finetuned-lora-tweet_eval_emotion',
  'TransferGraph/bert-base-cased-finetuned-lora-tweet_eval_hate',
  'rajevan123/STS-Lora-Fine-Tuning-Capstone-bert-testing-70-with-lower-r-mid',
  'Jahanzeb1/BERT-TextClassification',
  'dfoc99/bert-base-cased-finetuned-Astronomy_Thesaurus',
  'stonedsmv/BERT-peft_LoRA',
  'abhishekkumar

In [89]:
# NOTE(review): the visible cells only write Contents.txt; Contents.json must already
# exist (presumably produced by a separate step) — confirm before a fresh run.
file_path = f"./models/{model_name}/Contents.json"  # path of the model-contents JSON to load

with open(file_path, "r", encoding="utf-8") as f:
    model_contents_data = json.load(f)

In [90]:
# Merge the three sources; on a key clash the later source wins
# (for example 'dataset' from model_side_data replaces the value from row_dict).
merged_dict = dict(row_dict)
merged_dict.update(model_side_data)
merged_dict.update(model_contents_data)
print(merged_dict)

{'id': 'google-bert/bert-base-cased', 'author': 'google-bert', 'created_at': '2022-03-02 23:29:04+00:00', 'last_modified': '2024-02-19 11:02:26+00:00', 'private': False, 'gated': 'False', 'disabled': None, 'downloads': 4819762, 'likes': 231, 'library_name': 'transformers, pytorch, tf, jax, safetensors, bert', 'tags': "['exbert', 'autotrain_compatible', 'endpoints_compatible']", 'pipeline_tag': 'fill-mask', 'arxiv': 'arxiv:1810.04805', 'dataset': ['legacy-datasets/wikipedia', 'bookcorpus/bookcorpus'], 'region': 'region:us', 'license': 'license:apache-2.0', 'language': 'en', 'adapter': ['identrics/wasper_propaganda_detection_en', 'tomhodemon/query_encoder_lora_v1', 'tomhodemon/passage_encoder_lora_v1', 'tomhodemon/fever-query_encoder-lora-bsz16-77588-gradacc1', 'tomhodemon/fever_passage_encoder-lora-bsz16-77588-gradacc1', 'MaximTitarenkoUIT/reward_test_on_filtered_data_lora', 'MonkeyDdonut/aiVshuman_bert_2epochs', 'sumittyagi25/test_trainer', 'Rastael/bert-base-cased-sequence-classificat

In [91]:
merged_dict

{'id': 'google-bert/bert-base-cased',
 'author': 'google-bert',
 'created_at': '2022-03-02 23:29:04+00:00',
 'last_modified': '2024-02-19 11:02:26+00:00',
 'private': False,
 'gated': 'False',
 'disabled': None,
 'downloads': 4819762,
 'likes': 231,
 'library_name': 'transformers, pytorch, tf, jax, safetensors, bert',
 'tags': "['exbert', 'autotrain_compatible', 'endpoints_compatible']",
 'pipeline_tag': 'fill-mask',
 'arxiv': 'arxiv:1810.04805',
 'dataset': ['legacy-datasets/wikipedia', 'bookcorpus/bookcorpus'],
 'region': 'region:us',
 'license': 'license:apache-2.0',
 'language': 'en',
 'adapter': ['identrics/wasper_propaganda_detection_en',
  'tomhodemon/query_encoder_lora_v1',
  'tomhodemon/passage_encoder_lora_v1',
  'tomhodemon/fever-query_encoder-lora-bsz16-77588-gradacc1',
  'tomhodemon/fever_passage_encoder-lora-bsz16-77588-gradacc1',
  'MaximTitarenkoUIT/reward_test_on_filtered_data_lora',
  'MonkeyDdonut/aiVshuman_bert_2epochs',
  'sumittyagi25/test_trainer',
  'Rastael/ber

In [92]:
# Write the merged record to models/<model_name>/merged_data.json.
folder_path = f"./models/{model_name}/"
file_path = os.path.join(folder_path, "merged_data.json")

# Create the target folder when it does not exist yet.
os.makedirs(folder_path, exist_ok=True)

# ensure_ascii=False keeps non-ASCII text readable in the file.
with open(file_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(merged_dict, indent=4, ensure_ascii=False))

print(f"파일 저장 완료: {file_path}")

파일 저장 완료: ./models/bert-base-cased/merged_data.json
