In [1]:
import pandas as pd
import requests
import re
import time
import os
from bs4 import BeautifulSoup

def task_download_graph_extract(model_name, max_retries=5, delay=5):
    """Scrape a Hugging Face model page for task, download count and graph path.

    Args:
        model_name: Repository id appended to ``https://huggingface.co/``.
        max_retries: Maximum number of GET attempts; must be at least 1.
        delay: Seconds to sleep between failed attempts.

    Returns:
        A ``(task, total, d_attribute)`` tuple. ``task`` is the stripped text
        of the first header link whose href matches
        ``/models?pipeline_tag=...``; ``total`` is the text of the first
        matched ``dd`` element with commas removed (presumably the download
        counter -- confirm against the current page layout); ``d_attribute``
        is the ``d`` attribute of the first ``path`` element inside the
        targeted ``div``. Each item is ``None`` when its element is missing
        (``d_attribute`` is also ``None`` when the attribute is empty).

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        requests.RequestException: Re-raised from the final failed attempt.
    """
    if max_retries < 1:
        # Otherwise the loop body never runs and ``soup`` would be unbound.
        raise ValueError("max_retries must be at least 1")

    url = f'https://huggingface.co/{model_name}'
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            break
        # requests.Timeout is a RequestException subclass, so one type suffices.
        except requests.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(delay)
            else:
                raise

    # Raw string: the backslash escapes ':' for the CSS selector engine and
    # must not be read as a (invalid) Python string escape.
    task_doc = soup.select_one(
        r'body > div > main > div.SVELTE_HYDRATER.contents > header > div > div.mb-3.flex.flex-wrap.md\:mb-4'
    )
    task = None
    if task_doc is not None:
        task_link = task_doc.find('a', href=re.compile(r'/models\?pipeline_tag=.*'))
        if task_link is not None:
            task = task_link.text.strip()

    downloads = soup.select_one('body > div:nth-of-type(1) > main > div:nth-of-type(2) > section:nth-of-type(2) > div:nth-of-type(1) > dl > dd')
    total = None if downloads is None else downloads.text.replace(',', '')

    target_div = soup.select_one('html > body > div > main > div:nth-of-type(2) > section:nth-of-type(2) > div:nth-of-type(1) > div')
    d_attribute = None
    if target_div is not None:
        first_path = target_div.find('path')
        if first_path is not None:
            # An empty ``d`` attribute is reported as None, like a missing one.
            d_attribute = first_path.get('d') or None

    return task, total, d_attribute

def parse_path_data(path_data):
    """Turn the numbers embedded in a path string into ``(x, y)`` float pairs.

    Every signed integer or decimal token found in ``path_data`` is read in
    order; consecutive tokens are paired as x then y.

    Args:
        path_data: String to scan for numeric tokens.

    Returns:
        A list of ``(float, float)`` tuples, empty when no number is found.

    Raises:
        IndexError: If the string holds an odd number of numeric tokens.
    """
    numbers = [float(token) for token in re.findall(r'[-+]?[0-9]*\.?[0-9]+', path_data)]
    xs = numbers[0::2]
    ys = numbers[1::2]
    # Index ys explicitly so that a dangling x without a partner raises
    # IndexError instead of being silently dropped.
    return [(x, ys[k]) for k, x in enumerate(xs)]

def cal_graph(path_data, download_total):
    """Estimate 30 daily download counts from a graph path and a total.

    The points parsed from ``path_data`` are stretched so the largest x maps
    to day 30, each y is flipped with ``100 - y``, the curve is accumulated
    into 30 day buckets, and the buckets are rescaled and rounded so that
    they sum to ``download_total``.

    Args:
        path_data: Path string understood by ``parse_path_data``, or ``None``.
        download_total: Total to distribute; anything accepted by both
            ``float()`` and ``int()``, or ``None`` when unknown.

    Returns:
        A list of 30 ints. All zeros when the path or the total is missing,
        the path cannot be parsed into points, the points have no positive
        x extent, or the accumulated curve sums to zero.

    Raises:
        ValueError: If ``download_total`` is a string that is not a number.
    """
    days = 30

    # Without a curve or a total there is nothing to distribute.
    if path_data is None or download_total is None:
        return [0] * days

    try:
        points = parse_path_data(path_data)
    except IndexError:
        # Odd number of coordinates: the path is not usable.
        return [0] * days

    if not points:
        return [0] * days

    max_day = max(x for x, _ in points)
    if max_day <= 0:
        # No horizontal extent to map onto the 30-day axis.
        return [0] * days

    scale_factor = days / max_day
    # NOTE(review): ``100 - y`` presumably undoes a top-down y axis of
    # height 100 -- confirm against the scraped SVG.
    normalized_points = [(x * scale_factor, 100 - y) for x, y in points]
    daily_downloads = [0] * days

    segments = zip(normalized_points, normalized_points[1:])
    for (start_day, start_value), (end_day, end_value) in segments:
        start_day_int = int(start_day)
        end_day_int = int(end_day)

        if start_day_int == end_day_int:
            # Segment lies within one day: credit the mean of its endpoints.
            if start_day_int < days:
                daily_downloads[start_day_int] += (start_value + end_value) / 2
            continue

        range_span = end_day - start_day
        increment = (end_value - start_value) / range_span
        # NOTE(review): ``increment`` is a per-day slope while ``fraction``
        # is a 0..1 share in the middle-day case; the arithmetic is kept
        # exactly as before to preserve existing output -- confirm intent.
        for j in range(start_day_int, min(end_day_int + 1, days)):
            if j == start_day_int:
                fraction = 1 - (start_day - start_day_int)
            elif j == end_day_int:
                fraction = end_day - end_day_int
            else:
                fraction = (j - start_day) / range_span
            daily_downloads[j] += start_value + fraction * increment

    total_downloads = sum(daily_downloads)
    if total_downloads == 0:
        # A flat-zero curve gives no proportions to scale by.
        return [0] * days

    scaling_factor = float(download_total) / total_downloads
    daily_downloads = [int(round(d * scaling_factor)) for d in daily_downloads]

    # Rounding may leave the sum off by a few units; spread the remainder one
    # unit at a time, from the first day when short and from the last day
    # when over.
    difference = int(download_total) - sum(daily_downloads)
    if difference > 0:
        step = 1
        adjustment_indices = list(range(days))
    else:
        step = -1
        adjustment_indices = list(range(days - 1, -1, -1))
    for i in range(abs(difference)):
        daily_downloads[adjustment_indices[i % days]] += step

    # Clamp negative days to zero and take the deficit out of other days,
    # scanning from day 1, so the overall sum is unchanged where possible.
    for i in range(days):
        if daily_downloads[i] < 0:
            surplus = -daily_downloads[i]
            daily_downloads[i] = 0
            for j in range(days):
                if daily_downloads[j] > surplus:
                    daily_downloads[j] -= surplus
                    break
                else:
                    surplus -= daily_downloads[j]
                    daily_downloads[j] = 0

    return daily_downloads

def save_excel(model_name, start_index, end_index):
    """Scrape one model and append its row to the daily-download CSV.

    Despite the name, the output is a CSV file
    (``20240615_Daily_Download.csv`` in the working directory) with the
    columns ``Model_name``, ``Model_task`` and ``1Day`` .. ``30Day``.

    Args:
        model_name: Repository id passed to ``task_download_graph_extract``.
        start_index: Unused; kept so existing callers keep working.
        end_index: Unused; kept so existing callers keep working.
    """
    model_task, download_total, path_data = task_download_graph_extract(model_name)
    daily_downloads = cal_graph(path_data, download_total)
    # A task label of 'transformers' (any casing) is stored as missing.
    if model_task and model_task.lower() == 'transformers':
        model_task = None
    data = {
        "Model_name": [model_name],
        "Model_task": [model_task],
        **{f"{i+1}Day": [daily_downloads[i]] for i in range(30)},
    }
    df = pd.DataFrame(data)
    output_file = "20240615_Daily_Download.csv"

    # Append the single row instead of re-reading and rewriting the whole
    # file on every call; emit the header only when the file is new or empty.
    write_header = (
        not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    )
    df.to_csv(output_file, mode='a', header=write_header, index=False)

# Read the input CSV file
file_path = r"./sorted_hugging_face_model_influence_with_scores.csv"
df = pd.read_csv(file_path)
# Sort the dataframe by influence score in descending order
result_df = df.sort_values(by='total_score', ascending=False)
model_list = result_df['id'].tolist()
print(f"총 행 수: {len(model_list)} 행")

# Process at most the top 10000 models; slicing keeps the loop from running
# past the end of the list when the input file has fewer rows.
max_models = 10000
for i, model_id in enumerate(model_list[:max_models]):
    print(model_id)
    save_excel(model_id, 0, len(model_list))
    print(f"{i+1}번 모델 데이터 수집 중입니다.")

print("Data collection completed")


  task_doc = soup.select_one('body > div > main > div.SVELTE_HYDRATER.contents > header > div > div.mb-3.flex.flex-wrap.md\:mb-4')


총 행 수: 717070 행
google-bert/bert-base-uncased
1번 모델 데이터 수집 중입니다.
openai/clip-vit-large-patch14
2번 모델 데이터 수집 중입니다.
openai-community/gpt2
3번 모델 데이터 수집 중입니다.
MIT/ast-finetuned-audioset-10-10-0.4593
4번 모델 데이터 수집 중입니다.
distilbert/distilbert-base-uncased
5번 모델 데이터 수집 중입니다.
sentence-transformers/all-MiniLM-L6-v2
6번 모델 데이터 수집 중입니다.
openai/clip-vit-base-patch32
7번 모델 데이터 수집 중입니다.
facebook/bart-large-cnn
8번 모델 데이터 수집 중입니다.
FacebookAI/xlm-roberta-base
9번 모델 데이터 수집 중입니다.
FacebookAI/roberta-base
10번 모델 데이터 수집 중입니다.
google-bert/bert-base-chinese
11번 모델 데이터 수집 중입니다.
distilbert/distilbert-base-uncased-finetuned-sst-2-english
12번 모델 데이터 수집 중입니다.
openai/whisper-large-v3
13번 모델 데이터 수집 중입니다.
google-t5/t5-base
14번 모델 데이터 수집 중입니다.
google-bert/bert-base-multilingual-cased
15번 모델 데이터 수집 중입니다.
facebook/bart-large-mnli
16번 모델 데이터 수집 중입니다.
google/vit-base-patch16-224
17번 모델 데이터 수집 중입니다.
runwayml/stable-diffusion-v1-5
18번 모델 데이터 수집 중입니다.
FacebookAI/roberta-large
19번 모델 데이터 수집 중입니다.
google/vit-base-patch16-224-in2