# 전체적인 코드입니다.

## 여기서 중요한 점은 API 요청 제한에 걸리지 않아야 한다는 것입니다.

In [1]:
# pip install pandas requests

In [2]:
import pandas as pd
import requests
import gzip
from io import BytesIO
import wikipediaapi
import time


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## 덤프데이터 가져오기

In [3]:
# 데이터 다운로드 및 추출 함수
def download_pageviews(year, month, day, hour):
    """Download one hourly Wikimedia pageviews dump and return its raw bytes.

    Args:
        year: Year of the dump, used as-is in the URL.
        month: Month of the dump; left-padded with zeros to two characters.
        day: Day of the dump; left-padded with zeros to two characters.
        hour: Hour of the dump; left-padded with zeros to two characters.

    Returns:
        bytes: The still-compressed gzip payload of the dump file.

    Raises:
        ValueError: If the server does not answer with HTTP 200, or if the
            payload does not start with the gzip magic number.
        requests.exceptions.RequestException: On network failures, including
            connect/read timeouts.
    """
    url = f"https://dumps.wikimedia.org/other/pageviews/{year}/{year}-{month:0>2}/pageviews-{year}{month:0>2}{day:0>2}-{hour:0>2}0000.gz"
    # (connect, read) timeouts: without them a stalled connection would hang
    # the whole pipeline indefinitely.
    response = requests.get(url, timeout=(10, 60))

    if response.status_code != 200:
        raise ValueError(f"Failed to download file. Status code: {response.status_code}")

    # The URL always ends in ".gz", so checking the URL (or trusting the
    # Content-Type header) can never reject a bad payload. Inspect the gzip
    # magic number of the body instead.
    payload = response.content
    if not payload.startswith(b'\x1f\x8b'):
        raise ValueError("The downloaded file is not a GZIP file.")
    return payload

def extract_gz(data):
    """Decompress gzip-compressed bytes and return the content as UTF-8 text.

    Args:
        data: Gzip-compressed bytes.

    Returns:
        str: The decompressed payload decoded as UTF-8.
    """
    buffer = BytesIO(data)
    with gzip.GzipFile(fileobj=buffer, mode='rb') as gz_stream:
        raw_bytes = gz_stream.read()
    return raw_bytes.decode('utf-8')

def parse_to_dataframe(data):
    """Turn the text of a pageviews dump into a four-column DataFrame.

    Each non-empty line (lines are separated by '\\n') is split on whitespace
    and becomes one row. All values are kept as strings.

    Args:
        data: Decompressed dump content as a single string.

    Returns:
        pandas.DataFrame: Columns 'domain_code', 'page_title', 'view_counts'
        and 'another_column'.
    """
    column_names = ['domain_code', 'page_title', 'view_counts', 'another_column']
    rows = []
    for raw_line in data.split('\n'):
        if not raw_line:
            continue  # skip empty lines such as the one after a trailing newline
        rows.append(raw_line.split())
    return pd.DataFrame(rows, columns=column_names)

def fetch_pageviews_to_dataframe(year, month, day, hour):
    """Download, decompress and parse one hourly pageviews dump.

    Args:
        year, month, day, hour: Identify the hourly dump to fetch; passed
            straight through to download_pageviews.

    Returns:
        pandas.DataFrame: The parsed dump as produced by parse_to_dataframe.
    """
    # Pipeline: raw gzip bytes -> decoded text -> DataFrame.
    return parse_to_dataframe(extract_gz(download_pageviews(year, month, day, hour)))


## 덤프데이터 정제하기

In [4]:
def clean_data(df, top_percentage=10):
    """Clean a parsed pageviews DataFrame and keep only the most-viewed pages.

    Steps, in order:
      1. Rename the four columns and drop the last one ('Metadata').
      2. Fill missing view counts with 0 and convert them to int.
      3. Keep the top ``top_percentage`` percent of rows by page views.
      4. Drop pages whose title starts with a non-article prefix.
      5. Drop language codes that have 10 or fewer remaining rows.
      6. Add a 'Country' column derived from the part of the language code
         before the first '.'.

    The input frame is not modified, so the function can safely be called
    again on the same frame (e.g. when re-running a notebook cell).

    Args:
        df: DataFrame with exactly four columns, in the order produced by
            parse_to_dataframe.
        top_percentage: Percentage (0-100) of the most-viewed rows to keep
            before the prefix and language filters are applied.

    Returns:
        pandas.DataFrame: Columns 'LanguageCode', 'PageTitle', 'PageViews'
        and 'Country', sorted by 'PageViews' in descending order. 'Country'
        is NaN for languages missing from the mapping.

    Raises:
        ValueError: If ``df`` does not have exactly four columns.
    """
    # Work on a copy: renaming/dropping columns on the caller's frame would
    # make a second call fail with a column-count mismatch.
    cleaned = df.copy()
    cleaned.columns = ['LanguageCode', 'PageTitle', 'PageViews', 'Metadata']
    cleaned = cleaned.drop(columns=['Metadata'])

    # Missing counts become 0 so the integer conversion cannot fail on them.
    cleaned['PageViews'] = cleaned['PageViews'].fillna(0).astype(int)

    # Keep the top `top_percentage` percent of rows by page views.
    df_sorted = cleaned.sort_values(by='PageViews', ascending=False)
    top_n = int(len(df_sorted) * top_percentage / 100)
    df_filtered = df_sorted.head(top_n)

    # Title prefixes of non-article pages to exclude. Dump titles use
    # underscores instead of spaces, so "User_talk:" is the form that
    # actually occurs; "User talk:" is kept for backward compatibility.
    exclude_prefixes = (
        "Main_Page", "Special:", "Talk:", "User:", "User talk:", "User_talk:",
        "Category:", "Portal:", "File:", "Template:", "Help:", "Wikipedia:",
        "MediaWiki:", "Spécial:", "メインページ",
    )
    # na=False keeps the mask strictly boolean when a title is missing
    # (a NaN in the mask would break the `~` negation); such rows are kept.
    is_excluded = df_filtered['PageTitle'].str.startswith(exclude_prefixes, na=False)
    df_filtered = df_filtered[~is_excluded]

    # Remove language codes with 10 or fewer remaining pages.
    language_counts = df_filtered['LanguageCode'].value_counts()
    languages_to_remove = language_counts[language_counts <= 10].index
    # .copy() so the column assignment below targets an independent frame
    # rather than a slice of another one (avoids SettingWithCopyWarning).
    df_filtered = df_filtered[~df_filtered['LanguageCode'].isin(languages_to_remove)].copy()

    # Language -> country lookup used to build the 'Country' column.
    language_to_country = {
        'en': 'United States', 'ar': 'Saudi Arabia', 'zh': 'China',
        'es': 'Spain', 'fr': 'France', 'de': 'Germany', 'ru': 'Russia',
        'ko': 'Korea', 'ja': 'Japan', 'fa': 'Iran', 'vi': 'Vietnam',
        'he': 'Israel', 'id': 'Indonesia', 'it': 'Italy', 'pl': 'Poland',
        'pt': 'Portugal', 'tl': 'Philippines', 'tr': 'Turkey',
    }

    # Codes such as 'en.m' map through their first component ('en').
    base_language = df_filtered['LanguageCode'].str.split('.').str[0]
    df_filtered['Country'] = base_language.map(language_to_country)

    return df_filtered


## 위키피디아 API를 통해 요약 정보 추출하기

In [5]:
# 다양한 언어에 대한 위키피디아 인스턴스를 생성하는 함수
def get_wikipedia_instance(language_code):
    """Create a wikipediaapi client for the given language code.

    Args:
        language_code: Language identifier passed to wikipediaapi.Wikipedia.

    Returns:
        wikipediaapi.Wikipedia: Client created with this project's
        user-agent string.
    """
    user_agent = 'Brainpedia/1.0 (cmhcms1115@naver.com)'
    client = wikipediaapi.Wikipedia(user_agent, language_code)
    return client

# 페이지 제목 리스트에 대해 요약 정보를 가져오는 함수
def fetch_summary_batch(language_code, page_titles):
    """Look up the summary of every title in ``page_titles``.

    Args:
        language_code: Language code used to create the Wikipedia client.
        page_titles: Iterable of page titles to look up.

    Returns:
        dict: Maps each title to its summary text. If the lookup for a title
        raises, the exception message is stored in place of the summary.
    """
    wiki_client = get_wikipedia_instance(language_code)

    def _summary_or_error(page_title):
        # Best effort: a failing title must not abort the rest of the batch,
        # so the error text is recorded as that title's value instead.
        try:
            return wiki_client.page(page_title).summary
        except Exception as exc:
            return str(exc)

    return {page_title: _summary_or_error(page_title) for page_title in page_titles}

# 데이터를 배치로 나누어 요청하고, 요청 후 대기하는 함수
def add_summaries_to_df(df):
    summaries = []
    batch_size = 10
    sleep_time = 4  # 초
    
    # 각 언어 코드에 대해 처리
    for language_code in df['LanguageCode'].unique():
        df_language = df[df['LanguageCode'] == language_code]
        page_titles = df_language['PageTitle'].tolist()
        
        for i in range(0, len(page_titles), batch_size):
            batch_titles = page_titles[i:i + batch_size]
            print(f"Fetching summaries for batch {i // batch_size + 1} in language {language_code}")
            
            batch_summaries = fetch_summary_batch(language_code, batch_titles)
            for title, summary in batch_summaries.items():
                summaries.append({'LanguageCode': language_code, 'PageTitle': title, 'Summary': summary})
            
            time.sleep(sleep_time)  # 배치 요청 후 대기
    
    # DataFrame으로 변환
    summaries_df = pd.DataFrame(summaries)
    df = df.merge(summaries_df, on=['LanguageCode', 'PageTitle'], how='left')
    
    return df



In [6]:
# 예시 사용
def main(year, month, day, hour):
    """Run the full pipeline for one hourly pageviews dump.

    Args:
        year, month, day, hour: Identify the hourly dump to process.

    Returns:
        pandas.DataFrame: The cleaned pageviews data with summaries added.
    """
    # Step 1: download and parse the dump.
    raw_pageviews = fetch_pageviews_to_dataframe(year, month, day, hour)

    # Steps 2 and 3: clean the data, then add summaries via the Wikipedia API.
    return add_summaries_to_df(clean_data(raw_pageviews))

# Example run: process the dump for 2024-07-31, hour 01, and preview the result.
year = 2024
month = 7
day = 31
hour = 1
df_final = main(year, month, day, hour)
print(df_final.head())

Fetching summaries for batch 1 in language en.m
Fetching summaries for batch 2 in language en.m
Fetching summaries for batch 3 in language en.m
Fetching summaries for batch 4 in language en.m
Fetching summaries for batch 5 in language en.m
Fetching summaries for batch 6 in language en.m
Fetching summaries for batch 7 in language en.m
Fetching summaries for batch 8 in language en.m
Fetching summaries for batch 9 in language en.m
Fetching summaries for batch 10 in language en.m
Fetching summaries for batch 11 in language en.m
Fetching summaries for batch 12 in language en.m
Fetching summaries for batch 13 in language en.m
Fetching summaries for batch 14 in language en.m
Fetching summaries for batch 15 in language en.m
Fetching summaries for batch 16 in language en.m
Fetching summaries for batch 17 in language en.m
Fetching summaries for batch 18 in language en.m
Fetching summaries for batch 19 in language en.m
Fetching summaries for batch 20 in language en.m
Fetching summaries for batch 

In [None]:
# Print a concise summary of the final DataFrame (columns, non-null counts, dtypes).
df_final.info()