# 임계값 100

In [1]:
import requests
import gzip
import pandas as pd
from io import BytesIO

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Download one hourly pageviews dump
def download_pageviews(year, month, day, hour, timeout=60):
    """Download one hourly Wikimedia pageviews dump and return its raw bytes.

    Args:
        year, month, day, hour: Components of the dump timestamp; month, day
            and hour are zero-padded to two characters when the URL is built.
        timeout: Seconds passed to ``requests.get``. Without a timeout the
            request can block forever on a stalled connection.

    Returns:
        The response body as ``bytes`` (still GZIP-compressed).

    Raises:
        ValueError: If the HTTP status is not 200, or if the body does not
            start with the GZIP magic number.

    Network-level exceptions raised by ``requests.get`` are not caught here.
    """
    url = (
        "https://dumps.wikimedia.org/other/pageviews/"
        f"{year}/{year}-{month:0>2}/"
        f"pageviews-{year}{month:0>2}{day:0>2}-{hour:0>2}0000.gz"
    )
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise ValueError(f"Failed to download file. Status code: {response.status_code}")

    # The URL always ends in ".gz", so checking the URL suffix can never fail.
    # Inspect the payload itself: every GZIP stream starts with bytes 1f 8b.
    if not response.content.startswith(b"\x1f\x8b"):
        raise ValueError("The downloaded file is not a GZIP file.")

    return response.content

# Decompress GZIP bytes into text
def extract_gz(data):
    """Decompress in-memory GZIP bytes and return the payload decoded as UTF-8."""
    buffer = BytesIO(data)
    with gzip.GzipFile(fileobj=buffer, mode='rb') as archive:
        raw = archive.read()
    return raw.decode('utf-8')

# Parse the dump text into a DataFrame
def parse_to_dataframe(data):
    """Parse the text of a pageviews dump into a DataFrame of string columns.

    Each non-blank line is expected to hold four space-separated fields:
    language/domain code, page title, view count and a trailing size field.
    The two numeric fields are taken from the right and the code from the
    left, so a title that is empty or contains unusual whitespace characters
    stays in the ``PageTitle`` column instead of shifting the other fields.

    Args:
        data: Full decoded text of the dump, lines separated by ``\\n``.

    Returns:
        DataFrame with columns ``LanguageCode``, ``PageTitle``, ``PageViews``
        and ``Metadata``; all values are strings. Lines that do not contain
        the four fields are skipped.
    """
    columns = ['LanguageCode', 'PageTitle', 'PageViews', 'Metadata']
    records = []
    for raw_line in data.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # Peel the two trailing numeric fields off the right-hand side.
        parts = line.rsplit(' ', 2)
        if len(parts) != 3:
            continue
        prefix, views, size = parts
        # Whatever follows the first space of the remainder is the title.
        code, sep, title = prefix.partition(' ')
        if not sep:
            continue
        records.append([code, title, views, size])
    return pd.DataFrame(records, columns=columns)

# Download, decompress and parse in one call
def fetch_pageviews_to_dataframe(year, month, day, hour):
    """Fetch the pageviews dump for the given hour and return it as a DataFrame.

    Chains the three steps defined in this file: download the compressed
    dump, decompress it to text, and parse the text into a DataFrame.
    """
    compressed = download_pageviews(year, month, day, hour)
    text = extract_gz(compressed)
    return parse_to_dataframe(text)

In [3]:
# Clean and enrich the raw pageviews table
def clean_data(df, min_views=100):
    """Filter, normalize and enrich a raw pageviews DataFrame.

    The input frame is left untouched; all work happens on a derived frame,
    so the function can be called repeatedly on the same input.

    Steps:
        1. Name the four columns and drop ``Metadata``.
        2. Convert ``PageViews`` to integers (missing or non-numeric -> 0).
        3. Keep rows with at least ``min_views`` views, sorted descending.
        4. Drop titles that start with a non-article prefix.
        5. Drop language codes that have 10 or fewer remaining rows.
        6. Strip titles and replace underscores with spaces, except for rows
           whose language code is exactly ``'ar'``.
        7. Add ``Country`` (mapped from the part of the language code before
           the first dot) and ``URL`` columns.

    Args:
        df: DataFrame with exactly four columns in the order language code,
            page title, view count, metadata.
        min_views: Minimum view count a row needs to be kept.

    Returns:
        A new DataFrame with columns ``LanguageCode``, ``PageTitle``,
        ``PageViews``, ``Country`` and ``URL`` and a fresh 0..n-1 index.

    Raises:
        ValueError: If ``df`` does not have exactly four columns.
    """
    # Rename on a new object instead of assigning to df.columns / dropping
    # in place, which would alter the caller's frame.
    work = df.set_axis(
        ['LanguageCode', 'PageTitle', 'PageViews', 'Metadata'], axis=1
    ).drop(columns=['Metadata'])

    # Rows without a language code or title cannot be filtered or linked.
    work = work.dropna(subset=['LanguageCode', 'PageTitle'])

    # Missing or non-numeric counts become 0 and are removed by the threshold.
    views = pd.to_numeric(work['PageViews'], errors='coerce').fillna(0).astype(int)
    work = work.assign(PageViews=views)

    work = work[work['PageViews'] >= min_views]
    work = work.sort_values(by='PageViews', ascending=False)

    # Titles still use underscores at this point, so the underscore form of
    # "User talk:" is the one that can actually match.
    exclude_prefixes = (
        "Main_Page", "Special:", "Talk:", "User:", "User talk:", "User_talk:",
        "Category:", "Portal:", "File:", "Template:", "Help:", "Wikipedia:",
        "MediaWiki:", "Spécial:", "メインページ",
    )
    excluded = work['PageTitle'].str.startswith(exclude_prefixes, na=False).astype(bool)
    work = work[~excluded]

    # Remove language codes with too few remaining rows.
    language_counts = work['LanguageCode'].value_counts()
    rare_languages = language_counts[language_counts <= 10].index
    # Copy so the column assignments below write to an independent frame.
    work = work[~work['LanguageCode'].isin(rare_languages)].copy()

    # Vectorized title cleanup; also works when no rows are left.
    stripped = work['PageTitle'].str.strip()
    spaced = stripped.str.replace('_', ' ', regex=False)
    work['PageTitle'] = spaced.where(work['LanguageCode'] != 'ar', stripped)

    language_to_country = {
        'en': 'United States', 'ar': 'Saudi Arabia', 'zh': 'China',
        'es': 'Spain', 'fr': 'France', 'de': 'Germany', 'ru': 'Russia',
        'ko': 'Korea', 'ja': 'Japan', 'fa': 'Iran', 'vi': 'Vietnam',
        'he': 'Israel', 'id': 'Indonesia', 'it': 'Italy', 'pl': 'Poland',
        'pt': 'Portugal', 'tl': 'Philippines', 'tr': 'Turkey',
    }
    base_language = work['LanguageCode'].str.split('.').str[0]
    work['Country'] = base_language.map(language_to_country)

    # URLs use the underscore form of the title.
    url_titles = work['PageTitle'].str.replace(' ', '_', regex=False)
    work['URL'] = "https://" + work['LanguageCode'] + ".wikipedia.org/wiki/" + url_titles

    return work.reset_index(drop=True)

In [4]:
# Example run: fetch one hourly dump, clean it, and preview the first rows
year = 2024
month = 7
day = 31
hour = 1
df = fetch_pageviews_to_dataframe(year, month, day, hour)
cleaned_df = clean_data(df)
print(cleaned_df.head())

  LanguageCode       PageTitle  PageViews        Country  \
0         en.m    Simone Biles      79720  United States   
1         en.m  Jonathan Owens      38655  United States   
2           en               -      34618  United States   
3         en.m      Sunisa Lee      27695  United States   
4         en.m   Jordan Chiles      25307  United States   

                                              URL  
0    https://en.m.wikipedia.org/wiki/Simone_Biles  
1  https://en.m.wikipedia.org/wiki/Jonathan_Owens  
2                 https://en.wikipedia.org/wiki/-  
3      https://en.m.wikipedia.org/wiki/Sunisa_Lee  
4   https://en.m.wikipedia.org/wiki/Jordan_Chiles  


In [5]:
# Print a summary of the cleaned frame: index range, columns, non-null counts and dtypes
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10073 entries, 0 to 10072
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   LanguageCode  10073 non-null  object
 1   PageTitle     10073 non-null  object
 2   PageViews     10073 non-null  int32 
 3   Country       10073 non-null  object
 4   URL           10073 non-null  object
dtypes: int32(1), object(4)
memory usage: 354.3+ KB
