In [1]:
pip install pandas requests

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# 전세계 페이지

In [2]:
import pandas as pd
import requests
import gzip
from io import BytesIO

# 1. 데이터 다운로드
def download_pageviews(year, month, day, hour):
    url = (
        f"https://dumps.wikimedia.org/other/pageviews/"
        f"{year}/{year}-{month:0>2}/pageviews-{year}{month:0>2}{day:0>2}-{hour:0>2}0000.gz"
    )
    response = requests.get(url)
    return response.content

# 2. 데이터 추출
def extract_gz(data):
    with gzip.open(BytesIO(data), 'rb') as f:
        return f.read().decode('utf-8')

# 3. 데이터 파싱 및 DataFrame 변환
def parse_to_dataframe(data):
    lines = data.split('\n')
    records = [line.split() for line in lines if line]
    # 데이터의 실제 컬럼 수에 맞춰 컬럼명 지정
    df = pd.DataFrame(records, columns=['domain_code', 'page_title', 'view_counts', 'another_column'])
    return df

# 4. 전체 파이프라인 실행
def fetch_pageviews_to_dataframe(year, month, day, hour):
    gz_data = download_pageviews(year, month, day, hour)
    extracted_data = extract_gz(gz_data)
    df = parse_to_dataframe(extracted_data)
    return df

# 예시 사용
year, month, day, hour = 2024, 7, 30, 0
df = fetch_pageviews_to_dataframe(year, month, day, hour)

# DataFrame 출력
df.head()


# 2시간 덤프 데이터 GO!

In [3]:
import pandas as pd
import requests
import gzip
from io import BytesIO

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# 데이터 다운로드
def download_pageviews(year, month, day, hour):
    url = (
        f"https://dumps.wikimedia.org/other/pageviews/"
        f"{year}/{year}-{month:0>2}/pageviews-{year}{month:0>2}{day:0>2}-{hour:0>2}0000.gz"
    )
    response = requests.get(url)
    return response.content

# 데이터 압축 해제
def extract_gz(data):
    with gzip.open(BytesIO(data), 'rb') as f:
        return f.read().decode('utf-8')

# 데이터 파싱 및 DataFrame 변환
def parse_to_dataframe(data):
    lines = data.split('\n')
    records = [line.split() for line in lines if line]
    # 데이터의 실제 컬럼 수에 맞춰 수정
    df = pd.DataFrame(records)
    return df

# 하루치 데이터 다운로드 및 DataFrame으로 변환
def fetch_daily_data(year, month, day):
    all_data = []
    for hour in range(2):
        gz_data = download_pageviews(year, month, day, hour)
        extracted_data = extract_gz(gz_data)
        df = parse_to_dataframe(extracted_data)
        all_data.append(df)
    
    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df

# 예시 사용
year, month, day = 2024, 7, 31
daily_df = fetch_daily_data(year, month, day)



In [5]:
daily_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10799717 entries, 0 to 10799716
Data columns (total 4 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   0       object
 1   1       object
 2   2       object
 3   3       object
dtypes: object(4)
memory usage: 329.6+ MB


In [6]:
daily_df

Unnamed: 0,0,1,2,3
0,"""""",Category:Abstract_Wikipedia,1,0
1,"""""",Category:Account_creators,1,0
2,"""""",Category:Account_creators/ar,1,0
3,"""""",Category:Account_creators/gu,1,0
4,"""""",Category:Administrators,1,0
...,...,...,...,...
10799712,zu.m,Umkholezima,2,0
10799713,zu.m,Wikipedia:Statistics,1,0
10799714,zu.m,XVideos,2,0
10799715,zu.m.b,Special:Search,1,0


In [7]:
daily_df.columns = ['Column1', 'Column2', 'Column3', 'Column4']

In [8]:
# 문자열 정규화 함수 정의
def clean_string(s):
    if isinstance(s, str):
        # 불필요한 따옴표와 빈 문자열 처리
        s = s.strip().strip("'")
        return '' if s == '""' else s
    return s

# 데이터 전처리
daily_df['Column1'] = daily_df['Column1'].apply(clean_string)
df_cleaned = daily_df[daily_df['Column1'] != '']

In [9]:
df_cleaned = df_cleaned.drop_duplicates()
df_cleaned

Unnamed: 0,Column1,Column2,Column3,Column4
1864,aa,Category:User_es,1,0
1865,aa,Main_Page,6,0
1866,aa,Special:Log/block,1,0
1867,aa,Special:UrlShortener,3,0
1868,aa.b,Main_Page,1,0
...,...,...,...,...
10799712,zu.m,Umkholezima,2,0
10799713,zu.m,Wikipedia:Statistics,1,0
10799714,zu.m,XVideos,2,0
10799715,zu.m.b,Special:Search,1,0


In [10]:
df_cleaned.columns = ['LanguageCode', 'PageTitle', 'PageViews', 'Metadata']
df_cleaned

Unnamed: 0,LanguageCode,PageTitle,PageViews,Metadata
1864,aa,Category:User_es,1,0
1865,aa,Main_Page,6,0
1866,aa,Special:Log/block,1,0
1867,aa,Special:UrlShortener,3,0
1868,aa.b,Main_Page,1,0
...,...,...,...,...
10799712,zu.m,Umkholezima,2,0
10799713,zu.m,Wikipedia:Statistics,1,0
10799714,zu.m,XVideos,2,0
10799715,zu.m.b,Special:Search,1,0


In [15]:
# 결측치 또는 비어있는 값이 있다면 처리 (예: NaN을 0으로 대체)
df_cleaned['PageViews'] = df_cleaned['PageViews'].fillna(0)

# object 타입을 int로 변환
df_cleaned['PageViews'] = df_cleaned['PageViews'].astype(int)

In [18]:
# 페이지 유형에 따른 필터링 기준 설정
exclude_prefixes = [
    "Main_Page", "Special:", "Talk:", "User:", "User talk:", 
    "Category:", "Portal:", "File:", "Template:", "Help:", "Wikipedia:"
]

# 특정 유형의 페이지 제외
# PageTitle이 제외할 접두사를 가진 경우 제외
df_filtered = df_cleaned[~df_cleaned['PageTitle'].str.startswith(tuple(exclude_prefixes))]

In [19]:
df_filtered

Unnamed: 0,LanguageCode,PageTitle,PageViews,Metadata
1873,ab,1067,2,0
1874,ab,1157,1,0
1875,ab,1159,1,0
1876,ab,1162,1,0
1877,ab,1320,1,0
...,...,...,...,...
10799710,zu.m,Ukuvukela_kweSoweto,1,0
10799711,zu.m,Umbhoxo,2,0
10799712,zu.m,Umkholezima,2,0
10799714,zu.m,XVideos,2,0


In [20]:
import pandas as pd

# 데이터프레임 불러오기 (실제 데이터를 사용하세요)
# df = pd.read_csv('path_to_wikipedia_data.csv')

# 예시 데이터에서 페이지뷰가 10 이상인 경우만 남기기
threshold = 10
df_filtered2 = df_filtered[df_filtered['PageViews'] >= threshold]

In [21]:
# 필터링 결과 확인
print(df_filtered2.head())
print(f"Original number of rows: {len(df_filtered)}, Filtered number of rows: {len(df_filtered2)}")


     LanguageCode          PageTitle  PageViews Metadata
1880           ab               1414         10        0
1897           ab  Авикипедиа:Афорум         12        0
2155          ace            Ôn_Keue         11        0
2240        ace.m            Ôn_Keue         10        0
2380           af        C.M._Stimie         18        0
Original number of rows: 9692867, Filtered number of rows: 444217


In [22]:
df_filtered2

Unnamed: 0,LanguageCode,PageTitle,PageViews,Metadata
1880,ab,1414,10,0
1897,ab,Авикипедиа:Афорум,12,0
2155,ace,Ôn_Keue,11,0
2240,ace.m,Ôn_Keue,10,0
2380,af,C.M._Stimie,18,0
...,...,...,...,...
10799488,zh.s,飛白書勢,16,0
10799539,zh.v,愛·回家之開心速遞集數列表及故事系列,14,0
10799597,zh.voy,首页,17,0
10799618,zu,Ikhasi_Elikhulu,12,0


In [23]:
# 언어 코드별 페이지 수 확인
language_counts = df_filtered2['LanguageCode'].value_counts()

In [25]:
# 문서 수가 10 이하인 언어 코드 찾기
languages_to_remove = language_counts[language_counts <= 10].index

# 해당 언어 코드를 가진 행 제거
filtered_df3 = df_filtered2[~df_filtered2['LanguageCode'].isin(languages_to_remove)]


In [26]:
filtered_df3

Unnamed: 0,LanguageCode,PageTitle,PageViews,Metadata
7564,ar,MediaWiki:إعلانات/2,15,0
7565,ar,MediaWiki:إعلانات/3,12,0
7630,ar,آل_التنين,10,0
7779,ar,أبو_سفيان_بن_حرب,17,0
8130,ar,أسينات,11,0
...,...,...,...,...
10799316,zh.s,蒼石先生文集/卷十一,12,0
10799320,zh.s,蒼石先生文集/卷十五,10,0
10799322,zh.s,蒼石先生文集/卷十六,11,0
10799478,zh.s,雪岳遺稿,20,0


In [34]:
# 인덱스를 기본값으로 재설정하고 새로운 인덱스 부여
filtered_df_reset = filtered_df3.reset_index(drop=True)

In [35]:
filtered_df_reset

Unnamed: 0,LanguageCode,PageTitle,PageViews,Metadata
0,ar,MediaWiki:إعلانات/2,15,0
1,ar,MediaWiki:إعلانات/3,12,0
2,ar,آل_التنين,10,0
3,ar,أبو_سفيان_بن_حرب,17,0
4,ar,أسينات,11,0
...,...,...,...,...
442869,zh.s,蒼石先生文集/卷十一,12,0
442870,zh.s,蒼石先生文集/卷十五,10,0
442871,zh.s,蒼石先生文集/卷十六,11,0
442872,zh.s,雪岳遺稿,20,0


In [36]:
# PageViews가 가장 높은 30개의 제목 선택
top_30_pages = filtered_df_reset.nlargest(30, 'PageViews')

print(top_30_pages)

       LanguageCode              PageTitle  PageViews Metadata
189542           fr      Spécial:Recherche      81997        0
405107           fr      Spécial:Recherche      81310        0
364752         en.m           Simone_Biles      79720        0
6579             en                      -      58787        0
325969         en.m         Jonathan_Owens      38655        0
234030           en                      -      34618        0
145434         en.m           Simone_Biles      31178        0
368530         en.m             Sunisa_Lee      27695        0
326043         en.m          Jordan_Chiles      25307        0
342427         en.m         Michael_Phelps      22194        0
322191         en.m             Jade_Carey      17369        0
365456         en.m             Snoop_Dogg      16783        0
414404           ja                 メインページ      15227        0
106248         en.m          Kamala_Harris      13240        0
200297           ja                 メインページ      13142  