# 実行環境の確認

このノートブックは Google Colaboratory 上で実行しています。以下は実行時点の OS・Python・主要ライブラリのバージョンです。

In [148]:
# Show the Linux distribution and release of the runtime (shell escape).
!lsb_release -a

No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.3 LTS
Release:	22.04
Codename:	jammy


In [149]:
# Show the version of the `python` executable found on PATH.
!python --version

Python 3.10.12


In [150]:
# List installed versions of the packages this notebook relies on.
# grep matches substrings, so related packages (e.g. geopandas, pandas-gbq)
# show up in the output as well.
!pip list | grep -e "lxml" -e "html5lib" -e "beautifulsoup4" -e "pandas" -e "gspread" -e "oauth2client"

beautifulsoup4                   4.11.2
geopandas                        0.13.2
gspread                          3.4.2
gspread-dataframe                3.3.1
html5lib                         1.1
lxml                             4.9.4
oauth2client                     4.1.3
pandas                           1.5.3
pandas-datareader                0.10.0
pandas-gbq                       0.19.2
pandas-stubs                     1.5.3.230304
sklearn-pandas                   2.2.0


# コード部分

## スクレイピング・CSV出力

In [151]:
# Move into the repository directory and create output/ if it is missing.
# NOTE(review): the path is under /content/drive, so this presumably requires
# Google Drive to be mounted beforehand — confirm.
%cd /content/drive/MyDrive/repositories/pokedex-records-collector
%mkdir -p output

/content/drive/MyDrive/repositories/pokedex-records-collector


In [152]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [153]:
# Prefix of the wiki's article URLs; page titles are appended to it when
# fetching, and relative "/wiki/..." links are rewritten to start with it.
BASE_URL = "https://wiki.xn--rckteqa2e.com/wiki/" # Pokemon Wiki
# Destination CSV for the collected records (inside the repository's output/ directory).
OUTPUT_CSV_PATH = "/content/drive/MyDrive/repositories/pokedex-records-collector/output/collection_result.csv"

In [154]:
# --- Scraping -----------------------------------------------------------
# Fetch the Pokemon list page. A timeout keeps the cell from hanging forever,
# and raise_for_status() stops us from parsing an HTTP error page as if it
# were the real list.
r = requests.get(BASE_URL + "ポケモン一覧", timeout=30)
r.raise_for_status()
# Pass the raw bytes (.content) so BeautifulSoup detects the encoding itself;
# this avoids mojibake when the text is printed.
whole_soup = BeautifulSoup(r.content, "html.parser")

pokedex_table = whole_soup.select_one("table[class='bluetable c sortable']")
if pokedex_table is None:
  # Without this guard a layout change would silently produce an empty CSV.
  raise RuntimeError("Pokedex table not found; the page layout may have changed.")

# Work on a standalone copy of the table so whole_soup stays untouched, then
# turn every link into an absolute URL so it can be opened directly.
pokedex_soup = BeautifulSoup(str(pokedex_table), "html.parser")
for anchor in pokedex_soup.select("a[href]"):
  anchor["href"] = urljoin(BASE_URL, anchor["href"])
pokedex_records = pokedex_soup.select("tr")[1:] # skip the table header row


collection_results = []
for record in pokedex_records:
  td_tags = record.select("td")
  if len(td_tags) < 2:
    # Rows without both a number cell and a name cell carry no record.
    continue

  # Drop surrounding whitespace first, then the zero padding ("0001" -> "1").
  number = td_tags[0].text.strip().lstrip("0")
  if number == "":
    # Rows whose number cell is empty (or all zeros) are not Pokedex entries.
    continue

  name = td_tags[1].text.strip()
  link = td_tags[1].select_one("a")
  # A name cell without a link yields an empty URL instead of crashing.
  detail_url = link.get("href", "") if link is not None else ""

  collection_results.append((number, name, detail_url))
print("Finished scraping.")
print()


# --- Preprocessing ------------------------------------------------------
# Final column order: number, name, registered flag, detail page, remarks.
df = pd.DataFrame(collection_results, columns=["図鑑No.", "名前", "詳細ページ"])
df.insert(2, "登録済み？", "FALSE")
df.insert(4, "備考", "")
print("Finished preprocessing.")
display(df)
print()


# --- CSV output ---------------------------------------------------------
# utf_8_sig writes a BOM so that Excel shows the Japanese text correctly.
df.to_csv(OUTPUT_CSV_PATH, encoding="utf_8_sig", index=False)
print("Finished writing to csv.")

Finished scraping.

Finished preprocessing.


Unnamed: 0,図鑑No.,名前,登録済み？,詳細ページ,備考
0,1,フシギダネ,FALSE,https://wiki.xn--rckteqa2e.com/wiki/%E3%83%95%...,
1,2,フシギソウ,FALSE,https://wiki.xn--rckteqa2e.com/wiki/%E3%83%95%...,
2,3,フシギバナ,FALSE,https://wiki.xn--rckteqa2e.com/wiki/%E3%83%95%...,
3,4,ヒトカゲ,FALSE,https://wiki.xn--rckteqa2e.com/wiki/%E3%83%92%...,
4,5,リザード,FALSE,https://wiki.xn--rckteqa2e.com/wiki/%E3%83%AA%...,
...,...,...,...,...,...
1107,1020,ウガツホムラ,FALSE,https://wiki.xn--rckteqa2e.com/wiki/%E3%82%A6%...,
1108,1021,タケルライコ,FALSE,https://wiki.xn--rckteqa2e.com/wiki/%E3%82%BF%...,
1109,1022,テツノイワオ,FALSE,https://wiki.xn--rckteqa2e.com/wiki/%E3%83%86%...,
1110,1023,テツノカシラ,FALSE,https://wiki.xn--rckteqa2e.com/wiki/%E3%83%86%...,



Finished writing to csv.
