<a href="https://colab.research.google.com/github/Kensuzuki95/AIF360/blob/master/Web_Scraping_Store_Info.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tomods Store Location Info
* Homapage Store Location Search Page: https://shop.tomods.jp/all

In [None]:
# Import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [22]:
url = "https://shop.tomods.jp/all"

# Send a GET request to the website
response = requests.get(url)

# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the buttons with the specified class and extract the store names
buttons = soup.find_all('button', class_='font-bold ml-1.5 text-left text-[14px] hover:opacity-50 transition-opacity')

# Find all the divs with the specified class and extract the store hours and store address
store_info = soup.find_all('div', class_='text-sm mb-3')

store_data = []
for i in range(len(buttons)):
    store_name = buttons[i].text
    store_hours = store_info[i].find('div', class_='mb-1').text
    store_address = store_info[i].find_all('div')[1].text
    store_data.append([store_name, store_hours, store_address])

# Create a pandas DataFrame from the scraped data
df = pd.DataFrame(store_data, columns=['store_name', 'store_hours', 'store_address'])

# Print the DataFrame
#df.head()

In [23]:
# Filter rows based on the condition: store_name contains 'トモズ'
df = df[df['store_name'].str.contains('トモズ')]

# Reset the DataFrame index after filtering
df = df.reset_index(drop=True)
df

Unnamed: 0,store_name,store_hours,store_address
0,薬局トモズ 成城コルティ店,営業時間外 - 営業開始時間 9:00,東京都世田谷区成城6-5-34 成城コルティ 3F
1,トモズ 白金高輪店,営業時間外 - 営業開始時間 9:00,東京都港区高輪 1丁目3-1 プレミストタワー白金高輪 1F
2,トモズ アークヒルズ店,定休日,東京都港区赤坂1-12-32アーク森ビル 2F
3,トモズ 青葉台店,営業時間外 - 営業開始時間 10:00,神奈川県横浜市青葉区青葉台1-6-14エキニア青葉台
4,トモズ 青葉台東急スクエア店,営業時間外 - 営業開始時間 10:00,神奈川県横浜市青葉区青葉台2-1-1青葉台東急スクエアNorth-1　1·2F
...,...,...,...
222,トモズ ららぽーと海老名店,営業時間外 - 営業開始時間 10:00,神奈川県海老名市扇町１３−１ららぽーと海老名 1F
223,トモズ ららぽーと湘南平塚店,営業時間外 - 営業開始時間 10:00,神奈川県平塚市天沼１０−１ららぽーと湘南平塚1F
224,薬局トモズ 両国店,定休日,東京都墨田区横網1-10-12 高安ビル
225,トモズ 六本木ヒルズ店,営業時間外 - 営業開始時間 11:00,東京都港区六本木6-10-3六本木ヒルズ ウェストウォーク1F


In [24]:
df.to_excel('/content/drive/MyDrive/EY Recruitment Documents/franchise_info/tomods.xlsx', index=False)

# Ito-Yokado Store Location Info
* Store information page link starts with https://stores.itoyokado.co.jp/XXX (= place 3 digits number instead of "XXX" to access random store pages)
  * E.g., イトーヨーカドー 武蔵小金井店 = https://stores.itoyokado.co.jp/242

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_store_details(store_number):
    # Construct the URL
    url = f"https://stores.itoyokado.co.jp/{store_number:03d}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the page exists
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Scrape the store name
        store_name_element = soup.find("span", class_="LocationName-brand")
        store_name = store_name_element.text if store_name_element else "N/A"

        # Scrape the store location
        store_location_element = soup.find("address", class_="Address-content")
        store_location = store_location_element.text if store_location_element else "N/A"

        # Scrape the store hours
        store_hours_element = soup.find("div", class_="c-hours-details")
        store_hours = store_hours_element.text.strip() if store_hours_element else "N/A"

        # Return the scraped details as a dictionary
        return {
            "Store Name": store_name,
            "Store Location": store_location,
            "Store Hours": store_hours
        }
    else:
        return None

# Scrape store details for store numbers 001 to 999
results = []
for store_number in range(1, 1000):
    store_details = scrape_store_details(store_number)
    if store_details is not None:
        results.append(store_details)

# Create a data frame from the results
df = pd.DataFrame(results)

# Print the data frame
df

          Store Name                   Store Location       Store Hours
0       イトーヨーカドー 高砂店          〒125-0054東京都葛飾区高砂3-12-5  全曜日10:00 - 22:00
1        イトーヨーカドー 柏店              〒277-0005千葉県柏市柏2-15  全曜日10:00 - 22:00
2      イトーヨーカドー 上板橋店         〒174-0071東京都板橋区常盤台4-26-1  全曜日10:00 - 22:00
3      イトーヨーカドー 相模原店      〒252-0313神奈川県相模原市南区松が枝町17-1  全曜日10:00 - 22:00
4       イトーヨーカドー 浦和店      〒330-0062埼玉県さいたま市浦和区仲町1-7-1  全曜日10:00 - 21:00
..               ...                              ...               ...
121  イトーヨーカドー 食品館川越店         〒350-0043埼玉県川越市新富町1-20-1  全曜日10:00 - 22:00
122     イトーヨーカドー 新田店        〒340-0053埼玉県草加市旭町六丁目15-30  全曜日10:00 - 21:00
123     イトーヨーカドー 朝霞店         〒351-0005埼玉県朝霞市根岸台3-20－1  全曜日09:00 - 20:00
124    イトーヨーカドー 西川口店          〒332-0021埼玉県川口市西川口2-3-5  全曜日10:00 - 21:00
125   イトーヨーカドー 丸大新潟店  〒951-8067新潟県新潟市中央区本町通り六番町1122-1  全曜日09:00 - 21:00

[126 rows x 3 columns]


In [21]:
df.to_excel('/content/drive/MyDrive/EY Recruitment Documents/franchise_info/ito_yokado.xlsx', index=False)

# Matsumoto-Kiyoshi Store Location Info

In [25]:
def scrape_store_details(store_number):
    # Construct the URL
    url = f"https://www.matsukiyo.co.jp/map?kid={store_number:08d}"

    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Scrape the store name
        store_name_element = soup.find("div", class_="storeTxt").find("h2")
        store_name = store_name_element.text.strip() if store_name_element else "N/A"

        # Scrape the store location
        store_location_element = soup.find("li", class_="iconAdd")
        store_location = store_location_element.text.strip().replace("Googleマップで見る", "") if store_location_element else "N/A"

        # Scrape the store hours
        store_hours_element = soup.find("td")
        store_hours = store_hours_element.text.strip() if store_hours_element else "N/A"

        # Return the scraped details as a dictionary
        return {
            "Store Name": store_name,
            "Store Location": store_location,
            "Store Hours": store_hours
        }
    else:
        return None

# Scrape store details for store numbers 1000000 to 3000000
results = []
for store_number in range(1000000, 3000001):
    store_details = scrape_store_details(store_number)
    if store_details is not None:
        results.append(store_details)

# Create a data frame from the results
df = pd.DataFrame(results)
df

KeyboardInterrupt: ignored

In [26]:
# Filter rows based on the condition: store_name contains 'トモズ'
df = df[df['store_name'].str.contains('マツモトキヨシ')]

# Reset the DataFrame index after filtering
df = df.reset_index(drop=True)
df

Unnamed: 0,store_name,store_hours,store_address


In [28]:
# Save the data frame to a CSV file in the specified folder
df.to_csv('/content/drive/MyDrive/EY Recruitment Documents/franchise_info/matsumoto-kiyoshi.xlsx', index=False)

# Merge the Three Dataframe

In [None]:
tomods = pd.read_excel('/content/drive/MyDrive/EY Recruitment Documents/franchise_info/tomods.xlsx')
ito_yokadado = pd.read_excel('/content/drive/MyDrive/EY Recruitment Documents/franchise_info/ito_yokado.xlsx')
matsumoto_kiyoshi = pd.read_excel('/content/drive/MyDrive/EY Recruitment Documents/franchise_info/matsumoto_kiyoshi.xlsx')