# Imports

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import re

# Extract Readings From HTML

In [None]:
# Assuming `html` contains the raw HTML you extracted earlier
def extract_table(html):
    soup = BeautifulSoup(html, 'html.parser')

    # Find the table in the HTML (by id, class, etc.)
    table = soup.find('table', {'id': '__BVID__59'})

    # Extract the table headers
    headers = [th.get_text(strip=True) for th in table.find_all('th')]

    # Extract the rows of the table
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip the header row
        cols = tr.find_all('td')
        row = [col.get_text(strip=True) for col in cols]
        rows.append(row)

    # Create a DataFrame
    df = pd.DataFrame(rows, columns=headers)
    return df

In [11]:
big_table = pd.DataFrame()

# Loop through all the files in the "data" directory
for file in os.listdir("data"):
    if file.endswith(".html"):  # Ensure we're only processing HTML files
        with open(f"data/{file}", "r") as f:
            html = f.read()
            df = extract_table(html)
            
            # Concatenate the current dataframe to the big table
            big_table = pd.concat([big_table, df], ignore_index=True)

In [12]:
big_table

Unnamed: 0,No.,วันที่,ช่วงเวลา,02t,03t,11t,12t,50t,52t,53t,...,bkp85t,19t,o63,bkp90t,bkp88t,bkp86t,bkp94t,bkp84t,bkp83t,bkp63t
0,1,2025-01-19,01:00,55.5,51.7,47.1,44.2,50.1,54.5,49.9,...,62.1,55.7,54.1,50.1,66.9,65.2,48.2,52.7,51,49.6
1,2,2025-01-19,02:00,60.3,56.6,48.1,48.1,54,60.7,55.5,...,63.8,45.9,55,52.6,63.1,64.1,51.5,55.2,52,54.7
2,3,2025-01-19,03:00,57.9,58.1,46.2,45.8,55.3,56,56.7,...,69.8,40.5,47.3,54.4,68,78.9,56.4,58.8,49,56.9
3,4,2025-01-19,04:00,55.2,55.3,40.3,43.1,48.6,55.1,45.5,...,77.6,43.4,45.2,53.2,69.1,73.7,50.8,60.2,45,56.1
4,5,2025-01-19,05:00,60.7,60.3,40.9,44.5,47.9,58.4,45.1,...,92,50.8,53.1,70.5,72,74.6,49.1,76.5,41,48.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,176,2025-01-26,08:00,44.5,44.9,40.8,39.8,48.5,43.2,49.6,...,53,37.9,42.6,52.6,52,39.7,53.1,45.1,43,45.8
730,177,2025-01-26,09:00,48.6,48.3,46,44.4,47.2,48.9,51.8,...,53.3,45.7,47.7,35.6,51,37.4,53,43.1,51,49.8
731,178,2025-01-26,10:00,47.7,47.3,47.7,46.8,49.4,47,52.6,...,55.1,44.9,48.9,32.6,49,38.7,52,52.5,52,46.2
732,179,2025-01-26,11:00,49.7,49.5,53.9,53.5,53.5,47.7,54.9,...,56.5,38.1,53.9,36.4,50.7,39.9,48.5,39.7,49,46.9


In [22]:
df = big_table.copy()
df["No."] = df["No."].astype(int)
df = df.set_index("No.")
df.sort_index(inplace=True)
df

Unnamed: 0_level_0,วันที่,ช่วงเวลา,02t,03t,11t,12t,50t,52t,53t,54t,...,bkp85t,19t,o63,bkp90t,bkp88t,bkp86t,bkp94t,bkp84t,bkp83t,bkp63t
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2025-01-19,01:00,55.5,51.7,47.1,44.2,50.1,54.5,49.9,44.9,...,62.1,55.7,54.1,50.1,66.9,65.2,48.2,52.7,51,49.6
2,2025-01-19,02:00,60.3,56.6,48.1,48.1,54,60.7,55.5,55.1,...,63.8,45.9,55,52.6,63.1,64.1,51.5,55.2,52,54.7
3,2025-01-19,03:00,57.9,58.1,46.2,45.8,55.3,56,56.7,46.8,...,69.8,40.5,47.3,54.4,68,78.9,56.4,58.8,49,56.9
4,2025-01-19,04:00,55.2,55.3,40.3,43.1,48.6,55.1,45.5,40.5,...,77.6,43.4,45.2,53.2,69.1,73.7,50.8,60.2,45,56.1
5,2025-01-19,05:00,60.7,60.3,40.9,44.5,47.9,58.4,45.1,41.4,...,92,50.8,53.1,70.5,72,74.6,49.1,76.5,41,48.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730,2025-02-18,10:00,25.5,27.2,24.9,24.9,28.7,25,29.9,28.7,...,26.3,24.3,24.9,27.1,21,20.2,17.1,52.6,25,21.9
731,2025-02-18,11:00,28.1,26.7,21.1,21.9,25.4,27.5,24.2,27.3,...,27.4,23.5,22.5,19.8,28.8,16.6,17,22.2,18,29.5
732,2025-02-18,12:00,,24.3,20.2,20.9,25,25.1,24.3,24.1,...,26.5,23.7,20.6,,25.1,35.4,16.1,22.1,20,28.1
733,2025-02-18,13:00,,,19.7,19.2,22.8,27.9,22.9,22.5,...,39.4,21.8,19.7,32.6,35.8,25.2,25.7,21,24,34.5


In [23]:
df.to_csv("data.csv")

# Get all the weather station address

In [None]:
# Read the HTML file
with open('data/27.html', 'r', encoding='utf-8') as file:
      content = file.read()

soup = BeautifulSoup(content, 'html.parser')

# Find all <text> tags inside <g> elements
text_elements = soup.find_all("text")

# Updated regex to capture station codes like "bkp100t", "22t", etc.
pattern = re.compile(r"([a-zA-Z]*\d+[a-zA-Z]*) \((.+)\)")

data = []

for text in text_elements:
    match = pattern.search(text.text.strip())
    if match:
        station_code = match.group(1)
        thai_address = match.group(2)
        data.append((station_code, thai_address))

# Output the extracted list
print(data)


[('02t', 'มหาวิทยาลัยราชภัฏบ้านสมเด็จเจ้าพระยา แขวงหิรัญรูจี เขตธนบุรี, กรุงเทพฯ'), ('03t', 'ริมถนนทางหลวงหมายเลข 3902  ริมถนนกาญจนาภิเษก เขตบางขุนเทียน, กรุงเทพฯ'), ('11t', 'การเคหะชุมชนห้วยขวาง  แขวงดินแดง เขตดินแดง, กรุงเทพฯ'), ('12t', 'โรงเรียนนนทรีวิทยา  แขวงช่องนนทรี เขตยานนาวา, กรุงเทพฯ'), ('50t', 'โรงพยาบาลจุฬาลงกรณ์ ริมถนนพระราม 4 เขตปทุมวัน, กรุงเทพฯ'), ('52t', 'การไฟฟ้าย่อยธนบุรี  ริมถนนอินทรพิทักษ์ เขตธนบุรี, กรุงเทพฯ'), ('53t', 'สถานีตำรวจนครบาลโชคชัย  ริมถนนลาดพร้าว เขตวังทองหลาง, กรุงเทพฯ'), ('54t', 'การเคหะชุมชนดินแดง  ริมถนนดินแดง เขตดินแดง, กรุงเทพฯ'), ('59t', 'กรมประชาสัมพันธ์ แขวงพญาไท เขตพญาไท, กรุงเทพฯ'), ('61t', 'โรงเรียนบดินทรเดชา (สิงห์ สิงหเสนี)  แขวงพลับพลา เขตวังทองหลาง, กรุงเทพฯ'), ('bkp100t', 'สำนักงานเขตบึงกุ่ม แขวงคลองกุ่ม เขตบึงกุ่ม'), ('bkp101t', 'สำนักงานเขตคลองสามวา เขตคลองสามวา, กรุงเทพฯ'), ('bkp102t', 'สำนักงานเขตจอมทอง เขตจอมทอง, กรุงเทพฯ'), ('bkp103t', 'สำนักงานเขตบางพลัด ริมถนนจรัญสนิทวงศ์ เขตบางพลัด, กรุงเทพฯ'), ('bkp104t', 'สำนักงานเขตบางแค เข

In [58]:
addr_df = pd.DataFrame({"Code": [d[0] for d in data], "Address": [d[1] for d in data]})
addr_df

Unnamed: 0,Code,Address
0,02t,มหาวิทยาลัยราชภัฏบ้านสมเด็จเจ้าพระยา แขวงหิรัญ...
1,03t,ริมถนนทางหลวงหมายเลข 3902 ริมถนนกาญจนาภิเษก เ...
2,11t,"การเคหะชุมชนห้วยขวาง แขวงดินแดง เขตดินแดง, กร..."
3,12t,"โรงเรียนนนทรีวิทยา แขวงช่องนนทรี เขตยานนาวา, ..."
4,50t,"โรงพยาบาลจุฬาลงกรณ์ ริมถนนพระราม 4 เขตปทุมวัน,..."
...,...,...
87,bkp86t,ถนนพุทธมณฑลสาย 1 ตัดกับถนนบรมราชชนนี ริมถนนพุท...
88,bkp94t,"สำนักงานเขตคลองเตย แขวงคลองเตย เขตคลองเตย, กรุ..."
89,bkp84t,"สี่แยกท่าพระ ริมถนนแยกท่าพระ เขตบางกอกใหญ่, กร..."
90,bkp83t,ห้องสมุดใต้สะพานสมเด็จพระเจ้าตากสิน ริมถนนเจริ...


In [56]:
df = pd.read_csv("data.csv", index_col="No.")
df

Unnamed: 0_level_0,วันที่,ช่วงเวลา,02t,03t,11t,12t,50t,52t,53t,54t,...,bkp85t,19t,o63,bkp90t,bkp88t,bkp86t,bkp94t,bkp84t,bkp83t,bkp63t
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2025-01-19,01:00,55.5,51.7,47.1,44.2,50.1,54.5,49.9,44.9,...,62.1,55.7,54.1,50.1,66.9,65.2,48.2,52.7,51.0,49.6
2,2025-01-19,02:00,60.3,56.6,48.1,48.1,54.0,60.7,55.5,55.1,...,63.8,45.9,55.0,52.6,63.1,64.1,51.5,55.2,52.0,54.7
3,2025-01-19,03:00,57.9,58.1,46.2,45.8,55.3,56.0,56.7,46.8,...,69.8,40.5,47.3,54.4,68.0,78.9,56.4,58.8,49.0,56.9
4,2025-01-19,04:00,55.2,55.3,40.3,43.1,48.6,55.1,45.5,40.5,...,77.6,43.4,45.2,53.2,69.1,73.7,50.8,60.2,45.0,56.1
5,2025-01-19,05:00,60.7,60.3,40.9,44.5,47.9,58.4,45.1,41.4,...,92.0,50.8,53.1,70.5,72.0,74.6,49.1,76.5,41.0,48.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730,2025-02-18,10:00,25.5,27.2,24.9,24.9,28.7,25.0,29.9,28.7,...,26.3,24.3,24.9,27.1,21.0,20.2,17.1,52.6,25.0,21.9
731,2025-02-18,11:00,28.1,26.7,21.1,21.9,25.4,27.5,24.2,27.3,...,27.4,23.5,22.5,19.8,28.8,16.6,17.0,22.2,18.0,29.5
732,2025-02-18,12:00,,24.3,20.2,20.9,25.0,25.1,24.3,24.1,...,26.5,23.7,20.6,,25.1,35.4,16.1,22.1,20.0,28.1
733,2025-02-18,13:00,,,19.7,19.2,22.8,27.9,22.9,22.5,...,39.4,21.8,19.7,32.6,35.8,25.2,25.7,21.0,24.0,34.5


In [60]:
set(df.columns) - set(addr_df["Code"])

{'ช่วงเวลา', 'วันที่'}

In [59]:
addr_df.to_csv("addr.csv")

# GG Maps

In [62]:
import pandas as pd
import requests
import time

def get_lat_long(address):
    """Fetch latitude and longitude using Nominatim (OpenStreetMap)."""
    # URL for Nominatim geocoding API
    url = f'https://nominatim.openstreetmap.org/search?q={address}&format=json'
    
    # Make the request to the Nominatim API
    response = requests.get(url)
    
    # If response is successful and results are found
    if response.status_code == 200:
        data = response.json()
        if data:
            lat = data[0]['lat']
            lon = data[0]['lon']
            return lat, lon
    return None, None  # Return None if not found or error occurs

# Example DataFrame with addresses
df = pd.DataFrame({'address': ['New York, USA', 'Eiffel Tower, Paris', 'Invalid Place']})

# Apply the geocoding function and add latitude and longitude columns
df[['latitude', 'longitude']] = df['address'].apply(lambda x: pd.Series(get_lat_long(x)))

# Add a small delay to comply with the rate limit (1 request per second)
time.sleep(1)

# Display the updated DataFrame
print(df)


               address latitude longitude
0        New York, USA     None      None
1  Eiffel Tower, Paris     None      None
2        Invalid Place     None      None
