# Part 1: Web Scraping and Database Storage 

In [None]:
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Three-letter weekday abbreviation -> full weekday name
day_mapping = {
    full_name[:3]: full_name
    for full_name in (
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    )
}


def normalize_day(day):
    """
    Expand a weekday label to its full name.

    The label is stripped and capitalized, then its first three characters
    are looked up in ``day_mapping``. Labels with no matching prefix are
    returned in their stripped, capitalized form.

    Args:
        day (str | None): Weekday label such as "mon" or "Tuesday".

    Returns:
        str | None: Full weekday name, the cleaned label, or None for None input.
    """
    if day is not None:
        label = day.strip().capitalize()
        return day_mapping.get(label[:3], label)
    return None

def normalize_time(time_str):
    """
    Convert a 12-hour clock string to the canonical "HH:MM AM/PM" form.

    The input is stripped and lower-cased, and a space is inserted between a
    digit and a directly attached "am"/"pm". Parsing is attempted first with
    minutes ("%I:%M %p") and then without ("%I %p").

    Args:
        time_str (str | None): Time text such as "9:00PM" or "8 am".

    Returns:
        str | None: Formatted time, or None when the input is None or
        matches neither format.
    """
    if time_str is None:
        return None

    cleaned = re.sub(r'(\d)(am|pm)', r'\1 \2', time_str.strip().lower())

    # Try the more specific format first; fall through on any ValueError.
    for candidate_format in ("%I:%M %p", "%I %p"):
        try:
            return pd.to_datetime(cleaned, format=candidate_format).strftime("%I:%M %p")
        except ValueError:
            continue
    return None

def parse_working_hours(hours_str):
    """
    Split an opening-hours string into day range and time range.

    Example input: "Monday - Saturday, 8:00 AM – 9:00PM".

    Args:
        hours_str: Raw opening-hours text; missing or non-string values are accepted.

    Returns:
        pd.Series: [work_day_start, work_day_end, start_time, end_time].
        All four are None when the input is missing or not a string.
    """
    if pd.isna(hours_str) or not isinstance(hours_str, str):
        return pd.Series([None, None, None, None])

    # Days: prefer an explicit "X - Y" range, else a single word before a comma.
    range_hit = re.search(r'([A-Za-z]+)\s*[-–]\s*([A-Za-z]+)', hours_str)
    if range_hit is not None:
        first_day = range_hit.group(1)
        last_day = range_hit.group(2)
    else:
        lone_hit = re.search(r'([A-Za-z]+),', hours_str)
        first_day = lone_hit.group(1) if lone_hit else None
        last_day = first_day  # a single day opens and closes the range

    # Times: collect every "H[:MM] am/pm"-like token in order of appearance.
    stamps = [
        hit.group(1)
        for hit in re.finditer(r'(\d{1,2}(:\d{2})?\s*[APMapm]+)', hours_str)
    ]
    if not stamps:
        opening = closing = None
    elif len(stamps) == 1:
        # Only one time given: use it for both ends.
        opening = closing = stamps[0]
    else:
        opening, closing = stamps[0], stamps[1]

    # Canonicalize day names and time formats.
    first_day = normalize_day(first_day)
    last_day = normalize_day(last_day) if last_day else first_day
    opening = normalize_time(opening)
    closing = normalize_time(closing)

    return pd.Series([first_day, last_day, opening, closing])

def scrape_subway_data():
    """
    Scrape Subway outlets from subway.com.my and keep the Kuala Lumpur ones.

    A headless Chrome session loads the store-finder page, every element with
    class "fp_listitem" is inspected, and outlets whose address contains
    "kuala lumpur" (case-insensitive) are kept. Opening hours are taken from
    the outlet's third <p> element when present, otherwise "Unknown", and are
    split with parse_working_hours.

    Returns:
        pd.DataFrame: One row per kept outlet with columns name, address,
        work_day_start, work_day_end, start_time, end_time, latitude,
        longitude. latitude/longitude are always None here; they are left
        for a later geocoding step.
    """
    # Configure Selenium
    options = Options()
    options.add_argument("--headless")  # Run in background
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    service = Service(ChromeDriverManager().install())

    # Initialize browser
    driver = webdriver.Chrome(service=service, options=options)

    data = []
    # try/finally guarantees the browser process is shut down even when the
    # page load or element lookup raises.
    try:
        # Open website
        url = "https://subway.com.my/find-a-subway"
        driver.get(url)
        time.sleep(5)  # Wait for JavaScript to load

        # Extract store data
        outlets = driver.find_elements(By.CLASS_NAME, "fp_listitem")

        for outlet in outlets:
            # Best effort per outlet: a malformed entry is reported and skipped
            # so it cannot abort the whole scrape.
            try:
                # Extract data: name, address, hours
                name = outlet.find_element(By.TAG_NAME, "h4").text.strip()
                address = (
                    outlet.find_element(By.CLASS_NAME, "infoboxcontent")
                    .find_element(By.TAG_NAME, "p")
                    .text.strip()
                )

                # Filter only Kuala Lumpur outlets
                if "kuala lumpur" not in address.lower():
                    continue

                # Extract opening hours (if available)
                paragraphs = outlet.find_elements(By.TAG_NAME, "p")
                hours = paragraphs[2].text.strip() if len(paragraphs) > 2 else "Unknown"

                # Parse working hours
                work_day_start, work_day_end, start_time, end_time = parse_working_hours(hours)
                print(f"{hours} -> ({work_day_start}, {work_day_end}, {start_time}, {end_time})")

                # Store data; coordinates are filled in later by geocoding
                data.append({
                    "name": name,
                    "address": address,
                    "work_day_start": work_day_start,
                    "work_day_end": work_day_end,
                    "start_time": start_time,
                    "end_time": end_time,
                    "latitude": None,
                    "longitude": None,
                })
            except Exception as e:
                print(f"Error: {e}")
    finally:
        # Close the browser
        driver.quit()

    # Convert to DataFrame
    return pd.DataFrame(data)

# Run the scraper (launches headless Chrome and hits the live site) and
# preview the first rows of the resulting DataFrame
df = scrape_subway_data()
df.head()

Monday - Sunday, 8:00 AM - 8:00 PM -> (Monday, Sunday, 08:00 AM, 08:00 PM)
Monday - Saturday, 8:00 AM – 9:00PM -> (Monday, Saturday, 08:00 AM, 09:00 PM)
Monday - Saturday, 8:00 AM – 8:30PM -> (Monday, Saturday, 08:00 AM, 08:30 PM)
Monday - Sunday, 10:00 AM - 10:00 PM -> (Monday, Sunday, 10:00 AM, 10:00 PM)
Monday - Sunday, 8:00 AM - 10:00 PM -> (Monday, Sunday, 08:00 AM, 10:00 PM)
Monday - Sunday, 10:00 AM - 10:00 PM -> (Monday, Sunday, 10:00 AM, 10:00 PM)
Monday - Saturday, 8:00 AM – 9:00PM -> (Monday, Saturday, 08:00 AM, 09:00 PM)
Monday - Friday, 8:00 AM – 6:30PM -> (Monday, Friday, 08:00 AM, 06:30 PM)
Monday - Sunday, 9:30 AM - 9:30 PM -> (Monday, Sunday, 09:30 AM, 09:30 PM)
Monday - Sunday, 8:30 AM - 9:00 PM -> (Monday, Sunday, 08:30 AM, 09:00 PM)
Monday - Friday, 8:00 AM – 9:00PM -> (Monday, Friday, 08:00 AM, 09:00 PM)
Monday - Sunday, 10:15 AM - 9:30 PM -> (Monday, Sunday, 10:15 AM, 09:30 PM)
Monday - Sunday, 8:00 AM - 10:00 PM -> (Monday, Sunday, 08:00 AM, 10:00 PM)
Monday - Su

Unnamed: 0,name,address,work_day_start,work_day_end,start_time,end_time,latitude,longitude
0,Subway Menara UOA Bangsar,"Jalan Bangsar Utama 1, Unit 1-2-G, Menara UOA ...",Monday,Sunday,08:00 AM,08:00 PM,,
1,Subway Jln Pinang,"G9, Wisma UOA II, 19, Jalan Pinang, Kuala Lump...",Monday,Saturday,08:00 AM,09:00 PM,,
2,Subway UOA Damansara,"Unit 50-G-5, Ground Floor, Wisma UOA Damansara...",Monday,Saturday,08:00 AM,08:30 PM,,
3,Subway Mont Kiara,"E-01-16 ,Block E, Plaza Mont Kiara, 2 Jalan Ki...",Monday,Sunday,10:00 AM,10:00 PM,,
4,Subway Avenue K,"Lot UC-8 & 9, Upper Concourse Level, Avenue K,...",Monday,Sunday,08:00 AM,10:00 PM,,


In [3]:
import sqlite3

def save_to_database(df, db_name="subway.db"):
    """
    Saves the DataFrame to a SQLite database.

    The ``outlets`` table is created if it does not exist and the rows of
    ``df`` are appended to it, so calling this twice with the same data
    stores the rows twice.

    Args:
        df (pd.DataFrame): The DataFrame containing store data.
        db_name (str): Name of the database file.
    """
    conn = sqlite3.connect(db_name)
    # sqlite3's connection context manager only commits or rolls back; it does
    # not close the connection, so close it explicitly in ``finally``.
    try:
        with conn:
            # Create table if not exists
            conn.execute("""
            CREATE TABLE IF NOT EXISTS outlets (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT,
                address TEXT,
                work_day_start TEXT,
                work_day_end TEXT,
                start_time TEXT,
                end_time TEXT,
                latitude REAL,
                longitude REAL
            )
            """)

            # Insert data
            df.to_sql("outlets", conn, if_exists="append", index=False)
    finally:
        conn.close()

    print(f"✅ Data saved to {db_name}")

# Persist the scraped outlets; an empty DataFrame means no outlet passed the
# Kuala Lumpur filter, so nothing is written
if not df.empty:
    save_to_database(df)
else:
    print("❌ No Kuala Lumpur outlets found.")

✅ Data saved to subway.db


# Part 2: Geocoding

In [6]:
import googlemaps
import os
from dotenv import load_dotenv

# Load environment variables (python-dotenv; typically from a local .env file)
load_dotenv()

# Read the Google Maps API key from the GOOGLE_MAP_API environment variable;
# os.getenv returns None when the variable is not set
GOOGLE_MAP_API = os.getenv('GOOGLE_MAP_API')

# Initialize the Google Maps client used for geocoding
gmaps = googlemaps.Client(key=GOOGLE_MAP_API)

def geocode_address(address, retries=3):
    """
    Retrieves geographical coordinates (latitude, longitude) for a given address using Google Maps.

    Only errors raised during the lookup are retried. An empty result list
    means the service found no match for the address, so the function returns
    immediately instead of repeating the same request.

    Args:
        address (str): The address to geocode.
        retries (int): Maximum number of attempts when the lookup raises.

    Returns:
        tuple: (latitude, longitude) or (None, None) if failed.
    """
    for attempt in range(retries):
        try:
            results = gmaps.geocode(address)
            if not results:
                # No match: retrying the identical query would not help.
                print(f"❌ No geocode found for: {address}")
                return None, None

            location = results[0]['geometry']['location']
            lat, lon = location['lat'], location['lng']
            print(f"✅ Geocoded: {address} → ({lat}, {lon})")
            return lat, lon
        except Exception as e:
            print(f"⚠️ Error: {e} | Retrying ({attempt + 1}/{retries})...")
            # Back off only if another attempt will follow.
            if attempt + 1 < retries:
                time.sleep(2)

    return None, None  # If all retries fail

def update_database_with_geocodes(db_name="subway.db"):
    """
    Updates the SQLite database with latitude & longitude for outlets with missing coordinates.

    Rows of ``outlets`` whose latitude or longitude is NULL are geocoded by
    address. Each successful lookup is committed immediately so completed
    work is kept if a later row fails. Rows that cannot be geocoded are left
    unchanged.

    Args:
        db_name (str): Name of the database file.
    """
    conn = sqlite3.connect(db_name)
    # try/finally ensures the connection is closed on every exit path.
    try:
        cursor = conn.cursor()

        # Fetch outlets where latitude or longitude is missing
        cursor.execute("SELECT id, address FROM outlets WHERE latitude IS NULL OR longitude IS NULL")
        missing_outlets = cursor.fetchall()

        if not missing_outlets:
            print("✅ All outlets already have geocodes.")
            return

        print(f"🔍 Found {len(missing_outlets)} outlets with missing coordinates. Updating...")
        for outlet_id, address in missing_outlets:
            lat, lon = geocode_address(address)

            # Compare against None explicitly: 0.0 is a valid coordinate.
            if lat is not None and lon is not None:
                cursor.execute(
                    "UPDATE outlets SET latitude = ?, longitude = ? WHERE id = ?",
                    (lat, lon, outlet_id),
                )
                conn.commit()
    finally:
        conn.close()

    print("🚀 Database updated with geocodes.")

# Update missing geocodes in the default database file (subway.db)
update_database_with_geocodes()

🔍 Found 59 outlets with missing coordinates. Updating...
✅ Geocoded: Jalan Bangsar Utama 1, Unit 1-2-G, Menara UOA Bangsar, Kuala Lumpur, 59000 → (3.126969, 101.6768848)
✅ Geocoded: G9, Wisma UOA II, 19, Jalan Pinang, Kuala Lumpur, 50450 → (3.1525875, 101.712256)
✅ Geocoded: Unit 50-G-5, Ground Floor, Wisma UOA Damansara, No. 50, Jalan Dungun, Kuala Lumpur, 50490 → (3.1517288, 101.6660061)
✅ Geocoded: E-01-16 ,Block E, Plaza Mont Kiara, 2 Jalan Kiara, Mont Kiara, Kuala Lumpur, 50480 → (3.1658129, 101.6510419)
✅ Geocoded: Lot UC-8 & 9, Upper Concourse Level, Avenue K, No. 156, Jalan Ampang, Kuala Lumpur, 50450 → (3.159418, 101.7134125)
✅ Geocoded: LG-08A, Berjaya Times Square, No. 1, Jalan Imbi, Kuala Lumpur, 55100 → (3.1425177, 101.7103847)
✅ Geocoded: Ground Floor, Unit 01-02, Cap Square Tower, 10 Jalan Munshi Abdullah, Kuala Lumpur, 50100 → (3.1542071, 101.6987835)
✅ Geocoded: Faculty of Dentistry, Level 1, Block B, University of Malaya, Kuala Lumpur, 50603 → (3.1219268, 101.6569942)

In [7]:
def print_sample_outlets(db_name="subway.db"):
    """
    Loads the ``outlets`` table and shows its first rows.

    Args:
        db_name (str): Name of the database file.

    Returns:
        pd.DataFrame: Every row of the ``outlets`` table.
    """
    conn = sqlite3.connect(db_name)
    # Close the connection even if the query fails.
    try:
        df = pd.read_sql_query("SELECT * FROM outlets", conn)
    finally:
        conn.close()

    # ``display`` is injected by IPython/Jupyter; fall back to print so the
    # function also works in a plain Python interpreter.
    try:
        show = display
    except NameError:
        show = print

    print("\n🔍 Sample Outlets from Database:")
    show(df.head())
    return df

# Reload the outlets table from subway.db and keep it for later cells
df_updated = print_sample_outlets()


🔍 Sample Outlets from Database:


Unnamed: 0,id,name,address,work_day_start,work_day_end,start_time,end_time,latitude,longitude
0,1,Subway Menara UOA Bangsar,"Jalan Bangsar Utama 1, Unit 1-2-G, Menara UOA ...",Monday,Sunday,08:00 AM,08:00 PM,3.126969,101.676885
1,2,Subway Jln Pinang,"G9, Wisma UOA II, 19, Jalan Pinang, Kuala Lump...",Monday,Saturday,08:00 AM,09:00 PM,3.152588,101.712256
2,3,Subway UOA Damansara,"Unit 50-G-5, Ground Floor, Wisma UOA Damansara...",Monday,Saturday,08:00 AM,08:30 PM,3.151729,101.666006
3,4,Subway Mont Kiara,"E-01-16 ,Block E, Plaza Mont Kiara, 2 Jalan Ki...",Monday,Sunday,10:00 AM,10:00 PM,3.165813,101.651042
4,5,Subway Avenue K,"Lot UC-8 & 9, Upper Concourse Level, Avenue K,...",Monday,Sunday,08:00 AM,10:00 PM,3.159418,101.713413
