In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Function to get coordinates from place name
def get_coordinates(place_name, timeout=10):
    """Geocode a place name with the Nominatim search API.

    Args:
        place_name: Free-form query string, sent as the ``q`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(lat, lon)`` tuple of floats taken from the first search hit, or
        ``(None, None)`` when the search returns no results.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    base_url = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": place_name,
        "format": "json",
        "limit": 1
    }
    # Nominatim's usage policy asks clients to send an identifying User-Agent;
    # the library default is generic and may be rejected.
    headers = {"User-Agent": "metro-manila-malls-notebook/1.0"}
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing JSON/lookup error below.
    response.raise_for_status()
    data = response.json()
    if not data:
        return None, None
    # Nominatim returns coordinates as strings; convert them to floats
    first_hit = data[0]
    return float(first_hit["lat"]), float(first_hit["lon"])

# Function to format polygon points as a string in the required format
def format_polygon_points(polygon_points):
    """Render ``(lat, lon)`` pairs as a ``POLYGON ((lon lat, ...))`` string.

    Args:
        polygon_points: Iterable of ``(lat, lon)`` pairs, in ring order.

    Returns:
        A string of the form ``POLYGON ((lon lat, lon lat, ...))``. Each pair
        is emitted longitude first; the points are written exactly as given
        (no ring closing or validation is performed).
    """
    coordinate_pairs = []
    for latitude, longitude in polygon_points:
        # Output order is "lon lat", i.e. swapped relative to the input pairs
        coordinate_pairs.append("{} {}".format(longitude, latitude))
    ring = ", ".join(coordinate_pairs)
    return "POLYGON ((" + ring + "))"

# Function to get polygon points from OpenStreetMap
def get_polygon_points(place_name, timeout=60):
    """Look up a named OpenStreetMap way and return its outline as a POLYGON string.

    Args:
        place_name: Value of the OSM ``name`` tag to search for.
        timeout: Seconds to wait for the Overpass server before giving up.

    Returns:
        The string produced by ``format_polygon_points`` for the first returned
        way that carries geometry, or ``None`` when no such way exists or the
        lookup fails (a warning is emitted in the failure case).
    """
    import warnings

    overpass_url = "https://overpass-api.de/api/interpreter"
    # Escape backslashes and double quotes so a name cannot terminate the quoted
    # QL string early and corrupt the query.
    safe_name = place_name.replace("\\", "\\\\").replace('"', '\\"')
    # Query ways directly: the code below only accepts elements of type "way",
    # whereas an `area[...]` query yields elements of type "area".
    overpass_query = f"""
    [out:json];
    // Query for ways carrying the place name
    way["name"="{safe_name}"];
    // Output the ways together with their node coordinates
    out geom;
    """
    try:
        # Form-encode the query under the "data" key so non-ASCII names are
        # transmitted unambiguously instead of as a raw string body.
        response = requests.post(overpass_url, data={"data": overpass_query}, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # One failed lookup should not abort a batch run; report it and let the
        # caller treat the place as "not found".
        warnings.warn(f"Overpass lookup failed for {place_name!r}: {exc}")
        return None

    elements = data.get("elements", []) if isinstance(data, dict) else []
    # Scan every element rather than only the first one
    for element in elements:
        if element.get("type") == "way" and element.get("geometry"):
            # Extract polygon points
            polygon_points = [(node["lat"], node["lon"]) for node in element["geometry"]]
            return format_polygon_points(polygon_points)
    return None


In [None]:
# Scrape the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Metro_Manila"
# Send an identifying User-Agent and bound the wait: requests has no default
# timeout, and generic client identifiers may be refused by the server.
response = requests.get(
    url,
    headers={"User-Agent": "metro-manila-malls-notebook/1.0"},
    timeout=30,
)
# Stop here on an HTTP error instead of parsing an error page and failing later
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")


def find_section_table(page, section_id):
    """Return the first sortable wikitable that follows a section anchor.

    Args:
        page: Parsed BeautifulSoup document to search.
        section_id: Value of the ``id`` attribute marking the section heading.

    Returns:
        The first ``<table>`` after the anchor whose classes include both
        ``wikitable`` and ``sortable``.

    Raises:
        ValueError: If the anchor, or a matching table after it, is not found.
    """
    # Match on the id alone so the lookup works whether the id is carried by a
    # <span> inside the heading or by the heading tag itself.
    anchor = page.find(id=section_id)
    if anchor is None:
        raise ValueError(f"Section anchor '{section_id}' not found in page")
    for table in anchor.find_all_next("table"):
        # Compare class sets so additional classes on the table do not hide it
        if {"wikitable", "sortable"} <= set(table.get("class") or []):
            return table
    raise ValueError(f"No sortable wikitable found after section '{section_id}'")


# Find the table containing major shopping centers information
major_shopping_centers_table = find_section_table(soup, "Major_shopping_centers")

# Find the table containing community malls information
community_malls_table = find_section_table(soup, "Community_malls")

# Find the table containing lifestyle malls information
lifestyle_malls_table = find_section_table(soup, "Lifestyle_malls")

In [None]:
def extract_mall_rows(table, location_col, developer_col):
    """Collect ``(name, location, developer)`` tuples from one mall table.

    Args:
        table: Parsed ``<table>`` element to read.
        location_col: Index of the ``<td>`` cell holding the location.
        developer_col: Index of the ``<td>`` cell holding the developer.

    Returns:
        A list of ``(name, location, developer)`` string tuples, one per data
        row. The name is always read from the first ``<td>`` cell.
    """
    required_cells = max(location_col, developer_col) + 1
    records = []
    for row in table.find_all("tr")[1:]:  # Skip header row
        cells = row.find_all("td")
        # Rows with no <td> cells, or too few to hold the requested columns,
        # cannot be read as a mall record; skip them instead of raising IndexError.
        if len(cells) < required_cells:
            continue
        name = cells[0].text.strip()
        location = cells[location_col].text.strip()
        # Keep only the text before the first "[" (drops footnote markers like "[1]")
        developer = cells[developer_col].text.strip().split("[")[0].strip()
        records.append((name, location, developer))
    return records


# Extract data from major shopping centers table (location/developer in cells 2/3)
major_shopping_centers_data = extract_mall_rows(major_shopping_centers_table, location_col=2, developer_col=3)

# Extract data from community malls table (location/developer in cells 1/2)
community_mall_data = extract_mall_rows(community_malls_table, location_col=1, developer_col=2)

# Extract data from lifestyle malls table (location/developer in cells 1/2)
lifestyle_mall_data = extract_mall_rows(lifestyle_malls_table, location_col=1, developer_col=2)

# Reference lists used to normalise locations and flag major developers

cities = ["Manila", "Caloocan", "Las Piñas", "Makati", "Malabon", "Mandaluyong", "Marikina", 
          "Muntinlupa", "Navotas", "Parañaque", "Pasay", "Pasig", "Quezon City", "San Juan", "Taguig", "Valenzuela"]

developers = ["Ayala Land", "SM Prime Holdings", "Robinsons Land", "Filinvest Land", "Megaworld Corporation"]

MALL_COLUMNS = ['mall_name', 'location', 'developer']
df_major_shopping_centers = pd.DataFrame(major_shopping_centers_data, columns=MALL_COLUMNS)
df_community_mall = pd.DataFrame(community_mall_data, columns=MALL_COLUMNS)
df_lifestyle_mall = pd.DataFrame(lifestyle_mall_data, columns=MALL_COLUMNS)

# ignore_index=True gives the combined frame a fresh 0..n-1 index
df = pd.concat([df_major_shopping_centers, df_community_mall, df_lifestyle_mall], ignore_index=True)

# city filter: collapse any location that mentions a known city to just the city name.
# Cities are applied in list order, so a row is claimed by the first city it mentions.
for city in cities:
    mask = df['location'].str.contains(city, case=False)
    df.loc[mask, 'location'] = city

# check if it is major corp (isin already yields a boolean column)
df['is_major_corp'] = df['developer'].isin(developers)

df
    

In [None]:
def polygon_or_placeholder(mall_name):
    """Return the polygon string for a mall, or the not-found message.

    Args:
        mall_name: Name passed to ``get_polygon_points``.

    Returns:
        The value returned by ``get_polygon_points`` when it is truthy,
        otherwise the text ``"Polygon points not found for <mall_name>"``.
    """
    polygon_string = get_polygon_points(mall_name)
    if polygon_string:
        return polygon_string
    return f"Polygon points not found for {mall_name}"


# Build the whole column first and assign it in one step. The values line up
# with the rows by position, so this does not depend on the frame's index
# labels, and the frame is left untouched if a lookup raises part-way through.
df['polygon_string'] = [polygon_or_placeholder(mall_name) for mall_name in df['mall_name']]

In [None]:
# Write the assembled mall table to a comma-separated UTF-8 file, leaving out the index column
csv_options = {'sep': ',', 'index': False, 'encoding': 'utf-8'}
df.to_csv('metro_manila_malls.csv', **csv_options)