In [None]:
import requests
from bs4 import BeautifulSoup
import re
# Function to format polygon points as a string in the required format
def format_polygon_points(polygon_points):
    """Build a WKT-style ``POLYGON ((...))`` string from (lat, lon) pairs.

    Args:
        polygon_points: Iterable of two-element ``(lat, lon)`` pairs.

    Returns:
        A string of the form ``POLYGON ((lon lat, lon lat, ...))``. Each pair is
        written longitude first, i.e. swapped relative to the input order.
    """
    coordinate_pairs = []
    for lat, lon in polygon_points:
        # Longitude goes first in the output even though the input is (lat, lon).
        coordinate_pairs.append(f"{lon} {lat}")
    ring = ", ".join(coordinate_pairs)
    return "POLYGON ((" + ring + "))"
# Function to get polygon points from OpenStreetMap within the Philippines boundary
def get_polygon_points(place_name):
    """Look up a named way inside the Philippines on OpenStreetMap and return its outline.

    An Overpass QL query selects the area named "Philippines", then asks for
    ways whose ``name`` tag equals ``place_name`` inside that area, with node
    coordinates included (``out geom``).

    Args:
        place_name: Value of the OSM ``name`` tag to search for.

    Returns:
        The string produced by ``format_polygon_points`` for the first returned
        way that carries geometry, or ``None`` when the request fails, the reply
        is not valid JSON, or no such way is returned.
    """
    overpass_url = "http://overpass-api.de/api/interpreter"
    # Escape backslashes and double quotes so a name containing them cannot
    # terminate the quoted string inside the query.
    escaped_name = str(place_name).replace("\\", "\\\\").replace('"', '\\"')
    # The query must select ways: `area` elements carry no geometry and never
    # have type "way", so querying areas could not yield a polygon.
    overpass_query = f"""
    [out:json][timeout:25];
    // Define the boundary for the Philippines
    area["name"="Philippines"]->.searchArea;
    // Query for ways with the place name within the Philippines boundary
    way["name"="{escaped_name}"](area.searchArea);
    // Output the matching ways together with their node coordinates
    out geom;
    """
    try:
        response = requests.post(overpass_url, data={"data": overpass_query}, timeout=60)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Best effort: one failed lookup (timeout, rate limit, non-JSON error
        # page) should not abort a batch; report it and signal "not found".
        print(f"Overpass lookup failed for {place_name!r}: {exc}")
        return None
    if not isinstance(data, dict):
        return None
    # Scan every element rather than only the first one.
    for element in data.get('elements', []):
        if element.get('type') == 'way' and element.get('geometry'):
            # Extract polygon points
            polygon_points = [(node['lat'], node['lon']) for node in element['geometry']]
            return format_polygon_points(polygon_points)
    return None

In [None]:
# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_shopping_malls_in_the_Philippines"
# Fetch the webpage content. The timeout keeps the cell from hanging, and
# raise_for_status() stops an HTTP error page from being parsed as the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
# Find all tables containing shopping mall data except those under construction.
# A table is dropped when any earlier 'mw-headline' span in the document reads
# 'Shopping malls under construction'.
# NOTE(review): this relies on the page using <span class="mw-headline"> headings — confirm against the live markup.
all_tables = soup.find_all('table', class_='wikitable')
filtered_tables = [
    table
    for table in all_tables
    # `string=` is the current name of the deprecated `text=` filter argument.
    if not table.find_previous('span', class_='mw-headline', string='Shopping malls under construction')
]
# Major developers list
major_developers = ["Ayala Land", "SM Prime Holdings", "Robinsons Land", "Filinvest Land", "Megaworld Corporation"]
# Helper: text of one table cell without bracketed markers such as "[1]"
def _clean_cell_text(cell):
    """Return ``cell.text`` with ``[...]`` spans removed and outer whitespace trimmed."""
    # Strip AFTER the substitution so "Name [1]" becomes "Name", not "Name ".
    return re.sub(r'\[.*?\]', '', cell.text).strip()


# Function to extract data from a given table
def extract_data(table):
    """Extract (name, city, developer, major-developer flag) tuples from a table.

    The widest row in the table (header included) defines the expected column
    count. The first row is skipped. Rows with exactly that many cells are read
    in full; rows with one cell fewer are kept with a blank developer; every
    other row is ignored.

    Args:
        table: Object exposing ``find_all('tr')`` whose rows expose
            ``find_all(['td', 'th'])`` and whose cells expose ``.text``
            (a BeautifulSoup table tag in this notebook).

    Returns:
        List of ``(name, city, developer, flag)`` tuples where ``flag`` is the
        string ``"TRUE"`` when any entry of the module-level
        ``major_developers`` list occurs in the developer text, else ``"FALSE"``.
    """
    # Collect the cells of every row once instead of scanning the table twice.
    rows = [row.find_all(['td', 'th']) for row in table.find_all('tr')]
    max_columns = max((len(cells) for cells in rows), default=0)
    malls_data = []
    for columns in rows[1:]:  # Skip the first row
        if len(columns) < 2:
            # Name and city are both required; in very narrow tables indexing
            # columns[1] would otherwise raise IndexError.
            continue
        if len(columns) == max_columns:
            name = _clean_cell_text(columns[0])
            city = _clean_cell_text(columns[1])
            developer_index = 4 if max_columns == 7 else 3  # Adjust index based on the number of columns
            developer = _clean_cell_text(columns[developer_index]) if developer_index < len(columns) else ''
            is_major_developer = any(dev in developer for dev in major_developers)  # Check if any major developer is present
            malls_data.append((name, city, developer, "TRUE" if is_major_developer else "FALSE"))
        elif len(columns) == max_columns - 1:
            name = _clean_cell_text(columns[0])
            city = _clean_cell_text(columns[1])
            malls_data.append((name, city, "", "FALSE"))  # Set developer field blank and major developer as FALSE
    return malls_data


In [None]:
import pandas as pd
import numpy as np

developers = ["Ayala Land", "SM Prime Holdings", "Robinsons Land", "Filinvest Land", "Megaworld Corporation"]

# Extract data from filtered tables, grouped by the heading that precedes each table
all_malls_data = {}
for table in filtered_tables:
    heading = table.find_previous('span', class_='mw-headline')
    # A table with no preceding heading is grouped under '' instead of raising AttributeError.
    category = heading.text if heading is not None else ''
    # Several tables can follow the same heading; extend so that later tables
    # do not overwrite the rows of earlier ones.
    all_malls_data.setdefault(category, []).extend(extract_data(table))

# Flatten the per-category lists once; each row is (name, city, developer, flag)
mall_rows = [mall for malls in all_malls_data.values() for mall in malls]

# List of mall values
mall_names = [mall[0] for mall in mall_rows]
mall_loc = [mall[1] for mall in mall_rows]
mall_dev = [mall[2] for mall in mall_rows]

# convert to Data Frame (column order kept: city, name, developer)
df = pd.DataFrame(
    {
        'address_city': mall_loc,
        'mall_name': mall_names,
        'developer_name': mall_dev,
    }
)

# check if it is major corp: substring match, the same rule extract_data applies,
# so developer text that contains more than just the company name still matches.
df['is_major_corp'] = [
    any(dev in developer_name for dev in developers)
    for developer_name in df['developer_name']
]

df
    

In [None]:
# Look up a polygon string for every mall name and store the results,
# in row order, in a new 'polygon' column.
mall_names = df['mall_name']

polygon_points = []
for mall_name in mall_names:
    polygon_string = get_polygon_points(mall_name)
    # Any falsy lookup result is recorded as None.
    polygon_points.append(polygon_string or None)

df['polygon'] = polygon_points

df

In [None]:
# Display the DataFrame as this cell's output.
df

In [None]:
# Fallback for malls whose polygon was not found above: geocode the mall name with OSMnx instead
import osmnx as ox
import geopandas as gpd

def format_polygon_points(polygon_points):
    """Join two-element coordinate pairs into a ``POLYGON ((...))`` string.

    Unlike the earlier definition of this name in the notebook, which this one
    replaces once the cell runs, each pair is written in the order given with
    no swap.

    Args:
        polygon_points: Iterable of two-element pairs.
            NOTE(review): presumably (x, y) = (lon, lat) tuples taken from a
            geometry's exterior coordinates — confirm against the caller.

    Returns:
        A string of the form ``POLYGON ((a b, a b, ...))``.
    """
    pieces = []
    for first, second in polygon_points:
        pieces.append(f"{first} {second}")
    return "POLYGON ((" + ", ".join(pieces) + "))"

def get_polygon_points(mall_name):
    """Geocode a mall name with OSMnx and return its outline coordinates.

    Once this cell runs, this definition replaces the earlier function of the
    same name, and it returns a list of coordinate tuples rather than a
    formatted string.

    Args:
        mall_name: Query passed to ``ox.geocode_to_gdf``.

    Returns:
        A flat list holding the ``exterior.coords`` entries of every geometry in
        the geocoding result. An empty list is returned when the result is
        empty, or when one of the caught errors occurs (for example a geometry
        without an ``exterior`` attribute); in the error case a message is
        printed.
    """
    try:
        area = ox.geocode_to_gdf(mall_name)
        if area.empty:
            return []
        # Concatenate the exterior coordinates of all returned geometries.
        return [
            point
            for geometry in area.geometry
            for point in geometry.exterior.coords
        ]
    except (ox.geocoder.InsufficientResponseError, TypeError, AttributeError):
        print(f"Mall Name: {mall_name}  Polygon_points: still not found")
        return []

In [None]:
# Fill in polygons that are still missing using the fallback lookup.
for index, row in df.iterrows():
    if pd.notnull(row['polygon']):
        # A polygon is already present; leave this row untouched.
        continue

    mall_name = row['mall_name']
    polygon_points = get_polygon_points(mall_name)

    # Store the formatted polygon when points were returned, otherwise None.
    df.at[index, 'polygon'] = (
        format_polygon_points(polygon_points) if polygon_points else None
    )

In [None]:
# Export the DataFrame to 'provincial_malls.csv' as plain CSV, leaving out the index column
df.to_csv('provincial_malls.csv', index=False)