In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import chart_studio.plotly as py
import plotly.express as px
import requests

# Load the raw dataset directly from its remote CSV location.
DATA_URL = 'https://linked.aub.edu.lb/pkgcube/data/475ce9a5aab198af0e1c14a3cb63e4e7.csv'
df = pd.read_csv(DATA_URL)

In [None]:
# Derive the Governorate column from refArea: keep the text after the final "/",
# turn underscores into spaces, then repair the one mis-encoded name
# (Miniyehâ\x80\x93Danniyeh District -> Miniyeh-Danniyeh District).
# NOTE(review): the hyphenated name is what the later DBpedia lookup receives;
# confirm a DBpedia resource exists under that exact spelling.
governorate_fixes = {'Miniyehâ\x80\x93Danniyeh District': 'Miniyeh-Danniyeh District'}
ref_area_tail = df["refArea"].str.split("/").str[-1]
df['Governorate'] = ref_area_tail.str.replace("_", " ").replace(governorate_fixes)

In [None]:
# Normalise the 'Aain' spelling to 'Ain' in the 'Town' column so that the
# coordinates of these towns can be looked up afterwards.
town_names = df['Town'].str.replace('Aain', 'Ain')
df['Town'] = town_names

In [None]:
# Get coordinates of Towns
## NOTE: this cell is slow to run, because the Town column is geocoded one entry at a time

from geopy.geocoders import Nominatim

# Identifier sent to the Nominatim service with every lookup.
GEOCODER_USER_AGENT = "edu_map"
geolocator = Nominatim(user_agent=GEOCODER_USER_AGENT)

# Function to get lat/lon
def get_coordinates(town):
    """
    Geocode a town name, qualified with ", Lebanon", via the module-level geolocator.

    Parameters
    ----------
    town : str
        Town name. Missing (NaN/None) or blank values are not sent to the geocoder.

    Returns
    -------
    pandas.Series
        Two elements, [latitude, longitude]. Both are None when the town is
        missing or blank, when the geocoder finds no match, or when the lookup fails.
    """
    # A missing town would otherwise be formatted into the query "nan, Lebanon"
    # and could come back with an unrelated location.
    if not isinstance(town, str) or not town.strip():
        return pd.Series([None, None])

    try:
        location = geolocator.geocode(f"{town}, Lebanon")
        if location:
            return pd.Series([location.latitude, location.longitude])
        return pd.Series([None, None])
    except Exception as e:
        # Best-effort lookup: report the failure and carry on. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit
        # through, so a long run can still be stopped.
        print(f"Error geocoding {town}: {e}")
        return pd.Series([None, None])

# Geocode every entry of the Town column and store the results as lat/lon columns
town_coordinates = df["Town"].apply(get_coordinates)
df[["lat", "lon"]] = town_coordinates

In [None]:
# Function to get the population for each Governorate from dbpedia
# Results of lookups that completed, keyed by ref_area, so that a name repeated
# across many rows triggers a single HTTP request. Failed requests are not
# stored and are retried on the next call.
_population_cache = {}


def get_population_est(ref_area, timeout=10):
    """
    Fetch population estimate from DBpedia JSON for a given refArea.

    Parameters
    ----------
    ref_area : str
        Area name; spaces are turned into underscores to form the DBpedia
        resource name. Missing (NaN/None) or blank values return None without
        issuing a request.
    timeout : float, optional
        Seconds to wait for DBpedia before giving up (default 10).

    Returns
    -------
    The first value found under the populationEst property, else under the
    populationTotal property, as delivered by the JSON payload; None when the
    entity or both properties are absent, or when the request fails.
    """
    # NaN/None has no .replace() and no DBpedia page; skip it instead of raising.
    if not isinstance(ref_area, str) or not ref_area.strip():
        return None

    if ref_area in _population_cache:
        return _population_cache[ref_area]

    resource = ref_area.replace(' ', '_')
    url = f"https://dbpedia.org/data/{resource}.json"
    entity = f"http://dbpedia.org/resource/{resource}"

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Ensure we catch HTTP errors
        data = response.json()

        population = None
        if entity in data:
            props = data[entity]
            # Try both common population properties
            for key in ["http://dbpedia.org/property/populationEst",
                        "http://dbpedia.org/ontology/populationTotal"]:
                if key in props:
                    population = props[key][0]["value"]
                    break
    except Exception as e:
        print(f"Error fetching {ref_area}: {e}")
        return None

    _population_cache[ref_area] = population
    return population
    
# Look up a population figure for each row's Governorate and store it as a column.
governorate_population = df["Governorate"].apply(get_population_est)
df["population"] = governorate_population

In [None]:
# Persist the cleaned frame; with to_csv defaults the index is written as the first column.
CLEAN_CSV_PATH = 'data_clean.csv'
df.to_csv(CLEAN_CSV_PATH)