In [7]:
import geopandas as gpd
import requests as r
import pandas as pd
import seaborn as sns
import pyarrow.parquet as pq
from geopy.geocoders import Nominatim
import time

# Location of the district-level Parquet file that is read into `hies_df`.
url = "https://storage.dosm.gov.my/hies/hies_district.parquet"

In [8]:
# Load the district-level table from the remote Parquet file into a DataFrame.
hies_df = pd.read_parquet(url)

In [13]:
# Rename the exact value 'W.P. Kuala Lumpur' to 'Kuala Lumpur' in both the
# district and state columns (other values are left untouched).
for column in ('district', 'state'):
    hies_df[column] = hies_df[column].replace('W.P. Kuala Lumpur', 'Kuala Lumpur')


In [14]:
# geopy Nominatim geocoder client used for the district lookups;
# user_agent is the label sent with this notebook's requests.
geolocator = Nominatim(user_agent="my-gis-application")

In [15]:
# Distinct district names, so each district is geocoded once rather than once per row.
unique_districts = hies_df['district'].unique()

In [17]:
# Geocode every unique district once.
# coord_dict maps district name -> (latitude, longitude), or (None, None) when
# the lookup returns no result or raises.
coord_dict = {}

for district in unique_districts:
    try:
        # We add ", Malaysia" to make the search more specific
        location = geolocator.geocode(district + ", Malaysia")
        if location:
            coord_dict[district] = (location.latitude, location.longitude)
            print(f"Found {district}: ({location.latitude}, {location.longitude})")
        else:
            coord_dict[district] = (None, None)
            print(f"Could not find coordinates for {district}")
    except Exception as e:
        # Best-effort: record the failure and carry on with the remaining districts.
        print(f"An error occurred for {district}: {e}")
        coord_dict[district] = (None, None)
    finally:
        # Be nice to the server: wait 1 second between requests. Placed in
        # `finally` so the pause also happens after a failed lookup, otherwise
        # a run of errors would hit the service back to back.
        time.sleep(1)

Found Batu Pahat: (1.933333, 103.0)
Found Johor Bahru: (1.4581986, 103.7649059)
Found Kluang: (2.0323472, 103.3190766)
Found Kota Tinggi: (1.7336743, 103.9006975)
Found Mersing: (2.4298532, 103.835494)
Found Muar: (2.0425046, 102.5658852)
Found Pontian: (1.5, 103.5)
Found Segamat: (2.4922852, 102.8450367)
Found Kulai: (1.6666667, 103.6)
Found Tangkak: (2.25, 102.5833333)
Found Baling: (5.5507199, 100.797587)
Found Bandar Baharu: (5.1672991, 100.5849347)
Found Kota Setar: (6.122603, 100.3690317)
Found Kuala Muda: (5.7073152, 100.4952892)
Found Kubang Pasu: (6.3356376, 100.3723218)
Found Kulim: (5.3707128, 100.6172071)
Found Langkawi: (6.3700386, 99.7928634)
Found Padang Terap: (6.2566513, 100.6645805)
Found Sik: (5.9239011, 100.8231021)
Found Yan: (5.8482755, 100.4151899)
Found Pendang: (5.9720841, 100.5501852)
Found Pokok Sena: (6.1730525, 100.5203915)
Found Bachok: (5.9951191, 102.3867413)
Found Kota Bharu: (6.1247911, 102.2378065)
Found Machang: (5.7676702, 102.2380822)
Found Pasir M

In [18]:
# Copy each district's geocoded coordinates onto its rows; districts missing
# from coord_dict fall back to (None, None).
for column, position in (('latitude', 0), ('longitude', 1)):
    hies_df[column] = hies_df['district'].map(
        # Bind `position` as a default so each lambda keeps its own tuple index.
        lambda s, position=position: coord_dict.get(s, (None, None))[position]
    )


In [19]:
# Display the frame to confirm the latitude/longitude columns were added.
hies_df

Unnamed: 0,date,state,district,income_mean,income_median,expenditure_mean,gini,poverty,latitude,longitude
0,2022-01-01,Johor,Batu Pahat,7419,6347,4570,0.33830,5.1,1.933333,103.000000
1,2022-01-01,Johor,Johor Bahru,9869,8232,6139,0.35850,3.7,1.458199,103.764906
2,2022-01-01,Johor,Kluang,6461,5204,4399,0.35367,7.2,2.032347,103.319077
3,2022-01-01,Johor,Kota Tinggi,7529,6227,4498,0.34335,5.0,1.733674,103.900698
4,2022-01-01,Johor,Mersing,5426,4445,4032,0.33877,12.7,2.429853,103.835494
...,...,...,...,...,...,...,...,...,...,...
155,2022-01-01,Sarawak,Beluru,4873,4164,2386,0.30156,13.7,3.551515,114.129055
156,2022-01-01,Sarawak,Telang Usan,4692,3870,2578,0.31077,6.0,3.317232,114.825832
157,2022-01-01,Kuala Lumpur,Kuala Lumpur,13325,10234,7823,0.37960,1.4,3.152659,101.702220
158,2022-01-01,W.P. Labuan,W.P. Labuan,8250,6904,4176,0.30028,2.5,5.275699,115.232815


In [20]:
# Build point geometries from the geocoded coordinates (x = longitude,
# y = latitude), wrap the table in a GeoDataFrame tagged as EPSG:4326,
# and export it as a GeoPackage.
district_points = gpd.points_from_xy(hies_df['longitude'], hies_df['latitude'])
gdf = gpd.GeoDataFrame(hies_df, geometry=district_points, crs="EPSG:4326")
gdf.to_file('hies_geocoded_auto.gpkg', driver='GPKG')

