In [4]:
# ! pip install selenium bs4 pandas tqdm

In [5]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from tqdm import tqdm
import urllib.parse
import pandas as pd
import re

In [6]:
def get_long_lat(location, kabupaten):
    import time
    # Gabungkan location dan kabupaten untuk query lebih presisi
    search_query = f"{location} {kabupaten}"
    encoded_location = urllib.parse.quote(search_query)

    driver.get("https://www.google.com/maps/search/{}".format(encoded_location))
    driver.implicitly_wait(3)
    
    # Get the initial page source
    initial_page_source = driver.page_source
    
    # Wait for the page source to change with a timeout of 10 seconds
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: driver.page_source != initial_page_source)

    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # CASE 1: Ditemukan list banyak hasil pencarian
    if soup.find('h1', class_="fontTitleLarge IFMGgb"):
        print("Case 1: Multiple results ditemukan")
        
        # Ambil link dari hasil pencarian pertama
        link_element = soup.find('a', class_='hfpxzc')
        if link_element:
            url = link_element.get('href')
            print(f"Link: {url}")
            driver.get(url)  # Buka link
            
            # Tunggu page source berubah setelah klik
            initial_page_source = driver.page_source
            wait = WebDriverWait(driver, 10)
            wait.until(lambda driver: driver.page_source != initial_page_source)
            
            # Tunggu URL berubah menjadi format yang stabil (dengan /@lat,lng)
            for attempt in range(10):
                current_url = driver.current_url
                if '/@' in current_url and re.search(r'/@-?\d+\.\d+,-?\d+\.\d+', current_url):
                    print(f"URL stabil ditemukan pada attempt {attempt}")
                    break
                if attempt < 9:
                    time.sleep(1)
            
            # Ambil soup dari halaman setelah klik
            html_after_click = driver.page_source
            soup_after_click = BeautifulSoup(html_after_click, "html.parser")
            
            # Ambil nama Google dari h1 dengan class "DUwDvf lfPIob"
            h1_element = soup_after_click.find('h1', class_='DUwDvf lfPIob')
            if h1_element:
                nama_google = h1_element.get_text(strip=True)
                print(f"Nama Google: {nama_google}")
            else:
                nama_google = None
                print("Nama Google tidak ditemukan")
            
            # Extract koordinat dari URL
            current_url = driver.current_url
            print(f"Final URL: {current_url}")
            
            lat_lng_match = re.search(r'/@(-?\d+\.\d+),(-?\d+\.\d+)', current_url)
            if lat_lng_match:
                latitude = float(lat_lng_match.group(1))
                longitude = float(lat_lng_match.group(2))
                print(f"Latitude: {latitude}, Longitude: {longitude}")
                return [latitude, longitude, nama_google, current_url, 0]
            else:
                print("Koordinat tidak ditemukan di URL")
                return [None, None, nama_google, current_url, 1]
        else:
            print("Link tidak ditemukan")
            return [None, None, None, None, 1]
    
    # CASE 2: Ditemukan tepat 1 hasil pencarian langsung
    elif soup.find('h1', class_='DUwDvf lfPIob'):
        print("Case 2: 1 hasil ditemukan langsung")
        
        # Tunggu URL berubah menjadi format yang stabil (dengan /@lat,lng)
        for attempt in range(10):
            current_url = driver.current_url
            if '/@' in current_url and re.search(r'/@-?\d+\.\d+,-?\d+\.\d+', current_url):
                print(f"URL stabil ditemukan pada attempt {attempt}")
                break
            if attempt < 9:
                time.sleep(1)
        
        # Ambil nama Google
        h1_element = soup.find('h1', class_='DUwDvf lfPIob')
        if h1_element:
            nama_google = h1_element.get_text(strip=True)
            print(f"Nama Google: {nama_google}")
        else:
            nama_google = None
            print("Nama Google tidak ditemukan")
        
        # Extract koordinat dari URL
        current_url = driver.current_url
        print(f"Final URL: {current_url}")
        
        lat_lng_match = re.search(r'/@(-?\d+\.\d+),(-?\d+\.\d+)', current_url)
        if lat_lng_match:
            latitude = float(lat_lng_match.group(1))
            longitude = float(lat_lng_match.group(2))
            print(f"Latitude: {latitude}, Longitude: {longitude}")
            return [latitude, longitude, nama_google, current_url, 0]
        else:
            print("Koordinat tidak ditemukan di URL")
            return [None, None, nama_google, current_url, 1]
    
    # CASE 3: Tidak ditemukan
    else:
        print("Case 3: Lokasi tidak ditemukan")
        return [None, None, None, None, 2]

In [7]:
file_input = 'Flat_sekolah_terdampak.csv'

df = pd.read_csv('input/{}'.format(file_input), sep=';')

if 'latitude' not in df.columns:
    df['latitude'] = ''

if 'longitude' not in df.columns:
    df['longitude'] = ''

if 'nama_google' not in df.columns:
    df['nama_google'] = ''

if 'url' not in df.columns:
    df['url'] = ''

In [8]:
df_na = df.loc[df['latitude'].isna() | (df['latitude'] == '')]
df_no_na = df.loc[~(df['latitude'].isna() | (df['latitude'] == ''))]

# Untuk df_na: set isna=1 kecuali sudah 2 (tidak ditemukan sebelumnya)
df_na['isna'] = df_na['isna'].apply(lambda x: 1 if (pd.isna(x) or x != 2) else 2)
# Untuk df_no_na: set isna=0 (ditemukan)
df_no_na['isna'] = 0

In [None]:
df_na.head(5)

Unnamed: 0,npsn_isian,nama_isian,Jenjang,alamat_jalan,nama_dusun,desa_kelurahan,kecamatan,kabupaten,provinsi,lintang,bujur,latitude,longitude,nama_google,url,isna
0,69775026,AL0RIFDA,PAUD Sederajat,"PERUMNAS BRANDAN PERMAI, TANGKAHAN DURIAN",-,Tangkahan Durian,Kec. Berandan Barat,Kab. Langkat,Prov. Sumatera Utara,0.000000000000,0.000000000000,,,,,2
1,69790018,IKRA,PAUD Sederajat,BATANG DURIAN NO 1,-,PEKAN TANJUNG PURA,Kec. Tanjung Pura,Kab. Langkat,Prov. Sumatera Utara,-5.443.757.000.000,114.421.127.000.000,,,,,2
4,69932645,KB AL-BARRU,PAUD Sederajat,Gampong Lhok Merbo Kecamatan Sawang Kabupaten ...,Lhok Merbo,Lhok Meureubo,Kec. Sawang,Kab. Aceh Utara,Prov. Aceh,0.000000000000,0.000000000000,,,,,2
7,69967119,KB AR-RIDHA,PAUD Sederajat,Gampong Cot Ara Kecamatan Baktiya Kabupaten Ac...,Cot Ara,Cot Ara,Kec. Baktiya,Kab. Aceh Utara,Prov. Aceh,0.000000000000,0.000000000000,,,,,2
12,69798169,KB AL-AMIN,PAUD Sederajat,Jalan Irigasi Gampong Kayee Panyang Kecamatan ...,Baroh,Kayee Panyang,Kec. Syamtalira Bayu,Kab. Aceh Utara,Prov. Aceh,0.000000000000,0.000000000000,,,,,2


In [9]:
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('--disable-extensions')
options.add_experimental_option("useAutomationExtension", False)
options.add_experimental_option("excludeSwitches", ["enable-automation"])

driver = webdriver.Chrome(options=options)

for i, r in tqdm(df_na.iterrows(), total=df_na.shape[0]):
    if r['isna'] == 1 :
        location_name = r['nama_isian']
        kabupaten = r['kabupaten']  # Ambil kabupaten dari dataframe
        df_na.loc[i, ['latitude', 'longitude', 'nama_google', 'url', 'isna']] = get_long_lat(location_name, kabupaten)

  4%|▎         | 156/4359 [00:07<03:15, 21.45it/s]

Case 3: Lokasi tidak ditemukan
Case 3: Lokasi tidak ditemukan
Case 3: Lokasi tidak ditemukan
Case 1: Multiple results ditemukan
Link: https://www.google.com/maps/place/Paud+Immanuel+Desa+Pearung+Silali/data=!4m7!3m6!1s0x302e19d52a42a289:0xaf52d4847023a56a!8m2!3d2.3107623!4d98.8958575!16s%2Fg%2F11fjtylwpw!19sChIJiaJCKtUZLjARaqUjcITUUq8?authuser=0&hl=id&rclk=1


  4%|▎         | 159/4359 [00:22<12:31,  5.59it/s]

URL stabil ditemukan pada attempt 2
Nama Google: Paud Immanuel Desa Pearung Silali
Final URL: https://www.google.com/maps/place/Paud+Immanuel+Desa+Pearung+Silali/@2.3107623,98.8958575,17z/data=!3m1!4b1!4m6!3m5!1s0x302e19d52a42a289:0xaf52d4847023a56a!8m2!3d2.3107623!4d98.8958575!16s%2Fg%2F11fjtylwpw?authuser=0&hl=id&entry=ttu&g_ep=EgoyMDI2MDIwOS4wIKXMDSoASAFQAw%3D%3D
Latitude: 2.3107623, Longitude: 98.8958575
Case 2: 1 hasil ditemukan langsung


  4%|▎         | 160/4359 [00:31<20:29,  3.42it/s]

URL stabil ditemukan pada attempt 2
Nama Google: Bina Ananda Mandiri Marelan
Final URL: https://www.google.com/maps/place/Bina+Ananda+Mandiri+Marelan/@3.6956923,98.6360008,17z/data=!3m1!4b1!4m6!3m5!1s0x30312db9b0c21765:0xc33cdda1340c532c!8m2!3d3.6956923!4d98.6360008!16s%2Fg%2F11sg2jsvcr?entry=ttu&g_ep=EgoyMDI2MDIwOS4wIKXMDSoASAFQAw%3D%3D
Latitude: 3.6956923, Longitude: 98.6360008
Case 1: Multiple results ditemukan
Link: https://www.google.com/maps/place/Paud+Kasih+Bunda/data=!4m7!3m6!1s0x30379b42069e84e9:0x3443eac41be07919!8m2!3d4.4817221!4d97.9788956!16s%2Fg%2F11llk80tr9!19sChIJ6YSeBkKbNzARGXngG8TqQzQ?authuser=0&hl=id&rclk=1
URL stabil ditemukan pada attempt 0


  4%|▎         | 161/4359 [00:44<35:37,  1.96it/s]

Nama Google: Paud Kasih Bunda
Final URL: https://www.google.com/maps/place/Paud+Kasih+Bunda/@4.4817221,97.9788956,17z/data=!3m1!4b1!4m6!3m5!1s0x30379b42069e84e9:0x3443eac41be07919!8m2!3d4.4817221!4d97.9788956!16s%2Fg%2F11llk80tr9?authuser=0&hl=id&entry=ttu&g_ep=EgoyMDI2MDIwOS4wIKXMDSoASAFQAw%3D%3D
Latitude: 4.4817221, Longitude: 97.9788956
Case 2: 1 hasil ditemukan langsung


  4%|▎         | 162/4359 [00:49<44:26,  1.57it/s]

URL stabil ditemukan pada attempt 3
Nama Google: PAUD CAHAYA HATE
Final URL: https://www.google.com/maps/place/PAUD+CAHAYA+HATE/@3.6138198,96.9617741,17z/data=!3m1!4b1!4m6!3m5!1s0x303bd19e3257152d:0x24a5a3eae6efd44a!8m2!3d3.6138198!4d96.9617741!16s%2Fg%2F11vbm2hbl8?entry=ttu&g_ep=EgoyMDI2MDIwOS4wIKXMDSoASAFQAw%3D%3D
Latitude: 3.6138198, Longitude: 96.9617741


  4%|▎         | 163/4359 [00:51<46:09,  1.51it/s]

Case 3: Lokasi tidak ditemukan


  4%|▍         | 164/4359 [00:53<51:39,  1.35it/s]

Case 3: Lokasi tidak ditemukan


  4%|▍         | 165/4359 [00:54<54:10,  1.29it/s]

Case 3: Lokasi tidak ditemukan
Case 2: 1 hasil ditemukan langsung


  4%|▍         | 166/4359 [00:59<1:17:28,  1.11s/it]

URL stabil ditemukan pada attempt 3
Nama Google: PAUD/TK Grace Nauli
Final URL: https://www.google.com/maps/place/PAUD%2FTK+Grace+Nauli/@1.8232664,98.8283579,17z/data=!3m1!4b1!4m6!3m5!1s0x302e5facc1f5bad3:0xce59789e8c364dd5!8m2!3d1.8232664!4d98.8283579!16s%2Fg%2F11trgyj5rk?entry=ttu&g_ep=EgoyMDI2MDIwOS4wIKXMDSoASAFQAw%3D%3D
Latitude: 1.8232664, Longitude: 98.8283579
Case 2: 1 hasil ditemukan langsung


  4%|▍         | 167/4359 [01:04<1:42:07,  1.46s/it]

URL stabil ditemukan pada attempt 3
Nama Google: MTs Swasta Badrul Ulum
Final URL: https://www.google.com/maps/place/MTs+Swasta+Badrul+Ulum/@3.6300231,97.7130648,17z/data=!3m1!4b1!4m6!3m5!1s0x303a014cc7bd2dfd:0x3c95914f4713299c!8m2!3d3.6300231!4d97.7130648!16s%2Fg%2F11f0_gppr8?entry=ttu&g_ep=EgoyMDI2MDIwOS4wIKXMDSoASAFQAw%3D%3D
Latitude: 3.6300231, Longitude: 97.7130648
Case 1: Multiple results ditemukan
Link: https://www.google.com/maps/place/TK+Aisyiyah+Bustanul+Athfal+05/data=!4m7!3m6!1s0x303131b629439d31:0xb78a38c0949ff518!8m2!3d3.6177452!4d98.6779304!16s%2Fg%2F11g1b1gpm5!19sChIJMZ1DKbYxMTARGPWflMA4irc?authuser=0&hl=id&rclk=1


  4%|▍         | 167/4359 [01:07<28:22,  2.46it/s]  


TypeError: argument of type 'NoneType' is not iterable

In [10]:
df_na.head()

Unnamed: 0,npsn_isian,nama_isian,Jenjang,alamat_jalan,nama_dusun,desa_kelurahan,kecamatan,kabupaten,provinsi,lintang,bujur,latitude,longitude,nama_google,url,isna
0,69775026,AL0RIFDA,PAUD Sederajat,"PERUMNAS BRANDAN PERMAI, TANGKAHAN DURIAN",-,Tangkahan Durian,Kec. Berandan Barat,Kab. Langkat,Prov. Sumatera Utara,0.000000000000,0.000000000000,,,,,2
1,69790018,IKRA,PAUD Sederajat,BATANG DURIAN NO 1,-,PEKAN TANJUNG PURA,Kec. Tanjung Pura,Kab. Langkat,Prov. Sumatera Utara,-5.443.757.000.000,114.421.127.000.000,,,,,2
4,69932645,KB AL-BARRU,PAUD Sederajat,Gampong Lhok Merbo Kecamatan Sawang Kabupaten ...,Lhok Merbo,Lhok Meureubo,Kec. Sawang,Kab. Aceh Utara,Prov. Aceh,0.000000000000,0.000000000000,,,,,2
7,69967119,KB AR-RIDHA,PAUD Sederajat,Gampong Cot Ara Kecamatan Baktiya Kabupaten Ac...,Cot Ara,Cot Ara,Kec. Baktiya,Kab. Aceh Utara,Prov. Aceh,0.000000000000,0.000000000000,,,,,2
12,69798169,KB AL-AMIN,PAUD Sederajat,Jalan Irigasi Gampong Kayee Panyang Kecamatan ...,Baroh,Kayee Panyang,Kec. Syamtalira Bayu,Kab. Aceh Utara,Prov. Aceh,0.000000000000,0.000000000000,,,,,2


In [11]:
# Gabungkan data yang sudah diproses
x = pd.concat([df_na, df_no_na])
x = x.sort_index()

# Hapus kolom 'isna' sebelum menyimpan (opsional, jika ingin)
# x = x.drop('isna', axis=1)

# Simpan ke file output
output_file = file_input
x.to_csv("input/{}".format(output_file), sep=';', index=False)
print(f"Data berhasil disimpan ke input/{output_file}")
print(f"Total records: {len(x)}")
print(f"Records dengan data: {len(x[x['latitude'] != ''])}")
print(f"Records tanpa data: {len(x[x['latitude'] == ''])}")

Data berhasil disimpan ke input/Flat_sekolah_terdampak.csv
Total records: 4781
Records dengan data: 4781
Records tanpa data: 0
