In [115]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from tqdm import tqdm
import urllib.parse
import pandas as pd
import re

In [116]:
def get_long_lat(location, kabupaten):
    import time
    # Gabungkan location dan kabupaten untuk query lebih presisi
    search_query = f"{location} {kabupaten}"
    encoded_location = urllib.parse.quote(search_query)

    driver.get("https://www.google.com/maps/search/{}".format(encoded_location))
    driver.implicitly_wait(3)
    
    # Get the initial page source
    initial_page_source = driver.page_source
    
    # Wait for the page source to change with a timeout of 10 seconds
    wait = WebDriverWait(driver, 10)
    wait.until(lambda driver: driver.page_source != initial_page_source)

    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # CASE 1: Ditemukan list banyak hasil pencarian
    if soup.find('h1', class_="fontTitleLarge IFMGgb"):
        print("Case 1: Multiple results ditemukan")
        
        # Ambil link dari hasil pencarian pertama
        link_element = soup.find('a', class_='hfpxzc')
        if link_element:
            url = link_element.get('href')
            print(f"Link: {url}")
            driver.get(url)  # Buka link
            
            # Tunggu page source berubah setelah klik
            initial_page_source = driver.page_source
            wait = WebDriverWait(driver, 10)
            wait.until(lambda driver: driver.page_source != initial_page_source)
            
            # Tunggu URL berubah menjadi format yang stabil (dengan /@lat,lng)
            for attempt in range(10):
                current_url = driver.current_url
                if '/@' in current_url and re.search(r'/@-?\d+\.\d+,-?\d+\.\d+', current_url):
                    print(f"URL stabil ditemukan pada attempt {attempt}")
                    break
                if attempt < 9:
                    time.sleep(1)
            
            # Ambil soup dari halaman setelah klik
            html_after_click = driver.page_source
            soup_after_click = BeautifulSoup(html_after_click, "html.parser")
            
            # Ambil nama Google dari h1 dengan class "DUwDvf lfPIob"
            h1_element = soup_after_click.find('h1', class_='DUwDvf lfPIob')
            if h1_element:
                nama_google = h1_element.get_text(strip=True)
                print(f"Nama Google: {nama_google}")
            else:
                nama_google = None
                print("Nama Google tidak ditemukan")
            
            # Extract koordinat dari URL
            current_url = driver.current_url
            print(f"Final URL: {current_url}")
            
            lat_lng_match = re.search(r'/@(-?\d+\.\d+),(-?\d+\.\d+)', current_url)
            if lat_lng_match:
                latitude = float(lat_lng_match.group(1))
                longitude = float(lat_lng_match.group(2))
                print(f"Latitude: {latitude}, Longitude: {longitude}")
                return [latitude, longitude, nama_google, current_url, 0]
            else:
                print("Koordinat tidak ditemukan di URL")
                return [None, None, nama_google, current_url, 1]
        else:
            print("Link tidak ditemukan")
            return [None, None, None, None, 1]
    
    # CASE 2: Ditemukan tepat 1 hasil pencarian langsung
    elif soup.find('h1', class_='DUwDvf lfPIob'):
        print("Case 2: 1 hasil ditemukan langsung")
        
        # Tunggu URL berubah menjadi format yang stabil (dengan /@lat,lng)
        for attempt in range(10):
            current_url = driver.current_url
            if '/@' in current_url and re.search(r'/@-?\d+\.\d+,-?\d+\.\d+', current_url):
                print(f"URL stabil ditemukan pada attempt {attempt}")
                break
            if attempt < 9:
                time.sleep(1)
        
        # Ambil nama Google
        h1_element = soup.find('h1', class_='DUwDvf lfPIob')
        if h1_element:
            nama_google = h1_element.get_text(strip=True)
            print(f"Nama Google: {nama_google}")
        else:
            nama_google = None
            print("Nama Google tidak ditemukan")
        
        # Extract koordinat dari URL
        current_url = driver.current_url
        print(f"Final URL: {current_url}")
        
        lat_lng_match = re.search(r'/@(-?\d+\.\d+),(-?\d+\.\d+)', current_url)
        if lat_lng_match:
            latitude = float(lat_lng_match.group(1))
            longitude = float(lat_lng_match.group(2))
            print(f"Latitude: {latitude}, Longitude: {longitude}")
            return [latitude, longitude, nama_google, current_url, 0]
        else:
            print("Koordinat tidak ditemukan di URL")
            return [None, None, nama_google, current_url, 1]
    
    # CASE 3: Tidak ditemukan
    else:
        print("Case 3: Lokasi tidak ditemukan")
        return [None, None, None, None, 1]

In [117]:
file_input = 'Flat_sekolah_terdampak.csv'

df = pd.read_csv('input/{}'.format(file_input), sep=';')

if 'latitude' not in df.columns:
    df['latitude'] = ''

if 'longitude' not in df.columns:
    df['longitude'] = ''

if 'nama_google' not in df.columns:
    df['nama_google'] = ''

if 'url' not in df.columns:
    df['url'] = ''

In [118]:
df_na = df.loc[df.latitude == '']
df_no_na = df.loc[df.latitude != '']
df_na['isna'] = 1
df_no_na['isna'] = 0

In [119]:
df_na.head(5)

Unnamed: 0,npsn_isian,nama_isian,Jenjang,alamat_jalan,nama_dusun,desa_kelurahan,kecamatan,kabupaten,provinsi,lintang,bujur,latitude,longitude,nama_google,url,isna
0,69775026,AL0RIFDA,PAUD Sederajat,"PERUMNAS BRANDAN PERMAI, TANGKAHAN DURIAN",-,Tangkahan Durian,Kec. Berandan Barat,Kab. Langkat,Prov. Sumatera Utara,0.000000000000,0.000000000000,,,,,1
1,69790018,IKRA,PAUD Sederajat,BATANG DURIAN NO 1,-,PEKAN TANJUNG PURA,Kec. Tanjung Pura,Kab. Langkat,Prov. Sumatera Utara,-5.443.757.000.000,114.421.127.000.000,,,,,1
2,69920042,KB AISYIYAH RAUDHATUL ATHFAL,PAUD Sederajat,Jl. Teungku Umar,Teuku Umar,Lancang Garam,Kec. Banda Sakti,Kota Lhokseumawe,Prov. Aceh,5.183.700.000.000,97.148.800.000.000,,,,,1
3,69812435,KB AL HAKIM,PAUD Sederajat,BUKET DINDENG,-,Buket Dindeng,Kec. Julok,Kab. Aceh Timur,Prov. Aceh,0.000000000000,0.000000000000,,,,,1
4,69932645,KB AL-BARRU,PAUD Sederajat,Gampong Lhok Merbo Kecamatan Sawang Kabupaten ...,Lhok Merbo,Lhok Meureubo,Kec. Sawang,Kab. Aceh Utara,Prov. Aceh,0.000000000000,0.000000000000,,,,,1


In [120]:
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('--disable-extensions')
options.add_experimental_option("useAutomationExtension", False)
options.add_experimental_option("excludeSwitches", ["enable-automation"])

driver = webdriver.Chrome(options=options)

for i, r in tqdm(df_na.iterrows(), total=df_na.shape[0]):
    if r['isna'] == 1 :
        location_name = r['nama_isian']
        kabupaten = r['kabupaten']  # Ambil kabupaten dari dataframe
        df_na.loc[i, ['latitude', 'longitude', 'nama_google', 'url', 'isna']] = get_long_lat(location_name, kabupaten)
    
    if i == 10 :
        break

  0%|          | 1/4781 [00:01<1:23:29,  1.05s/it]

Case 3: Lokasi tidak ditemukan


  0%|          | 2/4781 [00:02<1:20:02,  1.00s/it]

Case 3: Lokasi tidak ditemukan


  0%|          | 3/4781 [00:02<1:16:07,  1.05it/s]

Case 3: Lokasi tidak ditemukan
Case 1: Multiple results ditemukan
Link: https://www.google.com/maps/place/Kb+Al-Hakim/data=!4m7!3m6!1s0x2fd4e1c3ceaa848d:0x8b61d6797db03e47!8m2!3d-0.6050907!4d100.1958929!16s%2Fg%2F11j2dpg391!19sChIJjYSqzsPh1C8RRz6wfXnWYYs?authuser=0&hl=en&rclk=1


  0%|          | 4/4781 [00:07<3:16:27,  2.47s/it]

URL stabil ditemukan pada attempt 3
Nama Google: Kb Al-Hakim
Final URL: https://www.google.com/maps/place/Kb+Al-Hakim/@-0.6050853,100.193318,17z/data=!3m1!4b1!4m6!3m5!1s0x2fd4e1c3ceaa848d:0x8b61d6797db03e47!8m2!3d-0.6050907!4d100.1958929!16s%2Fg%2F11j2dpg391?authuser=0&hl=en&entry=ttu&g_ep=EgoyMDI2MDIwOC4wIKXMDSoASAFQAw%3D%3D
Latitude: -0.6050853, Longitude: 100.193318


  0%|          | 5/4781 [00:08<2:27:48,  1.86s/it]

Case 3: Lokasi tidak ditemukan


  0%|          | 6/4781 [00:09<1:58:17,  1.49s/it]

Case 3: Lokasi tidak ditemukan
Case 2: 1 hasil ditemukan langsung


  0%|          | 7/4781 [00:13<3:04:02,  2.31s/it]

URL stabil ditemukan pada attempt 3
Nama Google: KB AL-MUSDAR MEURAKSA
Final URL: https://www.google.com/maps/place/KB+AL-MUSDAR+MEURAKSA/@5.1395508,97.1669859,17z/data=!3m1!4b1!4m6!3m5!1s0x3047831ef12e47db:0x6e142d8bd4f6ee6f!8m2!3d5.1395508!4d97.1669859!16s%2Fg%2F11y1wq7k03?entry=ttu&g_ep=EgoyMDI2MDIwOS4wIKXMDSoASAFQAw%3D%3D
Latitude: 5.1395508, Longitude: 97.1669859


  0%|          | 8/4781 [00:14<2:26:01,  1.84s/it]

Case 3: Lokasi tidak ditemukan
Case 1: Multiple results ditemukan
Link: https://www.google.com/maps/place/Mesjid+Al-ikhlas/data=!4m7!3m6!1s0x30373cf093eea905:0xb0c4283f98db1984!8m2!3d3.9757848!4d98.2723021!16s%2Fg%2F11c329pjf9!19sChIJBanuk_A8NzARhBnbmD8oxLA?authuser=0&hl=en&rclk=1


  0%|          | 9/4781 [00:19<4:01:28,  3.04s/it]

URL stabil ditemukan pada attempt 3
Nama Google: Mesjid Al-ikhlas
Final URL: https://www.google.com/maps/place/Mesjid+Al-ikhlas/@3.9757848,98.2723021,17z/data=!3m1!4b1!4m6!3m5!1s0x30373cf093eea905:0xb0c4283f98db1984!8m2!3d3.9757848!4d98.2723021!16s%2Fg%2F11c329pjf9?authuser=0&hl=en&entry=ttu&g_ep=EgoyMDI2MDIwOC4wIKXMDSoASAFQAw%3D%3D
Latitude: 3.9757848, Longitude: 98.2723021
Case 1: Multiple results ditemukan
Link: https://www.google.com/maps/place/PAUD+Rumah+Ceria/data=!4m7!3m6!1s0x3030db79d7d339ab:0xab1acf57e09f92!8m2!3d3.5266917!4d98.348156!16s%2Fg%2F11gh_npy3f!19sChIJqznT13nbMDARkp_gV88aqwA?authuser=0&hl=en&rclk=1


  0%|          | 10/4781 [00:24<4:40:05,  3.52s/it]

URL stabil ditemukan pada attempt 3
Nama Google: PAUD Rumah Ceria
Final URL: https://www.google.com/maps/place/PAUD+Rumah+Ceria/@3.5266917,98.348156,17z/data=!3m1!4b1!4m6!3m5!1s0x3030db79d7d339ab:0xab1acf57e09f92!8m2!3d3.5266917!4d98.348156!16s%2Fg%2F11gh_npy3f?authuser=0&hl=en&entry=ttu&g_ep=EgoyMDI2MDIwOC4wIKXMDSoASAFQAw%3D%3D
Latitude: 3.5266917, Longitude: 98.348156
Case 2: 1 hasil ditemukan langsung


  0%|          | 10/4781 [00:28<3:44:31,  2.82s/it]

URL stabil ditemukan pada attempt 3
Nama Google: Sekolah Paud harsipa
Final URL: https://www.google.com/maps/place/Sekolah+Paud+harsipa/@4.1908871,98.2424506,17z/data=!3m1!4b1!4m6!3m5!1s0x30370ff714c4f64f:0x9401e49e07ba6a02!8m2!3d4.1908871!4d98.2424506!16s%2Fg%2F11kx3hnjkn?entry=ttu&g_ep=EgoyMDI2MDIwOC4wIKXMDSoASAFQAw%3D%3D
Latitude: 4.1908871, Longitude: 98.2424506





In [123]:
df_na.head()

Unnamed: 0,npsn_isian,nama_isian,Jenjang,alamat_jalan,nama_dusun,desa_kelurahan,kecamatan,kabupaten,provinsi,lintang,bujur,latitude,longitude,nama_google,url,isna
0,69775026,AL0RIFDA,PAUD Sederajat,"PERUMNAS BRANDAN PERMAI, TANGKAHAN DURIAN",-,Tangkahan Durian,Kec. Berandan Barat,Kab. Langkat,Prov. Sumatera Utara,0.000000000000,0.000000000000,,,,,1
1,69790018,IKRA,PAUD Sederajat,BATANG DURIAN NO 1,-,PEKAN TANJUNG PURA,Kec. Tanjung Pura,Kab. Langkat,Prov. Sumatera Utara,-5.443.757.000.000,114.421.127.000.000,,,,,1
2,69920042,KB AISYIYAH RAUDHATUL ATHFAL,PAUD Sederajat,Jl. Teungku Umar,Teuku Umar,Lancang Garam,Kec. Banda Sakti,Kota Lhokseumawe,Prov. Aceh,5.183.700.000.000,97.148.800.000.000,,,,,1
3,69812435,KB AL HAKIM,PAUD Sederajat,BUKET DINDENG,-,Buket Dindeng,Kec. Julok,Kab. Aceh Timur,Prov. Aceh,0.000000000000,0.000000000000,-0.605085,100.193318,Kb Al-Hakim,https://www.google.com/maps/place/Kb+Al-Hakim/...,0
4,69932645,KB AL-BARRU,PAUD Sederajat,Gampong Lhok Merbo Kecamatan Sawang Kabupaten ...,Lhok Merbo,Lhok Meureubo,Kec. Sawang,Kab. Aceh Utara,Prov. Aceh,0.000000000000,0.000000000000,,,,,1


In [122]:
# Gabungkan data yang sudah diproses
x = pd.concat([df_na, df_no_na])
x = x.sort_index()

# Hapus kolom 'isna' sebelum menyimpan (opsional, jika ingin)
# x = x.drop('isna', axis=1)

# Simpan ke file output
output_file = file_input
x.to_csv("output/{}".format(output_file), sep=';', index=False)
print(f"Data berhasil disimpan ke output/{output_file}")
print(f"Total records: {len(x)}")
print(f"Records dengan data: {len(x[x['latitude'] != ''])}")
print(f"Records tanpa data: {len(x[x['latitude'] == ''])}")

Data berhasil disimpan ke output/Flat_sekolah_terdampak.csv
Total records: 4781
Records dengan data: 11
Records tanpa data: 4770
