# Objective

    The ultimate objective is to extract, with exhaustive precision, detailed locality 
    information from inputs that may be vexingly ambiguous, by deftly harnessing web 
    searches in concert with large language models. In so doing, we aim to pinpoint 
    and assemble, with unerring accuracy, the pertinent particulars—District, City, 
    Country, and Continent alike.

# Setup

In [2]:
"""
Initialize and load all required packages, then configure and instantiate 
a WebDriver for seamless browser automation. 
"""

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import json
import browser_cookie3
import ollama
import pandas as pd
import torch
import numpy as np
# Set up Chrome options to mimic a real browser
chrome_options = Options()
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
chrome_options.add_argument(f'--user-agent={user_agent}')
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')

# Initialize driver
driver = webdriver.Chrome(options=chrome_options)
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")


# Preparing the Dataset

    Through a careful scrutiny of the raw data, we have been able to detect and excise 
    duplicate locality entries, thereby streamlining the dataset considerably. This 
    judicious pruning substantially lightens the computational burden placed upon the 
    large language model —a process that is, by its very nature, both prodigiously 
    time-consuming and voracious in its appetite for resources.

In [3]:
TA = pd.read_csv('TAPosts_2023.csv') # read the dataframe
print(TA)

      Chain ID Counter  Post in Chain Counter   chainid         id  \
0                    1                      1  13263599  105939910   
1                    1                      2  13263599  105946408   
2                    1                      3  13263599  105946471   
3                    1                      4  13263599  105946513   
4                    1                      5  13263599  105947215   
...                ...                    ...       ...        ...   
5214               711                      9  14620570  119601299   
5215               711                     10  14620570  119601438   
5216               711                     11  14620570  119602937   
5217               712                      1  14621094  119601952   
5218               712                      2  14621094  119602952   

                  locality    longname     postdate postings  \
0                Boston...  TA_Colleen  4 years ago      176   
1         Hong Kong, China     

In [4]:
locality = TA['locality'] 
locality_Cleaned = locality.drop_duplicates().dropna() # Drop all the null and duplicates
locality_Cleaned = locality_Cleaned.reset_index(drop=True)
print(locality_Cleaned)

0                   Boston...
1            Hong Kong, China
2           Apeldoorn, The...
3      New Forest National...
4           Chicago, Illinois
                ...          
262    Skopje, Republic of...
263       Fremont, California
264               Agra, India
265             Kihei, Hawaii
266       Surbiton, United...
Name: locality, Length: 267, dtype: object


    We have now distilled the dataset to a mere 267 unique entries – a most gratifying reduction 
    from the original 5,219 – thereby affording markedly swifter and more efficient processing 
    henceforth.

    We shall now compile a comprehensive register of every sovereign state upon this Earth 
    – specifically, the 195 nations recognised by the United Nations – thereby enabling the 
    large language model to confine its responses strictly to these entities, each of which 
    possesses a distinct and unambiguous official name. We naturally hope that the model will 
    faithfully adhere to this authoritative list; however, inasmuch as its outputs are ultimately 
    governed by probabilistic sampling rather than rigid determinism, one cannot say with 
    absolute certainty how reliably effective this constraint will prove in practice.

In [5]:
country_raw = pd.read_csv('countries.csv' ) # read the dataframe
country = country_raw.drop(['id','alpha2','alpha3'],axis=1 )
country = country['name'].to_numpy()
country = ",".join(country.astype(str))
country

"Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia, Plurinational State of,Bosnia and Herzegovina,Botswana,Brazil,Brunei Darussalam,Bulgaria,Burkina Faso,Burundi,Cabo Verde,Cambodia,Cameroon,Canada,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Congo, Democratic Republic of the,Costa Rica,Côte d'Ivoire,Croatia,Cuba,Cyprus,Czechia,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Eritrea,Estonia,Eswatini,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hungary,Iceland,India,Indonesia,Iran, Islamic Republic of,Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Korea, Democratic People's Republic of,Korea, Republic of,Kuwait,Kyrgyzstan,Lao People's Democratic Republic,Latvia,Lebanon,Lesotho,L

In [6]:
Locality_mapping = pd.DataFrame(columns=['District', 'City', 'Country', 'Continent' ,'locality' ])

In [7]:
lower_bound = 0
upper_bound = len(locality_Cleaned)



while lower_bound < upper_bound:
    
    location = locality_Cleaned[lower_bound]
    
    # Scrape page
    #url = f"https://www.google.com/maps/place/{location}?hl=en"
    url = f"https://www.google.com/search?q={location}&lr=lang_en"
    selector = "body"
   
    
    driver.get(url)
    if selector:
        wait = WebDriverWait(driver, 2)
        element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
        extracted_text = element.text
    else:
        extracted_text = None
    
    #print(extracted_text)

    # Prepare RAG prompt for Ollama
    prompt = f"""
        You are a precise geographical data extractor. Your task is to analyze the provided web search results and your 
        internal knowledge about the location '{location}' and return EXACTLY ONE valid JSON object with the following four keys:

        - "District": the smallest administrative division (e.g., county, borough, district, arrondissement). Use "NaN" if the location is a country or if no district-level division exists or can be determined.
        - "City": the city or municipality name. Use "NaN" if the location is a country or if it refers to a region/district rather than a specific city.
        - "Country": the full official English country name. You MUST choose the country name exclusively from the provided reference list below. Never invent or use a different spelling.
        - "Continent": one of ["Africa", "Antarctica", "Asia", "Australia", "Europe", "North America", "South America"]

        Reference country list (use exactly one of these strings for the "Country" field):
        {country}

        Rules:
        - If '{location}' is itself a country that appears in the reference list, set "District" = "NaN" and "City" = "NaN".
        - If the location is ambiguous, prioritize the most commonly known entity with that exact name.
        - Never return null, empty strings, or missing keys.
        - If information is not available or cannot be confidently determined, use "Unknown".
        - Output NOTHING except the JSON object. No explanations, no markdown, no ```json markers, no extra whitespace or newlines before/after the JSON.

        Context (web search results ):
        {extracted_text}

        Return only the JSON:
        """

    # Use Ollama to perform RAG/generation
    response = ollama.generate(model='gpt-oss:latest', prompt=prompt)
    generated_json = response['response'].strip()
    #For some Advanced Model      
    #result = generated_json.split("</think>", 1)[1]
    result = generated_json

        
    try:
        data = json.loads(result)
        flag = True  
    except (json.JSONDecodeError, TypeError):
        flag = False

    if flag == True:
        # Print the JSON
        print(location + "========>" + result)
        data["locality"] = location
        # Convert JSON to DataFrame
        row = pd.DataFrame([data])
        Locality_mapping = pd.concat([Locality_mapping, row], ignore_index=True)
        lower_bound = lower_bound + 1
    
    del response
    torch.cuda.empty_cache()



In [10]:
Locality_mapping = Locality_mapping.drop_duplicates(subset=['locality'])
Locality_mapping 

Unnamed: 0,District,City,Country,Continent,locality
0,Suffolk County,Boston,United States of America,North America,Boston...
1,,Hong Kong,China,Asia,"Hong Kong, China"
2,Apeldoorn,Apeldoorn,Netherlands,Europe,"Apeldoorn, The..."
3,Hampshire,,United Kingdom of Great Britain and Northern I...,Europe,New Forest National...
4,Cook County,Chicago,United States of America,North America,"Chicago, Illinois"
...,...,...,...,...,...
262,,Skopje,North Macedonia,Europe,"Skopje, Republic of..."
263,Alameda County,Fremont,United States of America,North America,"Fremont, California"
264,Agra,Agra,India,Asia,"Agra, India"
265,Maui County,,United States of America,North America,"Kihei, Hawaii"


In [12]:
Locality_mapping.to_csv('Locality_mapping.csv', index=False)