In [1]:
import os
import glob
import pandas as pd
import unicodedata
import re
from bs4 import BeautifulSoup
from tqdm import tqdm  

landmark_folder = "C:/Users/Ignacio/IronHackCodes/gitHStuff/FinalProj/project-aieng-interactive-travel-planner-master/data/landmarks"

#Get all text files in the folder
text_files = glob.glob(os.path.join(landmark_folder, "*.txt"))
print(f"Found {len(text_files)} files in {landmark_folder}")

Found 574 files in C:/Users/Ignacio/IronHackCodes/gitHStuff/FinalProj/project-aieng-interactive-travel-planner-master/data/landmarks


In [3]:
#Function to clean text encoding issues
def clean_text(text):
    if isinstance(text, str):  # Only process strings
        #Fix encoding issues
        text = text.encode("latin1", "ignore").decode("utf-8", "ignore")

        #Normalize Unicode formatting
        text = unicodedata.normalize("NFKC", text)

        #Remove known encoding artifacts for Spanish characters
        text = re.sub(r"\\xc3\\xa1", "á", text)  # Fix `á`
        text = re.sub(r"\\xc3\\xa9", "é", text)  # Fix `é`
        text = re.sub(r"\\xc3\\xad", "í", text)  # Fix `í`
        text = re.sub(r"\\xc3\\xb3", "ó", text)  # Fix `ó`
        text = re.sub(r"\\xc3\\xba", "ú", text)  # Fix `ú`
        text = re.sub(r"\\xc3\\xb1", "ñ", text)  # Fix `ñ`

        #Remove extra spaces and newlines
        text = re.sub(r"\s+", " ", text).strip()

        return text
    return text


In [5]:
def extract_landmark_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract title
    title = soup.find("title").get_text(strip=True).replace(" - Wikipedia", "") if soup.find("title") else os.path.basename(file_path).replace(".txt", "")

    # Extract summary
    summary_paragraphs = soup.find_all("p")
    summary = " ".join([p.get_text(strip=True) for p in summary_paragraphs[:3]]) if summary_paragraphs else "No summary available."
    summary = re.sub(r"\s+", " ", summary).strip()  # Remove excess spaces/newlines

    # Extract coordinates
    coords_meta = soup.find("span", {"class": "geo-dec"})
    lat, lon = "Unknown", "Unknown"

    if coords_meta:
        coords_text = coords_meta.get_text(strip=True)
        #print(f"Extracted Raw Coordinates (Before Cleanup): {repr(coords_text)}")  # Debugging step

        # Step 1: Replace problematic encoding artifacts
        coords_text = coords_text.replace("\xa0", " ")  # Remove non-breaking spaces
        coords_text = coords_text.replace("\\xc2\\xb0", "°")  # Fix degree encoding artifacts

        # Step 2: Extract numbers and direction indicators separately
        match = re.findall(r"([-+]?\d*\.\d+)\s*[°]?\s*([NSEW])?", coords_text)

        if match and len(match) == 2:
            # Extract the numbers and direction indicators
            lat_value, lat_dir = match[0]
            lon_value, lon_dir = match[1]

            # Step 3: Convert to correct signed float based on N/S/E/W
            lat = float(lat_value) * (-1 if lat_dir == "S" else 1)
            lon = float(lon_value) * (-1 if lon_dir == "W" else 1)

        else:
            lat, lon = "Unknown", "Unknown"

    # If Latitude or Longitude is Unknown, return None to skip
    if lat == "Unknown" or lon == "Unknown":
        return None

    #print(f"Coordinates (Cleaned): {lat}, {lon}")  # Debugging step

    # Store simplified data in dictionary
    return {
        "Title": clean_text(title),
        "Summary": clean_text(summary),
        "Latitude": lat,
        "Longitude": lon
    }

In [7]:
# Batch process all files
landmark_data_list = []

for file_path in tqdm(text_files, desc="Processing Landmarks", unit="file"):
    data = extract_landmark_data(file_path)
    if data:  # Only add entries that are not None
        landmark_data_list.append(data)

# Convert list of dictionaries to Pandas DataFrame
df_landmarks = pd.DataFrame(landmark_data_list)

# Display the final DataFrame
from IPython.display import display
display(df_landmarks)

Processing Landmarks: 100%|██████████| 574/574 [00:22<00:00, 26.09file/s]


Unnamed: 0,Title,Summary,Latitude,Longitude
0,Academia del Perpetuo Socorro,Mrs. Jeannette Sánchez (1-6)\n Academia del Pe...,18.454440,-66.084720
1,Academia Interamericana Metro,TheAcademia Interamericana Metro(beforeAcademi...,18.448531,-66.072122
2,Academia Maria Reina,Academia Maria Reinais a Catholic middle (7th ...,18.383442,-66.085516
3,Academia San Jorge,"Academia San Jorge(""Saint George Academy"") is ...",18.450560,-66.061670
4,Adjuntas barrio-pueblo,Adjuntas barrio-pueblois abarrioand the admini...,18.163776,-66.723544
...,...,...,...,...
515,Villa Pesquera,\n Villa Pesquerais a sea-front fishing villag...,17.981900,-66.622890
516,William Miranda Marín Botanical and Cultural G...,\n TheWilliam Miranda Marín Botanical and Cult...,18.241390,-66.061670
517,Yabucoa barrio-pueblo,Yabucoa barrio-pueblois abarrioand the adminis...,18.047304,-65.880083
518,Yauco barrio-pueblo,Yauco barrio-pueblois abarrioand the administr...,18.036342,-66.849470


In [9]:
# Save the DataFrame to a CSV file
output_csv = "processed_landmarks_final.csv"
df_landmarks.to_csv(output_csv, index=False, encoding="utf-8")
print(f"Data saved to {output_csv}")

✅ Data saved to processed_landmarks_final.csv
