In [1]:
import pandas as pd
import os
import ast
from tqdm import tqdm

In [3]:
# Load the existing landmark data
file_path = r"C:\Users\Ignacio\IronHackCodes\gitHStuff\FinalProj\processed_landmarks_final_cleaned.csv"
df_landmarks = pd.read_csv(file_path)

# Display the first few rows to understand structure
df_landmarks.head()

Unnamed: 0,Title,Summary,Latitude,Longitude,Cleaned Summary
0,Academia del Perpetuo Socorro,Mrs. Jeannette Sánchez (1-6)\n Academia del Pe...,18.45444,-66.08472,Mrs. Jeannette Sánchez (1-6) Academia del Perp...
1,Academia Interamericana Metro,TheAcademia Interamericana Metro(beforeAcademi...,18.448531,-66.072122,The Academia Interamericana Metro (before Acad...
2,Academia Maria Reina,Academia Maria Reinais a Catholic middle (7th ...,18.383442,-66.085516,Academia Maria Reina is a Catholic middle (7th...
3,Academia San Jorge,"Academia San Jorge(""Saint George Academy"") is ...",18.45056,-66.06167,"Academia San Jorge (""Saint George Academy"") is..."
4,Adjuntas barrio-pueblo,Adjuntas barrio-pueblois abarrioand the admini...,18.163776,-66.723544,Adjuntas barrio-pueblo is a barrio and the adm...


In [5]:
# Define category keywords
category_map = {
    "Nature": ["forest", "park", "reserve", "river", "waterfall", "mountain", "trail"],
    "Historical": ["fort", "castle", "colonial", "UNESCO", "battle", "historic"],
    "Beaches": ["beach", "coast", "bay", "island"],
    "Museums": ["museum", "exhibit", "gallery", "art", "science"],
    "Religious": ["church", "cathedral", "basilica", "monastery"],
    "Urban": ["city", "square", "plaza", "downtown"],
    "Adventure": ["hiking", "diving", "surfing", "zipline"]
}

def assign_category(summary):
    """Assign categories based on keyword matches in the summary."""
    categories = set()
    for category, keywords in category_map.items():
        if any(word in summary.lower() for word in keywords):
            categories.add(category)
    
    return ", ".join(categories) if categories else "Unknown"

# Apply category assignment
tqdm.pandas(desc="Assigning Categories")
df_landmarks["Category"] = df_landmarks["Cleaned Summary"].progress_apply(assign_category)

# Display updated dataframe
df_landmarks.head()

Assigning Categories: 100%|██████████| 520/520 [00:00<00:00, 14325.94it/s]


Unnamed: 0,Title,Summary,Latitude,Longitude,Cleaned Summary,Category
0,Academia del Perpetuo Socorro,Mrs. Jeannette Sánchez (1-6)\n Academia del Pe...,18.45444,-66.08472,Mrs. Jeannette Sánchez (1-6) Academia del Perp...,Urban
1,Academia Interamericana Metro,TheAcademia Interamericana Metro(beforeAcademi...,18.448531,-66.072122,The Academia Interamericana Metro (before Acad...,"Nature, Religious"
2,Academia Maria Reina,Academia Maria Reinais a Catholic middle (7th ...,18.383442,-66.085516,Academia Maria Reina is a Catholic middle (7th...,Unknown
3,Academia San Jorge,"Academia San Jorge(""Saint George Academy"") is ...",18.45056,-66.06167,"Academia San Jorge (""Saint George Academy"") is...","Nature, Museums, Religious"
4,Adjuntas barrio-pueblo,Adjuntas barrio-pueblois abarrioand the admini...,18.163776,-66.723544,Adjuntas barrio-pueblo is a barrio and the adm...,"Urban, Museums, Religious"


In [7]:
def extract_mentioned_landmarks(summary, landmark_titles):
    """Find references to other landmarks within the summary."""
    mentioned = [landmark for landmark in landmark_titles if landmark.lower() in summary.lower()]
    return ", ".join(mentioned) if mentioned else "None"

# Get list of all landmark titles
landmark_titles = df_landmarks["Title"].tolist()

# Extract mentioned landmarks
tqdm.pandas(desc="Extracting Mentioned Landmarks")
df_landmarks["Mentioned Landmarks"] = df_landmarks["Cleaned Summary"].progress_apply(
    lambda x: extract_mentioned_landmarks(x, landmark_titles)
)

df_landmarks.head()

Extracting Mentioned Landmarks: 100%|██████████| 520/520 [00:00<00:00, 926.41it/s] 


Unnamed: 0,Title,Summary,Latitude,Longitude,Cleaned Summary,Category,Mentioned Landmarks
0,Academia del Perpetuo Socorro,Mrs. Jeannette Sánchez (1-6)\n Academia del Pe...,18.45444,-66.08472,Mrs. Jeannette Sánchez (1-6) Academia del Perp...,Urban,"Academia del Perpetuo Socorro, Puerto Rico"
1,Academia Interamericana Metro,TheAcademia Interamericana Metro(beforeAcademi...,18.448531,-66.072122,The Academia Interamericana Metro (before Acad...,"Nature, Religious","Academia Interamericana Metro, Puerto Rico"
2,Academia Maria Reina,Academia Maria Reinais a Catholic middle (7th ...,18.383442,-66.085516,Academia Maria Reina is a Catholic middle (7th...,Unknown,"Academia Maria Reina, Puerto Rico"
3,Academia San Jorge,"Academia San Jorge(""Saint George Academy"") is ...",18.45056,-66.06167,"Academia San Jorge (""Saint George Academy"") is...","Nature, Museums, Religious","Academia San Jorge, Puerto Rico"
4,Adjuntas barrio-pueblo,Adjuntas barrio-pueblois abarrioand the admini...,18.163776,-66.723544,Adjuntas barrio-pueblo is a barrio and the adm...,"Urban, Museums, Religious","Adjuntas barrio-pueblo, Puerto Rico"


In [9]:
def determine_best_season(category):
    """Assigns best seasons based on category type."""
    if "Beaches" in category or "Nature" in category:
        return "Summer, Spring"
    elif "Historical" in category or "Museums" in category:
        return "Year-round"
    elif "Adventure" in category:
        return "Winter, Spring"
    else:
        return "Unknown"

# Apply function
tqdm.pandas(desc="Assigning Best Season to Visit")
df_landmarks["Best Season"] = df_landmarks["Category"].progress_apply(determine_best_season)

df_landmarks.head()

Assigning Best Season to Visit: 100%|██████████| 520/520 [00:00<00:00, 349245.49it/s]


Unnamed: 0,Title,Summary,Latitude,Longitude,Cleaned Summary,Category,Mentioned Landmarks,Best Season
0,Academia del Perpetuo Socorro,Mrs. Jeannette Sánchez (1-6)\n Academia del Pe...,18.45444,-66.08472,Mrs. Jeannette Sánchez (1-6) Academia del Perp...,Urban,"Academia del Perpetuo Socorro, Puerto Rico",Unknown
1,Academia Interamericana Metro,TheAcademia Interamericana Metro(beforeAcademi...,18.448531,-66.072122,The Academia Interamericana Metro (before Acad...,"Nature, Religious","Academia Interamericana Metro, Puerto Rico","Summer, Spring"
2,Academia Maria Reina,Academia Maria Reinais a Catholic middle (7th ...,18.383442,-66.085516,Academia Maria Reina is a Catholic middle (7th...,Unknown,"Academia Maria Reina, Puerto Rico",Unknown
3,Academia San Jorge,"Academia San Jorge(""Saint George Academy"") is ...",18.45056,-66.06167,"Academia San Jorge (""Saint George Academy"") is...","Nature, Museums, Religious","Academia San Jorge, Puerto Rico","Summer, Spring"
4,Adjuntas barrio-pueblo,Adjuntas barrio-pueblois abarrioand the admini...,18.163776,-66.723544,Adjuntas barrio-pueblo is a barrio and the adm...,"Urban, Museums, Religious","Adjuntas barrio-pueblo, Puerto Rico",Year-round


In [11]:
def assign_weather_dependency(category):
    """Determines if a location is affected by weather conditions."""
    if "Beaches" in category or "Nature" in category or "Adventure" in category:
        return "Yes"
    else:
        return "No"

# Apply function
tqdm.pandas(desc="Determining Weather Dependency")
df_landmarks["Weather Dependency"] = df_landmarks["Category"].progress_apply(assign_weather_dependency)

df_landmarks.head()

Determining Weather Dependency: 100%|██████████| 520/520 [00:00<?, ?it/s]


Unnamed: 0,Title,Summary,Latitude,Longitude,Cleaned Summary,Category,Mentioned Landmarks,Best Season,Weather Dependency
0,Academia del Perpetuo Socorro,Mrs. Jeannette Sánchez (1-6)\n Academia del Pe...,18.45444,-66.08472,Mrs. Jeannette Sánchez (1-6) Academia del Perp...,Urban,"Academia del Perpetuo Socorro, Puerto Rico",Unknown,No
1,Academia Interamericana Metro,TheAcademia Interamericana Metro(beforeAcademi...,18.448531,-66.072122,The Academia Interamericana Metro (before Acad...,"Nature, Religious","Academia Interamericana Metro, Puerto Rico","Summer, Spring",Yes
2,Academia Maria Reina,Academia Maria Reinais a Catholic middle (7th ...,18.383442,-66.085516,Academia Maria Reina is a Catholic middle (7th...,Unknown,"Academia Maria Reina, Puerto Rico",Unknown,No
3,Academia San Jorge,"Academia San Jorge(""Saint George Academy"") is ...",18.45056,-66.06167,"Academia San Jorge (""Saint George Academy"") is...","Nature, Museums, Religious","Academia San Jorge, Puerto Rico","Summer, Spring",Yes
4,Adjuntas barrio-pueblo,Adjuntas barrio-pueblois abarrioand the admini...,18.163776,-66.723544,Adjuntas barrio-pueblo is a barrio and the adm...,"Urban, Museums, Religious","Adjuntas barrio-pueblo, Puerto Rico",Year-round,No


In [13]:
# Define save path
save_path = r"C:\Users\Ignacio\IronHackCodes\gitHStuff\FinalProj\processed_landmarks_with_metadata.csv"

# Save to CSV
df_landmarks.to_csv(save_path, index=False)

print(f"Updated dataset saved to: {save_path}")

Updated dataset saved to: C:\Users\Ignacio\IronHackCodes\gitHStuff\FinalProj\processed_landmarks_with_metadata.csv
