In [1]:
import re
import pandas as pd
import gdown as gd

## Parsing the data

In [2]:
# url = "https://docs.google.com/document/d/1PpslvvmMsybqaGL4hVXW1PJjaRDk9hqbFm2-2NctA3c/edit?tab=t.0"
# id = "1PpslvvmMsybqaGL4hVXW1PJjaRDk9hqbFm2-2NctA3c"

In [7]:
def clean_text_file(input_path, output_path="Cleaned_Tourist.txt"):
    """Normalize a raw text file line by line and write the result to disk.

    Each line is stripped of surrounding whitespace, then of any leading run
    of ``*`` and space characters (bullet markers). Lines that end up empty
    are dropped, and runs of two or more spaces are collapsed to one space.

    Parameters
    ----------
    input_path : str or path-like
        UTF-8 text file to read.
    output_path : str or path-like, optional
        Where the cleaned text is written (UTF-8). Defaults to
        ``"Cleaned_Tourist.txt"`` in the current working directory.

    Returns
    -------
    None
        The cleaned lines are written to ``output_path`` joined by ``"\\n"``,
        with no trailing newline.
    """
    cleaned_lines = []

    with open(input_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            # lstrip("* ") removes any leading mix of asterisks and spaces,
            # not just the literal two-character prefix "* ".
            line = raw_line.strip().lstrip("* ")

            # Skip lines that were blank or consisted only of bullet markers.
            if not line:
                continue

            # Collapse runs of spaces (spaces only; tabs are left untouched).
            cleaned_lines.append(re.sub(r" {2,}", " ", line))

    with open(output_path, "w", encoding="utf-8") as dst:
        dst.write("\n".join(cleaned_lines))



In [8]:
# Clean the raw attractions text; the result is written to Cleaned_Tourist.txt
# in the current working directory.
# NOTE(review): this is an absolute, user-specific Windows path, and `url`
# holds a local file path rather than a URL — confirm before sharing the notebook.
url = r"C:\Users\owner\Desktop\Files_Deep_Learning\data.txt"
clean_text_file(url )

In [3]:
# def load_data(i_d):
#     gd.download(id=i_d, output="TouristAI.txt", quiet=False)
#     with open("TouristAI.txt", "r", encoding='utf-8', errors='ignore') as f:
#         content = f.read()
#     return content

def load_data(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, encoding="utf-8") as source:
        return source.read()
    
# NOTE(review): absolute, user-specific path. It points at a different user
# folder than the path given to clean_text_file, and it is not the
# Cleaned_Tourist.txt file that clean_text_file writes — confirm which file
# is meant to be parsed.
file_path = r"C:\Users\ncc333\Desktop\Deep_Learning\data.txt"

# Entire file as a single string; consumed by parse_attractions.
content = load_data(file_path)

In [4]:


# One-line "Label: value" fields: label prefix -> key in the parsed record.
_SIMPLE_FIELDS = {
    "Name:": "name",
    "Description:": "description",
    "Opening hours:": "opening_hours",
    "History:": "history",
    "Best time to visit:": "best_time_to_visit",
    "Safety info:": "safety_info",
    "Contact info:": "contact_info",
    "Available activities:": "available_activities",
    "Address/Access info:": "address",
}
_LOCATION_LABEL = "State / City / LGA:"
_GPS_LABEL = "GPS coordinates:"
_FEES_LABEL = "Entry fees"

# A line starting with any of these ends a multi-line entry-fees block.
# The bare "Opening" prefix is kept so that any spelling of the opening-hours
# label still terminates the block.
_FEES_TERMINATORS = tuple(_SIMPLE_FIELDS) + (_LOCATION_LABEL, _GPS_LABEL, "Opening")

# Signed integer or decimal number; the sign applies to both forms.
_NUMBER_RE = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")


def _new_record(title):
    """Return an attraction record with the given title and empty defaults."""
    return {
        "title": title,
        "name": "",
        "state": "",
        "city": "",
        "lga": "",
        "description": "",
        "entry_fees": [],
        "opening_hours": "",
        "gps_lat": None,
        "gps_lon": None,
        "history": "",
        "best_time_to_visit": "",
        "safety_info": "",
        "contact_info": "",
        "available_activities": "",
        "address": "",
    }


def _parse_entry(entry):
    """Parse the text of one numbered entry (heading number already removed)."""
    lines = [raw.strip() for raw in entry.strip().split("\n")]
    data = _new_record(lines[0])  # first line is the title, e.g. "Nike Art Gallery"

    i = 1
    while i < len(lines):
        line = lines[i]

        if line.startswith(_FEES_LABEL):
            # A value may sit on the label line itself ("Entry fees: Free").
            inline_value = line.partition(":")[2].strip()
            if inline_value:
                data["entry_fees"].append(inline_value)

            # Following lines belong to the fee list until another field starts.
            i += 1
            while i < len(lines) and not lines[i].startswith(_FEES_TERMINATORS):
                if lines[i]:
                    data["entry_fees"].append(lines[i])
                i += 1
            continue  # lines[i] is the next field (or the end); do not skip it

        if line.startswith(_LOCATION_LABEL):
            parts = [part.strip() for part in line[len(_LOCATION_LABEL):].split("/")]
            parts += [""] * (3 - len(parts))  # tolerate missing city / LGA
            data["state"], data["city"], data["lga"] = parts[:3]

        elif line.startswith(_GPS_LABEL):
            coords = _NUMBER_RE.findall(line)
            if len(coords) >= 2:
                data["gps_lat"] = float(coords[0])
                data["gps_lon"] = float(coords[1])

        else:
            for label, key in _SIMPLE_FIELDS.items():
                if line.startswith(label):
                    # Slice off only the leading label so the same text inside
                    # the value is preserved.
                    data[key] = line[len(label):].strip()
                    break

        i += 1

    return data


def parse_attractions(content):
    """Parse numbered attraction entries from raw text into a list of dicts.

    An entry starts at a line beginning with ``<number>. `` (for example
    ``"1. Nike Art Gallery"``); the rest of that line is the title. Text
    before the first numbered heading is ignored. Within an entry, lines of
    the form ``Label: value`` fill the matching key; unrecognized lines are
    skipped.

    Parameters
    ----------
    content : str
        Full text containing zero or more numbered entries.

    Returns
    -------
    list of dict
        One dict per entry with keys ``title``, ``name``, ``state``, ``city``,
        ``lga``, ``description``, ``entry_fees`` (list of str),
        ``opening_hours``, ``gps_lat`` / ``gps_lon`` (float or None),
        ``history``, ``best_time_to_visit``, ``safety_info``,
        ``contact_info``, ``available_activities`` and ``address``.
        Text fields default to ``""`` when absent.
    """
    # Anchor on line starts (multiline mode) so a heading on the very first
    # line of the text is recognized too; element 0 is the preamble before the
    # first heading (empty when the text starts with a heading) and is dropped.
    entries = re.split(r"(?m)^\d+\.\s+", content)[1:]
    return [_parse_entry(entry) for entry in entries]


In [5]:
# Parse the loaded text into one dict per attraction; the bare name on the
# last line makes the notebook display the whole list.
parsed= parse_attractions(content)
parsed

[{'title': 'Nike Art Gallery',
  'name': 'Nike Centre for Art and Culture',
  'state': 'Lagos State',
  'city': 'Lekki Phase 1',
  'lga': 'Eti-Osa LGA',
  'description': "West Africa's largest art gallery, housed in a magnificent four-story building. It holds thousands of artworks including paintings, textiles, and metalwork.",
  'entry_fees': [],
  'opening_hours': 'Daily, 10:00 AM – 6:00 PM',
  'gps_lat': 6.4526,
  'gps_lon': 3.4851,
  'history': 'Founded by Chief Nike Davies-Okundaye, a world-renowned batik and textile artist, to promote Nigerian culture and empower artists.',
  'best_time_to_visit': 'Weekdays for a quieter experience; weekends if you want to meet other art lovers.',
  'safety_info': 'Very safe location. Be careful with delicate artworks when moving through narrow aisles.',
  'contact_info': '+234 803 409 6656',
  'available_activities': 'Art appreciation, purchasing art, meeting the artist (Chief Nike is often there), textile workshops (by appointment).',
  'addres

## Cleaning the parsed data

In [6]:
def clean_parsed_attractions(parsed_list):
    """Return normalized shallow copies of parsed attraction dicts.

    For every record:

    - text fields are stripped of surrounding whitespace, and empty strings
      become ``None``;
    - ``entry_fees`` items are stripped of surrounding spaces and bullet
      characters (``-``, ``*``, ``\u2022``); empty items are dropped and an
      empty list becomes ``None``;
    - ``gps_lat`` / ``gps_lon`` are kept only when they are real numbers
      (``bool`` is rejected); anything else becomes ``None``.

    Keys missing from a record are added with the value ``None`` so every
    cleaned record has the same shape. Values of unexpected types in text
    fields or ``entry_fees`` are left as they are.

    Parameters
    ----------
    parsed_list : iterable of dict
        Records such as those produced by ``parse_attractions``.

    Returns
    -------
    list of dict
        New dicts; the input dicts and their fee lists are not modified.
    """
    text_fields = (
        "title", "name", "state", "city", "lga", "description",
        "opening_hours", "history", "best_time_to_visit",
        "safety_info", "contact_info", "available_activities",
        "address",
    )
    cleaned = []

    for item in parsed_list:
        data = item.copy()

        for field in text_fields:
            value = data.get(field)
            if isinstance(value, str):
                value = value.strip() or None
            data[field] = value

        fees = data.get("entry_fees")
        if isinstance(fees, list):
            stripped_fees = [fee.strip(" -*\u2022") for fee in fees]
            fees = [fee for fee in stripped_fees if fee] or None
        data["entry_fees"] = fees

        for coord_key in ("gps_lat", "gps_lon"):
            value = data.get(coord_key)
            # bool is a subclass of int, so exclude it explicitly.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                value = None
            data[coord_key] = value

        cleaned.append(data)

    return cleaned


In [7]:
# Normalized copies of the parsed records (blank strings and empty fee lists become None).
cleaned = clean_parsed_attractions(parsed)

## Saving to JSON

In [8]:
# NOTE(review): this import sits in a late cell, while the notebook's other
# imports are in the first cell.
import json
# Write the cleaned records to cleaned_tourist.json in the working directory;
# ensure_ascii=False keeps non-ASCII characters readable instead of \uXXXX escapes.
with open("cleaned_tourist.json", "w", encoding="utf-8") as f:
    json.dump(cleaned, f, indent=4, ensure_ascii=False)