Step 0: Install required libraries

In [None]:
%pip install bs4
%pip install requests
%pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import nltk
import json
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import contractions
import sys
import spacy
nlp = spacy.load("en_core_web_sm")
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))
import model.language_manager.lang_detector as LanguageDetector
import model.language_manager.lang_translator as LanguageTranslator
import model.language_manager.lang_spelling as LanguageSpellChecker
# NLTK data used by this notebook: sentence/word tokenizer models, the English
# stopword list and WordNet for the lemmatizer.
nltk.download('punkt')
# NLTK 3.9+ loads the Punkt tokenizer from the 'punkt_tab' resource rather than
# the pickled 'punkt' models; without it sent_tokenize/word_tokenize raise
# LookupError. Fetching both keeps older and newer NLTK releases working.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Shared NLP helpers for the cleaning step: the English stopword set (keyed by
# language name) and a WordNet lemmatizer.
stopwords_dict = {"english": set(stopwords.words("english"))}
lemmatizer = WordNetLemmatizer()

# Each tag is described by a comma-separated string of indicative terms.
tag_definitions = {
    "TOURISM_ACTIVITY": "tour, visit, explore, sightseeing, activity, adventure",
    "ACCOMMODATION": "hotel, resort, hostel, stay, accommodation, lodge",
    "TRANSPORTATION": "flight, train, bus, transport, airport, travel",
    "DESTINATION": "beach, city, location, destination, attraction",
    "FOOD": "restaurant, local cuisine, dish, food, dining",
    "BOOKING": "book, reservation, schedule, availability, check-in",
    "COST": "price, fee, rate, budget, cost, charge",
    "SAFETY": "safe, emergency, secure, risk, health",
    "SEASONALITY": "weather, season, best time, climate",
    "CULTURE": "tradition, history, cultural, heritage, museum",
}

# Parallel lists: tag_labels[i] names the tag whose term string is tag_sentences[i].
tag_labels = [label for label in tag_definitions]
tag_sentences = [tag_definitions[label] for label in tag_labels]

# Fit TF-IDF on the tag term strings only; tag_vectors holds one row per tag and
# the fitted vectorizer is reused later to project cleaned sentences.
tfidf_vectorizer = TfidfVectorizer()
tag_vectors = tfidf_vectorizer.fit_transform(tag_sentences)

# Tourism relevance setup: single-word keywords plus a few looser multi-word
# phrases, flattened into one space-separated reference string.
# NOTE(review): tourism_reference is not read by any cell visible in this
# notebook - confirm whether a relevance filter was meant to use it.
tourism_keywords = [
    "flight", "hotel", "trip", "tour",
    "travel", "vacation", "sightseeing", "beach",
]
relaxed_terms = [
    "place to stay",
    "book a trip",
    "tourist destination",
]
tourism_reference = " ".join([*tourism_keywords, *relaxed_terms])

Step 1: Prompt the user for a URL

In [None]:
print("Step 1: Input URL")
url = input("Enter the website URL: ").strip()

# Fail here with a clear message rather than with a MissingSchema error in the
# fetch step.
if not url:
    raise ValueError("No URL entered.")

# requests requires an explicit scheme, and urlparse() only fills .netloc (used
# later to name the output file) when one is present. Assume HTTPS for inputs
# such as "example.com".
if "://" not in url:
    url = "https://" + url

Step 2: Fetch HTML content

In [None]:
print("Step 2: Fetching webpage...")

# Send a desktop Chrome User-Agent string with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
}

# requests has no default timeout: without one an unresponsive server blocks
# this cell indefinitely. Tuple is (connect timeout, read timeout) in seconds.
response = requests.get(url, headers=headers, timeout=(10, 30))
# Turn 4xx/5xx responses into an exception instead of parsing an error page.
response.raise_for_status()
html_content = response.text
print("Webpage fetched successfully!\n")

Step 3: Parse the HTML

In [None]:
print("Step 3: Parsing HTML...")
# Build the parse tree from the fetched markup using the "html.parser" backend.
soup = BeautifulSoup(markup=html_content, features="html.parser")

Step 4: Extract paragraphs

In [None]:
# Collect the whitespace-stripped text of every <p> element, dropping empty ones.
paragraphs = [
    text
    for text in (tag.get_text(strip=True) for tag in soup.find_all('p'))
    if text
]

Step 5: Split paragraphs into sentences

In [None]:
print("Step 5: Splitting into sentences...")

# Flatten: each paragraph contributes its sentences, in document order.
sentences = [
    sentence
    for paragraph in paragraphs
    for sentence in nltk.sent_tokenize(paragraph)
]

# Wrap every sentence in a record keyed by "sentence" (no index is stored).
sentence_data = [{"sentence": sentence} for sentence in sentences]

print(f" Extracted {len(sentence_data)} sentences.\n")

Step 6: Clean and filter sentences


In [None]:
# Step 6: Clean and Filter Sentences
# processed_sentences: strings already emitted, used for duplicate detection.
# cleaned_sentences: the records that survive cleaning, in input order.
# Both are re-created on every run of this cell.
processed_sentences, cleaned_sentences = set(), []

def preserve_place_names(sentence):
    """Mask place-like named entities so later text rewriting leaves them intact.

    Runs the spaCy pipeline ``nlp`` on ``sentence`` and swaps each entity
    labelled GPE, LOC or FAC for a ``__PLACE_<i>__`` token, where ``<i>`` is the
    entity's position in ``doc.ents``.

    Only the exact character span spaCy tagged is replaced. A global
    ``str.replace`` on the entity text would also hit the same letters inside
    other words (e.g. "Paris" inside "Parisian") or inside a longer entity.

    Args:
        sentence: The raw sentence text.

    Returns:
        tuple: ``(masked_sentence, placeholder_map)``, where ``placeholder_map``
        maps each placeholder token to the original entity text.
    """
    # GPE: countries, cities, states; LOC: non-GPE locations; FAC: buildings, airports, etc.
    place_labels = {"GPE", "LOC", "FAC"}

    doc = nlp(sentence)
    places = [(i, ent) for i, ent in enumerate(doc.ents) if ent.label_ in place_labels]
    placeholder_map = {f"__PLACE_{i}__": ent.text for i, ent in places}

    # Splice right-to-left so the character offsets of entities further left
    # remain valid while the string changes length.
    for i, ent in reversed(places):
        sentence = sentence[:ent.start_char] + f"__PLACE_{i}__" + sentence[ent.end_char:]

    return sentence, placeholder_map

def restore_place_names(sentence, placeholder_map):
    """Substitute every placeholder token in ``sentence`` with its original text.

    Args:
        sentence: Text that may contain placeholder tokens.
        placeholder_map: Mapping of placeholder token -> original text.

    Returns:
        The sentence with all occurrences of each placeholder replaced.
    """
    restored = sentence
    for token in placeholder_map:
        restored = restored.replace(token, placeholder_map[token])
    return restored
def assign_tag(cleaned_text):
    """Return the label of the tag whose term vector best matches ``cleaned_text``.

    The text is projected with the already-fitted ``tfidf_vectorizer`` and
    compared against ``tag_vectors`` by cosine similarity; the label at the
    highest-scoring position of ``tag_labels`` is returned.

    Note: when every score is equal (for instance all zero because the text
    shares no vocabulary with any tag), ``argmax`` yields index 0, so the first
    label is returned.
    """
    scores = cosine_similarity(tfidf_vectorizer.transform([cleaned_text]), tag_vectors)
    return tag_labels[scores.argmax()]
def clean_data(entry, idx, total):
    """Normalise one extracted sentence and label it with a tag.

    Pipeline: strip -> detect language -> mask place names -> translate to
    English when the detected language is not "en" -> spell-correct -> unmask
    place names -> expand contractions -> lowercase, tokenize, drop
    non-alphanumeric tokens and English stopwords -> lemmatize -> tag.

    Args:
        entry: Record with a "sentence" key holding the raw text.
        idx: Zero-based position of the record, used only in progress output.
        total: Number of records, used only in progress output.

    Returns:
        ``{"tag": ..., "sentence": ...}`` for a retained sentence, or ``None``
        when the input is blank, purely numeric, already in
        ``processed_sentences``, shorter than three tokens after cleaning, or a
        duplicate of an earlier cleaned sentence.

    Side effects:
        Prints progress lines and adds the cleaned sentence to the module-level
        ``processed_sentences`` set.
    """
    original = entry["sentence"].strip()
    if not original or original in processed_sentences or original.isdigit():
        return None

    print(f"\n[{idx+1}/{total}] Original: {original}")
    language = LanguageDetector.detect_language(original)

    # Mask place names before any step that may rewrite the text.
    masked, places = preserve_place_names(original)

    # Only text not detected as English goes through the translator.
    english = masked if language == "en" else str(LanguageTranslator.translate_text(masked, "en"))

    spell_checked = LanguageSpellChecker.correct_spelling(english)
    restored = restore_place_names(spell_checked, places)

    print(f"→ Translated + corrected: {restored}")
    expanded = contractions.fix(restored)

    # Tokenize the lowercased text, keep alphanumeric non-stopword tokens, lemmatize.
    english_stopwords = stopwords_dict["english"]
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(expanded.lower())
        if token.isalnum() and token not in english_stopwords
    ]

    if len(lemmas) < 3:
        print("→ Skipped: too short after cleaning")
        return None

    final = " ".join(lemmas)
    if final in processed_sentences:
        print("→ Skipped: duplicate")
        return None

    processed_sentences.add(final)
    tag = assign_tag(final)
    print(f"→ Final cleaned: {final} [TAG: {tag}]")
    return {"tag": tag, "sentence": final}


# Run cleaning: feed every record through clean_data in order and keep the
# truthy results (clean_data returns None for skipped sentences).
total = len(sentence_data)
cleaned_sentences.extend(
    record
    for record in (clean_data(entry, idx, total) for idx, entry in enumerate(sentence_data))
    if record
)

# Rebind sentence_data to the cleaned records; the save step reads this name.
# NOTE(review): no tourism-relevance filter runs in the visible code, despite
# the wording of the message below - confirm the intended behaviour.
sentence_data = cleaned_sentences
print(f"✅ Cleaned and retained {len(sentence_data)} tourism-related sentences.\n")


Step 7: Save to JSON

In [None]:
# Prepare output directory
output_dir = "../../data/processed"
os.makedirs(output_dir, exist_ok=True)

# Build a file-name-safe label from the URL's network location.
# - Drop any "user:password@" prefix so credentials never end up in a file name.
# - Keep letters, digits and "-"; every other character becomes "_". For plain
#   host names this matches the previous dot -> underscore behaviour, and it
#   also covers the ":" before a port, which is invalid in file names on some
#   platforms.
# - Fall back to "unknown" when the URL has no network location at all.
host = urlparse(url).netloc.rpartition("@")[2]
domain = "".join(ch if ch.isalnum() or ch == "-" else "_" for ch in host) or "unknown"
output_file_path = f"{output_dir}/processed_{domain}.json"

# Save structured list; ensure_ascii=False keeps non-ASCII text readable in the file.
with open(output_file_path, "w", encoding="utf-8") as f:
    json.dump(sentence_data, f, indent=4, ensure_ascii=False)

print(f"✅ Processed JSON data saved to {output_file_path}")