# PDF to JSON Processor (Unstructured Data)

This notebook processes a PDF file: it extracts the text (falling back to OCR when the PDF contains no extractable text), splits the text into sentences, cleans each sentence (translating it to English when another language is detected), and saves the result as a JSON file.

### Step 1: Import Necessary Libraries

In [None]:
%pip install PyPDF2 pytesseract pdf2image nltk
%pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import os
import json
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import pytesseract
import contractions
import nltk
import tkinter as tk
from tkinter import filedialog, messagebox
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from nltk import WordNetLemmatizer
import tkinter as tk
import os
import json
import spacy
nlp = spacy.load("en_core_web_sm")

import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))
import model.language_manager.lang_detector as LanguageDetector
import model.language_manager.lang_translator as LanguageTranslator
import model.language_manager.lang_spelling as LanguageSpellChecker

# Download every NLTK resource this notebook relies on, not just the sentence
# tokenizer: `stopwords` backs stopwords.words("english"), `wordnet` backs
# WordNetLemmatizer, and `punkt_tab` is the tokenizer data used by newer NLTK
# releases in place of `punkt`.
for nltk_resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

### Step 2: Load the PDF File

In [None]:
# Tourism relevance settings
# NOTE: the three vocabularies below are only defined here; no cell visible in
# this notebook section reads them.
tourism_keywords = [
    "flight",
    "hotel",
    "trip",
    "tour",
    "travel",
    "tourist",
]
relaxed_terms = {
    "day",
    "weather",
    "recommend",
    "place",
    "where",
    "when",
    "how much",
    "suggest",
    "good",
    "beautiful",
    "scenery",
    "things to do",
    "what to see",
    "how to get",
    "trip",
}
tourism_reference = [
    "travel",
    "tourism",
    "flight",
    "hotel",
    "vacation",
    "trip",
    "tourist",
    "beach",
    "accommodation",
    "sightseeing",
    "cruise",
    "spot",
]

# Tag label -> comma-separated vocabulary describing that tag.
tag_definitions = {
    "TOURISM_ACTIVITY": "tour, visit, explore, sightseeing, activity, adventure",
    "ACCOMMODATION": "hotel, resort, hostel, stay, accommodation, lodge",
    "TRANSPORTATION": "flight, train, bus, transport, airport, travel",
    "DESTINATION": "beach, city, location, destination, attraction",
    "FOOD": "restaurant, local cuisine, dish, food, dining",
    "BOOKING": "book, reservation, schedule, availability, check-in",
    "COST": "price, fee, rate, budget, cost, charge",
    "SAFETY": "safe, emergency, secure, risk, health",
    "SEASONALITY": "weather, season, best time, climate",
    "CULTURE": "tradition, history, cultural, heritage, museum",
}

# Parallel lists: tag_sentences[i] is the vocabulary of tag_labels[i].
tag_labels = [tag_name for tag_name in tag_definitions]
tag_sentences = [tag_definitions[tag_name] for tag_name in tag_labels]

# Fit TF-IDF on the tag vocabularies; tag_vectors holds one row per tag, in
# the same order as tag_labels.
tfidf_vectorizer = TfidfVectorizer()
tag_vectors = tfidf_vectorizer.fit_transform(tag_sentences)

# Shared lemmatizer instance used by the cleaning step.
lemmatizer = WordNetLemmatizer()
# --- GUI to select a PDF file ---
# Path picked in the file dialog; remains None until a file is chosen.
selected_pdf_path = None


def select_file():
    """Open a file dialog and reflect the user's choice in the window.

    When a file is chosen, its path is stored in the module-level
    ``selected_pdf_path``, the label shows the file name, the text area shows
    the full path, and the confirm button is made visible.  When the dialog is
    cancelled, the label and text area are reset and the confirm button is
    hidden; ``selected_pdf_path`` is left as it was.
    """
    global selected_pdf_path

    chosen_path = filedialog.askopenfilename(
        filetypes=[("PDF Files", "*.pdf"), ("All Files", "*.*")],
        title="Select a PDF File",
    )

    # Dialog cancelled: clear the display and hide the confirm button.
    if not chosen_path:
        label.config(text="No file selected")
        text_widget.delete(1.0, tk.END)
        button_confirm.pack_forget()
        return

    selected_pdf_path = chosen_path
    label.config(text=os.path.basename(chosen_path))
    text_widget.delete(1.0, tk.END)
    text_widget.insert(tk.END, f"Selected File:\n{selected_pdf_path}")
    button_confirm.pack(pady=10)

def confirm_file():
    """Close the selection window once a file has been chosen.

    Shows a warning dialog instead of closing when ``selected_pdf_path`` is
    still empty.
    """
    if not selected_pdf_path:
        messagebox.showwarning("No Selection", "Please select a file before confirming.")
        return
    window.destroy()

# --- GUI Setup ---
# Build the file-selection window. Widgets are packed top to bottom in the
# order they are packed below, so statement order defines the layout.
window = tk.Tk()
window.title("Select a PDF File")
# Tk geometry string: width x height, followed by the x and y screen offsets.
window.geometry("800x400+100+100")

# Shows the selected file name (updated by select_file).
label = tk.Label(window, text="No file selected", width=100)
label.pack(pady=20)

button_select = tk.Button(window, text="Select File", command=select_file)
button_select.pack(pady=10)

# Deliberately not packed here: select_file packs this button after a file has
# been chosen and hides it again when the dialog is cancelled.
button_confirm = tk.Button(window, text="Load File", command=confirm_file)

# Shows the full path of the selected file (filled in by select_file).
text_widget = tk.Text(window, width=100, height=10)
text_widget.pack(pady=10)

# Blocks until the window is destroyed (confirm_file calls window.destroy()).
window.mainloop()

# --- After GUI closes ---
# NOTE(review): selected_pdf_path is set as soon as a file is picked in
# select_file, so it can be non-empty here even if the window was closed
# without pressing "Load File" — confirm whether that is intended.
if selected_pdf_path:
    print(f"\n✅ Confirmed PDF file path: {selected_pdf_path}")
else:
    print("\n❌ No file selected.")

#### Prompt for Page Range

In [None]:
# Fail early with a clear message when no file was chosen in the selection
# window; PdfReader(None) would otherwise fail with an unrelated-looking error.
if not selected_pdf_path:
    raise ValueError("No PDF file selected. Run the file selection cell first.")

# Create a PdfReader instance
reader = PdfReader(selected_pdf_path)

# Check the number of pages in the PDF
num_pages = len(reader.pages)

# (None, None) means "process every page". The range is only overwritten after
# it has passed validation, so a rejected input never leaks into later cells.
start_page, end_page = None, None

if num_pages > 1:
    # Prompt the user for page range
    page_range_input = input("Do you want to extract text from all pages? (Y/N): ").strip().lower()

    # Accept both "n" and "no"; any other answer keeps the all-pages default.
    if page_range_input in ("n", "no"):
        # int() raises ValueError when the input is not a whole number.
        requested_start = int(input("Enter the starting page number (must be at least 1): "))
        requested_end = int(input("Enter the ending page number: "))

        # Validate the page range (1-based, inclusive on both ends)
        if requested_start < 1:
            raise ValueError("Starting page number must be at least 1.")
        if requested_end < requested_start:
            raise ValueError("Ending page number must be greater than or equal to the starting page number.")
        if requested_end > num_pages:
            raise ValueError(f"Ending page number must be at most {num_pages}.")

        start_page, end_page = requested_start, requested_end
else:
    print("The PDF has only one page. Extracting text from the single page.")

### Step 3: Extract Text from the PDF

In [None]:
# Function to extract text from PDF with optional page range
def extract_text_from_pdf(pdf_path, start_page=None, end_page=None):
    """Return the text layer of a PDF, optionally limited to a page range.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: 1-based number of the first page to read, or None to start
            at the first page.
        end_page: 1-based number of the last page to read (inclusive), or None
            to read through the last page.

    Returns:
        The text of the selected pages joined with newlines.  Pages for which
        no text is extracted contribute an empty string, so the result
        contains only whitespace when none of the pages yields text.
    """
    reader = PdfReader(pdf_path)

    # Determine the range of pages to process. Either bound may be omitted.
    pages_to_process = reader.pages
    if start_page is not None or end_page is not None:
        first_index = 0 if start_page is None else start_page - 1
        # end_page is inclusive and 1-based, which equals the exclusive
        # 0-based slice stop; a stop of None runs through the last page.
        pages_to_process = reader.pages[first_index:end_page]

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next (this also matches how the OCR text is joined).
    return "\n".join(page.extract_text() or "" for page in pages_to_process)

# Extract text from the PDF based on the page range
text = extract_text_from_pdf(selected_pdf_path, start_page, end_page)

# If no text is extracted, use OCR
if not text.strip():
    print("No text found in the PDF. Attempting OCR...")
    # Render only the requested pages instead of rasterising the whole
    # document and slicing afterwards. first_page/last_page are 1-based and
    # inclusive, like start_page/end_page; None leaves that side unbounded.
    images = convert_from_path(
        selected_pdf_path,
        first_page=start_page,
        last_page=end_page,
    )
    text = "\n".join(pytesseract.image_to_string(image) for image in images)

# Display a snippet of the extracted text
print("Extracted Text (First 500 characters):\n")
print(text[:500])

### Step 4: Split Text into Sentences

In [None]:
# Break the extracted text into sentences with NLTK's sentence tokenizer
sentences = sent_tokenize(text)

# Wrap every sentence in a one-key record: {"sentence": <text>}
sentence_data = [dict(sentence=sentence_text) for sentence_text in sentences]

# Preview the first five records
print("Extracted Sentences:")
print(sentence_data[:5])

### Step 5: Perform Data Cleaning and Translation

In [None]:
# Step 5: Perform Data Cleaning and Translation
# Per-language resources, keyed by language name (only English is set up).
stopwords_dict = dict(english=set(stopwords.words("english")))
lemmatizers = dict(english=WordNetLemmatizer())

# Sentences already emitted, used by clean_data to drop duplicates.
processed_sentences = set()
# Output records collected by the cleaning loop.
cleaned_data = []

def preserve_place_names(sentence):
    """Swap place-name entities in *sentence* for opaque placeholders.

    The sentence is run through the spaCy pipeline ``nlp``; every entity
    labelled GPE, LOC or FAC has all occurrences of its text replaced by
    ``__PLACE_<i>__``, where ``<i>`` is the entity's position among all
    entities of the sentence.

    Returns:
        A ``(masked_sentence, placeholder_map)`` tuple, where
        ``placeholder_map`` maps each placeholder to the original entity text.
    """
    # GPE: countries, cities, states; LOC: non-GPE locations; FAC: buildings, airports, etc.
    place_labels = {"GPE", "LOC", "FAC"}

    placeholder_map = {}
    for position, entity in enumerate(nlp(sentence).ents):
        if entity.label_ not in place_labels:
            continue
        marker = f"__PLACE_{position}__"
        place_text = entity.text
        sentence = sentence.replace(place_text, marker)
        placeholder_map[marker] = place_text
    return sentence, placeholder_map
def assign_tag(cleaned_text):
    """Return the tag label whose vocabulary is most similar to *cleaned_text*.

    The text is vectorised with the TF-IDF model fitted on the tag
    vocabularies and compared to every tag vector by cosine similarity; the
    label at the position of the highest score is returned.

    NOTE(review): when all scores tie (for example all zero because the text
    shares no term with any tag), argmax is expected to pick index 0, i.e. the
    first label — confirm that this default is acceptable.
    """
    text_vector = tfidf_vectorizer.transform([cleaned_text])
    scores = cosine_similarity(text_vector, tag_vectors)
    return tag_labels[scores.argmax()]

def restore_place_names(sentence, placeholder_map):
    """Put the original place names back in place of their placeholders.

    Args:
        sentence: Text that may contain placeholder tokens.
        placeholder_map: Mapping of placeholder token -> original text.

    Returns:
        The sentence with every occurrence of each placeholder replaced, the
        replacements being applied one after another in the mapping's
        iteration order.
    """
    restored = sentence
    for marker in placeholder_map:
        restored = restored.replace(marker, placeholder_map[marker])
    return restored


# Raw sentences already handed to clean_data. Kept separate from
# processed_sentences (which stores cleaned forms) so a raw sentence that
# happens to equal its own cleaned form is not mistaken for a duplicate.
_seen_raw_sentences = set()


def clean_data(entry, idx, total):
    """Clean one sentence record and tag it.

    Steps: detect the language, mask place names, translate to English when
    the detected language is not "en", spell-correct, restore place names,
    expand contractions, then lowercase, tokenize, drop non-alphanumeric
    tokens and English stop words, and lemmatize.

    Args:
        entry: Record with a "sentence" key holding the raw sentence text.
        idx: 0-based position of the record, used only for progress output.
        total: Total number of records, used only for progress output.

    Returns:
        ``{"tag": <label>, "sentence": <cleaned text>}``, or None when the
        sentence is empty, purely numeric, already seen, shorter than three
        tokens after cleaning, or a duplicate of an earlier cleaned sentence.
    """
    sentence = entry["sentence"].strip()
    if not sentence or sentence.isdigit():
        return None
    # Skip exact repeats before the expensive detection/translation work.
    # processed_sentences only ever receives cleaned forms, so on its own it
    # cannot recognise a repeated raw sentence.
    if sentence in _seen_raw_sentences or sentence in processed_sentences:
        return None
    _seen_raw_sentences.add(sentence)

    print(f"\n[{idx+1}/{total}] Original: {sentence}")
    detected_lang = LanguageDetector.detect_language(sentence)

    # 🔒 Protect named entities before translation and spell correction
    safe_sentence, placeholder_map = preserve_place_names(sentence)

    if detected_lang != "en":
        translated = str(LanguageTranslator.translate_text(safe_sentence, "en"))
    else:
        translated = safe_sentence

    corrected = LanguageSpellChecker.correct_spelling(translated)
    restored = restore_place_names(corrected, placeholder_map)

    print(f"→ Translated + corrected: {restored}")
    sentence = contractions.fix(restored)

    # ✂ Tokenize and clean
    english_stopwords = stopwords_dict["english"]  # looked up once, not per token
    tokens = word_tokenize(sentence.lower())
    filtered = [w for w in tokens if w.isalnum() and w not in english_stopwords]
    lemmatized = [lemmatizer.lemmatize(w) for w in filtered]

    if len(lemmatized) < 3:
        print("→ Skipped: too short after cleaning")
        return None

    final = " ".join(lemmatized)
    if final in processed_sentences:
        print("→ Skipped: duplicate")
        return None

    processed_sentences.add(final)
    print(f"→ Final cleaned: {final}")
    return {
        # Tag the cleaned, lemmatized text: the tag vocabularies are written
        # in base form ("hotel", "flight"), so inflected words in the
        # uncleaned sentence ("hotels", "flights") would not match them.
        "tag": assign_tag(final),
        "sentence": final
    }


# Run cleaning
total = len(sentence_data)

# Clean the records lazily, in order, and keep only those clean_data accepted
# (it returns None for anything it skips).
cleaning_results = (
    clean_data(record, position, total)
    for position, record in enumerate(sentence_data)
)
cleaned_data.extend(outcome for outcome in cleaning_results if outcome)

# From here on sentence_data refers to the cleaned, tagged records.
sentence_data = cleaned_data


### Step 6: Save the Sentences to a JSON File

In [None]:
# --- Derive the output path from the PDF's file name (extension removed) ---
pdf_basename = os.path.basename(selected_pdf_path)
pdf_filename_only = os.path.splitext(pdf_basename)[0]
output_file_path = f"../../data/processed/processed_{pdf_filename_only}.json"

# --- Ensure the output directory exists ---
output_directory = os.path.dirname(output_file_path)
os.makedirs(output_directory, exist_ok=True)

# --- Save sentences to JSON file ---
# ensure_ascii=False keeps non-ASCII characters readable in the output file.
serialized_sentences = json.dumps(sentence_data, indent=4, ensure_ascii=False)
with open(output_file_path, "w", encoding="utf-8") as json_file:
    json_file.write(serialized_sentences)

print(f"✅ Processed JSON data saved to {output_file_path}")

### - END