# TXT to JSON Processor (Unstructured Data)

This notebook processes a TXT file: it extracts the text, splits it into sentences, cleans the data (translating it when needed), and saves the result as a JSON file.

### Step 1: Import Necessary Libraries

In [None]:
%pip install nltk contractions

In [None]:
import os
import json

import nltk
import contractions
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tkinter as tk
from tkinter import filedialog
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))
import model.language_manager.lang_detector as LanguageDetector
import model.language_manager.lang_translator as LanguageTranslator
import model.language_manager.lang_spelling as LanguageSpellChecker

# Fetch the NLTK data used below: tokenizer models, stop-word lists and WordNet.
# Recent NLTK releases load "punkt_tab" for sent_tokenize/word_tokenize instead
# of the older pickled "punkt" models; "punkt" is still fetched for older releases.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

### Step 2: Load the TXT File

In [None]:
# Path of the TXT file chosen in the GUI; stays None until select_file() stores a selection.
selected_file_path = None  

# NOTE(review): this lemmatizer is not referenced by the code visible in this notebook
# (Step 5 builds its own instance in `lemmatizers`) — confirm before removing.
lemmatizer = WordNetLemmatizer()
def select_file():
    """Ask the user for a TXT file and preview it in the GUI.

    On success the chosen path is stored in the module-level
    ``selected_file_path``, the label shows the file name, the text widget
    shows the file content and the "Load File" button is displayed.

    If the dialog is cancelled, or the file cannot be opened/decoded as
    UTF-8, the stored selection is cleared and the "Load File" button is
    hidden, so a stale or unreadable path is never carried forward.
    """
    global selected_file_path
    file_path = filedialog.askopenfilename(
        title="Select a TXT file",
        filetypes=(("Text Files", "*.txt"), ("All Files", "*.*"))
    )

    if not file_path:
        # Dialog cancelled: drop any earlier selection so the stored path
        # matches what the label tells the user.
        selected_file_path = None
        label.config(text="No file selected")
        button_confirm.pack_forget()
        return

    # Read the file before touching any state, so a failure leaves nothing
    # half-updated. "utf-8-sig" also accepts plain UTF-8 and hides a BOM.
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            content = file.read()
    except (OSError, UnicodeDecodeError) as exc:
        selected_file_path = None
        label.config(text=f"Could not read file: {exc}")
        button_confirm.pack_forget()
        return

    selected_file_path = file_path
    label.config(text=os.path.basename(file_path))  # Show only filename
    text_widget.delete("1.0", tk.END)
    text_widget.insert(tk.END, content)
    button_confirm.pack(pady=10)

def confirm_file():
    """Accept the current selection by closing the file-picker window."""
    # Destroying the root window ends window.mainloop(), so the cell continues.
    window.destroy()

# --- GUI SETUP ---
# Root window for the file picker.
window = tk.Tk()
window.title("Select a TXT File")
window.geometry("700x400+100+100")

# Create every widget first ...
label = tk.Label(window, text="No file selected", width=80)
button_select = tk.Button(window, text="Select File", command=select_file)
button_confirm = tk.Button(window, text="Load File", command=confirm_file)
text_widget = tk.Text(window, width=80, height=10)

# ... then lay them out top to bottom. "Load File" is intentionally not packed
# here: select_file() packs it once a file has been chosen.
label.pack(pady=20)
button_select.pack(pady=10)
text_widget.pack(pady=10)

# Blocks until the window is destroyed.
window.mainloop()

# --- After GUI closes ---
print(
    f"\nConfirmed TXT file path: {selected_file_path}"
    if selected_file_path
    else "\nNo file selected."
)


### Step 3: Extract Text from the TXT

In [None]:
# Function to extract text from TXT
def extract_text_from_txt(txt_path):
    """Return the full content of the text file at *txt_path*.

    The file is decoded as UTF-8. The "utf-8-sig" codec is used so that a
    leading byte-order mark (written by some Windows editors) is dropped
    instead of ending up glued to the first word of the text; files without
    a BOM decode exactly as with plain "utf-8".

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    with open(txt_path, "r", encoding="utf-8-sig") as f:
        return f.read()

# Extract text from the TXT
if not selected_file_path:
    # Without a selection, open(None) would fail with an opaque TypeError;
    # stop here with a message that says what to do instead.
    raise RuntimeError("No TXT file selected. Re-run Step 2 and choose a file first.")
text = extract_text_from_txt(selected_file_path)

# Display a snippet of the extracted text
print("Extracted Text (First 500 characters):\n")
print(text[:500])

### Step 4: Split Text into Sentences

In [None]:
# Before sentence tokenization: expand contractions across the whole text
text = contractions.fix(text)

# Split the text into sentences
# NOTE(review): `sentences` is not referenced by the later cells shown here;
# the structured data below is built line by line instead.
sentences = sent_tokenize(text)

lines = text.strip().split('\n')  # Split the entire text by line

# Build structured sentence data from each line (Tag + Sentence).
# The first whitespace-separated word is the tag and the remainder is the
# sentence. Splitting on any whitespace (not only a single space) means a tab
# between tag and sentence is handled too. Lines with no remainder are skipped.
sentence_data = []
for line in lines:
    parts = line.split(None, 1)
    if len(parts) == 2:
        sentence_data.append({
            "tag": parts[0].upper(),
            "sentence": parts[1].strip(),
        })

# Display the first few sentences
print("Extracted Sentences:")
for entry in sentence_data[:5]:
    print(f"Tag: {entry['tag']}\nSentence: {entry['sentence']}\n")

### Step 5: Perform Data Cleaning and Translation

In [None]:
# Initialize NLP tools
# Stop-word sets keyed by language name; only English is loaded here.
stopwords_dict = {"english": set(stopwords.words("english"))}
# Lemmatizers keyed by language name, mirroring stopwords_dict.
lemmatizers = {"english": WordNetLemmatizer()}
# Sentences already seen by clean_data() — it stores both the raw and the
# lemmatized form here and uses the set to drop duplicates.
processed_sentences = set()
# Entries that clean_data() returned, in input order.
cleaned_data = []

def preserve_place_names(sentence):
    """
    Swap every run of title-cased words (likely proper nouns) for a placeholder.

    A word counts when ``str.istitle()`` is true and it is longer than two
    characters. Each maximal run of one or more such words becomes a single
    ``__PLACE_<n>__`` token, numbered in order of appearance.

    Returns a tuple ``(rewritten_sentence, placeholder_map)`` where the map
    goes from placeholder to the original space-joined phrase. Words are
    re-joined with single spaces.
    """
    placeholder_map = {}
    rebuilt = []
    pending_run = []  # title-cased words collected since the last ordinary word

    def flush_run():
        # Turn the collected run (if any) into one placeholder token.
        if pending_run:
            token = f"__PLACE_{len(placeholder_map)}__"
            placeholder_map[token] = " ".join(pending_run)
            rebuilt.append(token)
            pending_run.clear()

    for word in sentence.split():
        if word.istitle() and len(word) > 2:
            pending_run.append(word)
        else:
            flush_run()
            rebuilt.append(word)
    flush_run()  # a run may extend to the end of the sentence

    return " ".join(rebuilt), placeholder_map


def restore_place_names(sentence, placeholder_map):
    """Put the original phrases back in place of their placeholders.

    Every occurrence of each key of *placeholder_map* in *sentence* is
    replaced by the mapped phrase; the resulting string is returned.
    """
    restored = sentence
    for token in placeholder_map:
        restored = restored.replace(token, placeholder_map[token])
    return restored


# Punctuation characters that let a non-alphanumeric token survive filtering.
_KEPT_PUNCTUATION = frozenset("!@#$%^&*()-_=+[]{}|;:'\",.<>?")


def clean_data(entry, idx, total):
    """Clean one tagged sentence in place and return it, or None to drop it.

    Pipeline: skip empty or already-seen input, detect the language,
    translate non-English text to English (with title-cased phrases swapped
    for placeholders during translation and spell correction), spell-correct,
    expand contractions, tokenize the lower-cased text, drop English stop
    words, lemmatize, and skip results that duplicate an earlier output.

    Args:
        entry: dict with a "sentence" key; on success that value is
            overwritten with the cleaned sentence. Other keys are untouched.
        idx: zero-based position of the entry, used only in progress output.
        total: number of entries being processed, used only in progress output.

    Returns:
        The same ``entry`` object, or ``None`` when the sentence is empty,
        a duplicate, or has no tokens left after filtering.

    Side effects:
        Adds the raw and the cleaned sentence to the module-level
        ``processed_sentences`` set and prints progress information.
    """
    raw_sentence = entry["sentence"].strip()

    if not raw_sentence or raw_sentence in processed_sentences:
        return None

    processed_sentences.add(raw_sentence)

    print(f"\nProcessing sentence {idx+1}/{total}:")
    print(f"Original: {raw_sentence}")

    # STEP 1: Detect language first
    detected_lang = LanguageDetector.detect_language(raw_sentence)
    print(f"Detected Language: {detected_lang}")

    if detected_lang != "en":
        # STEP 2: Protect likely place names before translation
        safe_sentence, placeholder_map = preserve_place_names(raw_sentence)
        translated = str(LanguageTranslator.translate_text(safe_sentence, "en"))
        # Spell-correct while the placeholders are still in place, then restore
        # the original names afterwards so the corrector does not rewrite them.
        translated = LanguageSpellChecker.correct_spelling(translated)
        sentence = restore_place_names(translated, placeholder_map)
        # Expand contractions like it's -> it is
        sentence = contractions.fix(sentence)
        print(f"Translated, corrected, and expanded: {sentence}")
    else:
        sentence = LanguageSpellChecker.correct_spelling(raw_sentence)
        sentence = contractions.fix(sentence)
        print(f"Corrected & expanded: {sentence}")

    # Tokenize after all corrections (applies to both translated and native English)
    tokens = word_tokenize(sentence.lower())
    english_stopwords = stopwords_dict["english"]
    # Keep alphanumeric tokens and tokens containing listed punctuation,
    # then remove stop words.
    filtered_tokens = [
        word for word in tokens
        if (word.isalnum() or not _KEPT_PUNCTUATION.isdisjoint(word))
        and word not in english_stopwords
    ]

    lemmatize = lemmatizers["english"].lemmatize
    lemmatized_tokens = [lemmatize(word) for word in filtered_tokens]

    if not lemmatized_tokens:
        print("Skipping sentence after lemmatization as it's empty.")
        return None

    lemmatized_sentence = " ".join(lemmatized_tokens)

    # The raw sentence was added to processed_sentences above. When cleaning
    # leaves the text unchanged (e.g. "hello world"), the membership test would
    # match that very entry and wrongly discard the sentence as a duplicate of
    # itself. In that case the text cannot have been in the set beforehand
    # (the early return would have fired), so it is safe to keep.
    if lemmatized_sentence != raw_sentence and lemmatized_sentence in processed_sentences:
        return None

    processed_sentences.add(lemmatized_sentence)
    entry["sentence"] = lemmatized_sentence
    print(f"Lemmatized and processed sentence: {lemmatized_sentence}")
    return entry


# Run every entry through clean_data() and keep the ones it does not reject
entry_count = len(sentence_data)
for position, item in enumerate(sentence_data):
    cleaned = clean_data(item, position, entry_count)
    if not cleaned:
        continue
    cleaned_data.append(cleaned)

# Replace original sentence_data with the cleaned entries that have a non-blank sentence
sentence_data = [item for item in cleaned_data if item["sentence"].strip()]

print("Preprocessing completed.\n")
print("Sample of cleaned and corrected sentences:")
for item in sentence_data[:5]:
    print(f"[{item['tag']}] {item['sentence']}")


### Step 6: Save the Sentences to a JSON File

In [None]:
# Name the output after the input file, minus its extension
selected_filename = os.path.basename(selected_file_path)
selected_filename_wo_ext = os.path.splitext(selected_filename)[0]
output_file_path = f"../../data/processed/processed_{selected_filename_wo_ext}.json"

# Create the target directory if it is missing
output_dir = os.path.dirname(output_file_path)
os.makedirs(output_dir, exist_ok=True)

# Write the cleaned entries as pretty-printed JSON, keeping non-ASCII text as is
with open(output_file_path, "w", encoding="utf-8") as json_file:
    json_file.write(json.dumps(sentence_data, indent=4, ensure_ascii=False))

print(f"Processed JSON data saved to {output_file_path}")


### - END