In [3]:
###
# CELL 1: IMPORTS, CLEANING FUNCTION, AND TESTING
###

import os  # <-- The import is now in the same cell
import re  # 're' is the regular expression library
import sys

print("--- Step 1: Imports successful ---\n")

def clean_line(text):
    """
    Cleans a single line of text from the corpus.
    - Removes the starting line number (e.g., "1 \t")
    - Converts to lowercase
    - Removes all punctuation, symbols, and standalone numbers
    - Normalizes whitespace
    """
    
    # 1. Strip the line number (e.g., "1 \t" or "10 \t")
    match = re.search(r'^\d+\t(.*)', text)
    if match:
        text = match.group(1)
    
    # 2. Force to lowercase
    text = text.lower()
    
    # 3. Remove punctuation, symbols, and numbers
    # This regex [^a-z\s] means "find anything that is NOT (^) a letter (a-z) or whitespace (\s)"
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 4. Normalize whitespace (replace multiple spaces/tabs with one)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# --- Now, let's run the test ---

# Construct the path to the corpus file
# '..' means "go up one directory" (from 'notebooks' to 'ai_model')
data_path = os.path.join('..', 'data', 'corpus.txt')

print(f"--- Step 2: Testing the cleaning function on: {data_path} ---\n")

try:
    with open(data_path, 'r', encoding='utf-8') as f:
        for i in range(20):
            line = f.readline()
            if not line:
                break # Stop if the file is shorter than 20 lines
            
            # Get the original line (and strip whitespace)
            original_line = line.strip()
            
            # Get the cleaned version
            cleaned_line = clean_line(original_line)
            
            # Print the comparison
            print(f"Original: {original_line}")
            print(f"Cleaned : {cleaned_line}\n")
    
    print("\n--- Step 3: Test complete ---")

except FileNotFoundError:
    print(f"ERROR: File not found at {data_path}")
    print("Please make sure 'corpus.txt' is in the 'ai_model/data/' folder.")
except Exception as e:
    print(f"An error occurred: {e}")



--- Step 1: Imports successful ---

--- Step 2: Testing the cleaning function on: ..\data\corpus.txt ---

Original: 1	£0.35 Waiting time Tariff 3 - Each period of one minute or part.
Cleaned : waiting time tariff each period of one minute or part

Original: 2	'05 Christian - Catholic' relabelled as '05 Christian - Roman Catholic' for consistency with census labels.
Cleaned : christian catholic relabelled as christian roman catholic for consistency with census labels

Original: 3	£0.87 per night, per person.
Cleaned : per night per person

Original: 4	½ mile south of the town, which is included in the Heritage Walk (Vale Trail 9).
Cleaned : mile south of the town which is included in the heritage walk vale trail

Original: 5	¾ mile you will see the entrance to Tŷ Cerrig Woodland Retreats on your left.
Cleaned : mile you will see the entrance to t cerrig woodland retreats on your left

Original: 6	½" x ø15 setting 4 bar with --- - short temperature probe.
Cleaned : x setting bar with sho

In [4]:
###
# CELL 2: "NOISING" FUNCTION AND TESTING
###

import random

def add_noise_to_sentence(sentence, noise_level=0.15):
    """
    Takes a clean sentence and randomly introduces typos (noise).
    
    noise_level: The probability (e.g., 0.15 = 15%) that a word will be "noised".
    """
    
    words = sentence.split()
    new_sentence_words = []
    
    # All possible letters for insertion/substitution
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    
    for word in words:
        # We only add noise if the word is long enough AND
        # a random chance (between 0.0 and 1.0) is below our noise level
        if random.random() < noise_level and len(word) > 3:
            
            # 1. Randomly pick a type of typo
            typo_type = random.choice(['delete', 'insert', 'substitute', 'swap'])
            
            if typo_type == 'delete':
                # Pick a random position to delete
                pos = random.randint(0, len(word) - 1)
                noised_word = word[:pos] + word[pos+1:]
            
            elif typo_type == 'insert':
                # Pick a random position to insert
                pos = random.randint(0, len(word))
                # Pick a random letter
                char = random.choice(alphabet)
                noised_word = word[:pos] + char + word[pos:]
            
            elif typo_type == 'substitute':
                # Pick a random position to substitute
                pos = random.randint(0, len(word) - 1)
                # Pick a random replacement letter
                char = random.choice(alphabet)
                noised_word = word[:pos] + char + word[pos+1:]
            
            elif typo_type == 'swap':
                # Pick a random position to swap with the *next* letter
                # We stop at len(word) - 2 to ensure there is a next letter
                if len(word) > 1:
                    pos = random.randint(0, len(word) - 2)
                    noised_word = word[:pos] + word[pos+1] + word[pos] + word[pos+2:]
                else:
                    noised_word = word # Can't swap a 1-letter word
            
            new_sentence_words.append(noised_word)
        
        else:
            # No noise applied, append the original word
            new_sentence_words.append(word)
            
    # Join the words back into a sentence
    return ' '.join(new_sentence_words)

# --- Now, let's test the "noising" function ---

print("--- Testing the 'add_noise_to_sentence' function ---\n")

# A list of the clean sentences from our previous test
test_sentences = [
    "waiting time tariff each period of one minute or part",
    "christian catholic relabelled as christian roman catholic for consistency with census labels",
    "per night per person",
    "mile south of the town which is included in the heritage walk vale trail",
    "at te times of the ipo in will have returned more than today",
    "different runners from over running clubs have already enjoyed the free to enter k hastings seafront promenade course"
]

for sentence in test_sentences:
    noised_sentence = add_noise_to_sentence(sentence)
    
    print(f"Clean (Input) : {sentence}")
    print(f"Noisy (Output): {noised_sentence}\n")



--- Testing the 'add_noise_to_sentence' function ---

Clean (Input) : waiting time tariff each period of one minute or part
Noisy (Output): waiting zime tariff each period of one minute or pdrt

Clean (Input) : christian catholic relabelled as christian roman catholic for consistency with census labels
Noisy (Output): christian catholic relabelled as christian roman catholic for consistency whth census labels

Clean (Input) : per night per person
Noisy (Output): per night per person

Clean (Input) : mile south of the town which is included in the heritage walk vale trail
Noisy (Output): mile south of the town which is incmluded in the heritage walk vale trail

Clean (Input) : at te times of the ipo in will have returned more than today
Noisy (Output): at te tmies of the ipo in will have returned more than todqy

Clean (Input) : different runners from over running clubs have already enjoyed the free to enter k hastings seafront promenade course
Noisy (Output): different runners from ove

In [5]:
###
# CELL 3: VERIFY THE FINAL PROCESSED FILES
###

import os

print("--- Verifying the output of 01_data_preprocessing.py ---")

# Define file paths (relative to this notebook in 'ai_model/notebooks/')
clean_file_path = os.path.join('..', 'data', 'train_clean.txt')
noisy_file_path = os.path.join('..', 'data', 'train_noisy.txt')

try:
    with open(clean_file_path, 'r', encoding='utf-8') as f_clean, \
         open(noisy_file_path, 'r', encoding='utf-8') as f_noisy:
        
        print(f"Comparing the first 10 lines of:\n  - {clean_file_path}\n  - {noisy_file_path}\n")
        
        for i in range(10):
            clean_line = f_clean.readline().strip()
            noisy_line = f_noisy.readline().strip()
            
            if not clean_line or not noisy_line:
                print("--- End of file (or file is empty) ---")
                break
            
            print(f"CLEAN (Y): {clean_line}")
            print(f"NOISY (X): {noisy_line}\n")
    
    print("--- Verification complete ---")

except FileNotFoundError:
    print(f"ERROR: Could not find the processed files.")
    print("Please make sure you ran 'python 01_data_preprocessing.py' successfully.")
except Exception as e:
    print(f"An error occurred: {e}")



--- Verifying the output of 01_data_preprocessing.py ---
Comparing the first 10 lines of:
  - ..\data\train_clean.txt
  - ..\data\train_noisy.txt

CLEAN (Y): waiting time tariff each period of one minute or part
NOISY (X): waiting time tariuff each period of one minut or part

CLEAN (Y): christian catholic relabelled as christian roman catholic for consistency with census labels
NOISY (X): christian catholic relabelled as christian rman catholic for cfnsistency with census labels

CLEAN (Y): per night per person
NOISY (X): per night per person

CLEAN (Y): mile south of the town which is included in the heritage walk vale trail
NOISY (X): mile south of the town which is includeyd in the heritage walk ale trial

CLEAN (Y): mile you will see the entrance to t cerrig woodland retreats on your left
NOISY (X): mile you wull see the entrance to t jcerrig woodland retreats on your left

CLEAN (Y): x setting bar with short temperature probe
NOISY (X): x setting bar with short temperature probe
