# Notebook 1: Data Preprocessing & Setup



### Mount Google Drive

In [16]:
# Colab helper module used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive; BASE_PATH and every project path defined
# later in this notebook live underneath this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


### Import Libraries

In [17]:
import string
import pprint  # For clean dictionary printing
from google.colab import files
import io      # To handle file content
import os      # To create directories
import json    # To save our processed data

### Setup Project Directory

In [18]:
# --- Configuration ---
# Root folder of the project on the mounted Google Drive.
BASE_PATH = '/content/drive/My Drive/ml-hackathon'

# Sub-folders that make up the project layout, all directly under BASE_PATH.
DATA_PATH, MODEL_PATH, NOTEBOOK_PATH, REPORT_PATH = (
    os.path.join(BASE_PATH, folder)
    for folder in ('data', 'models', 'notebooks', 'reports')
)

# Locations of the processed JSON artifacts inside the data folder.
CORPUS_JSON_PATH = os.path.join(DATA_PATH, 'corpus_by_length.json')
TEST_JSON_PATH = os.path.join(DATA_PATH, 'test_by_length.json')
CORPUS_SET_PATH = os.path.join(DATA_PATH, 'all_corpus_words_set.json')

# --- Create Directories ---
print(f"Base project path set to: {BASE_PATH}")
print("Creating project directories (if they don't exist)...")

project_dirs = (DATA_PATH, MODEL_PATH, NOTEBOOK_PATH, REPORT_PATH)

# exist_ok=True makes this cell safe to re-run: existing folders are left as-is.
for directory in project_dirs:
    os.makedirs(directory, exist_ok=True)

# Report every folder only after all of them have been created.
for directory in project_dirs:
    print(f"Created: {directory}")
print("\nDirectory setup complete.")

Base project path set to: /content/drive/My Drive/ml-hackathon
Creating project directories (if they don't exist)...
Created: /content/drive/My Drive/ml-hackathon/data
Created: /content/drive/My Drive/ml-hackathon/models
Created: /content/drive/My Drive/ml-hackathon/notebooks
Created: /content/drive/My Drive/ml-hackathon/reports

Directory setup complete.


### Upload Raw Data

In [19]:
print("Please upload 'corpus.txt' and 'test.txt'")
uploaded_files = files.upload()

# --- Robustly find and store content ---
corpus_content = None
test_content = None

# Colab renames re-uploaded files (e.g. "corpus (7).txt"), so match on a
# case-insensitive substring of the name instead of the exact filename.
for filename, content in uploaded_files.items():
    lowered_name = filename.lower()
    if 'corpus' in lowered_name:
        # 'utf-8-sig' decodes plain UTF-8 as well, but also removes a leading
        # byte-order mark. With plain 'utf-8' a BOM would stay glued to the
        # first word, which would then fail the A-Z filter in
        # load_words_from_content and be silently dropped.
        corpus_content = content.decode('utf-8-sig')
        print(f"Found and decoded corpus file: {filename}")
    elif 'test' in lowered_name:
        test_content = content.decode('utf-8-sig')
        print(f"Found and decoded test file: {filename}")

# --- Check if we got both ---
# A missing (None) or empty file is treated as a failed upload. Raise instead
# of only printing so that "Run All" stops here with a clear message rather
# than failing later when the processing cell calls .splitlines() on None.
if corpus_content and test_content:
    print("\nSuccessfully uploaded and decoded both files.")
else:
    raise ValueError(
        "One or both files were not found. "
        "Please re-run this cell and upload 'corpus.txt' and 'test.txt'."
    )

Please upload 'corpus.txt' and 'test.txt'


Saving corpus.txt to corpus (7).txt
Saving test.txt to test (6).txt
Found and decoded corpus file: corpus (7).txt
Found and decoded test file: test (6).txt

Successfully uploaded and decoded both files.


### Helper Functions

In [20]:
def load_words_from_content(content):
    """Extract the valid words from raw text, one candidate word per line.

    Each line is stripped of surrounding whitespace and upper-cased. A line is
    kept only if the result is non-empty and consists solely of the letters
    A-Z; anything containing digits, spaces, punctuation or other characters
    is discarded. Order and duplicates are preserved.

    Args:
        content: The full text of a word list as a single string.

    Returns:
        A list of upper-case words in their original order.
    """
    allowed_letters = string.ascii_uppercase
    candidates = (raw_line.strip().upper() for raw_line in content.splitlines())
    return [
        candidate
        for candidate in candidates
        if candidate and all(ch in allowed_letters for ch in candidate)
    ]

def group_by_length(words):
    """Bucket words by their character count.

    Args:
        words: An iterable of strings.

    Returns:
        A dict mapping each length (int) to the list of words of that length.
        Words keep their input order within a bucket, and duplicates are kept.
    """
    buckets = {}
    for entry in words:
        # setdefault creates the bucket the first time a length is seen.
        buckets.setdefault(len(entry), []).append(entry)
    return buckets

### Process Data and Save to Drive

In [21]:
# Clean both uploaded word lists, group them by word length, and persist the
# results as JSON files on Google Drive.
print("Processing and structuring data...")

# 1. Process the training corpus
corpus_words = load_words_from_content(corpus_content)
corpus_by_length = group_by_length(corpus_words)
# The set removes duplicates; corpus_words / corpus_by_length keep them.
all_corpus_words_set = set(corpus_words)
print(f"Successfully processed {len(corpus_words)} corpus words.")

# 2. Process the test set
test_words = load_words_from_content(test_content)
test_by_length = group_by_length(test_words)
print(f"Successfully processed {len(test_words)} test words.")

# 3. Save processed files to Google Drive
print("\nSaving processed files to Google Drive...")

# NOTE: JSON object keys are always strings, so the integer length keys are
# written as "1", "2", ...; code that loads these files must convert them
# back with int() if it needs numeric keys.

# Save corpus grouped by length
with open(CORPUS_JSON_PATH, 'w') as f:
    json.dump(corpus_by_length, f)
print(f"Saved corpus (grouped by length) to: {CORPUS_JSON_PATH}")

# Save test set grouped by length
with open(TEST_JSON_PATH, 'w') as f:
    json.dump(test_by_length, f)
print(f"Saved test set (grouped by length) to: {TEST_JSON_PATH}")

# Save the full corpus word set (as a list for JSON).
# Sort before dumping: iteration order of a set of strings changes between
# interpreter sessions (string hash randomization), so dumping list(set)
# would write a differently ordered file on every kernel restart.
with open(CORPUS_SET_PATH, 'w') as f:
    json.dump(sorted(all_corpus_words_set), f)
print(f"Saved full corpus word set to: {CORPUS_SET_PATH}")

print("\n--- Corpus Summary (Words per Length) ---")
# Number of corpus words (duplicates included) for each word length.
corpus_lengths = {length: len(words) for length, words in corpus_by_length.items()}
pprint.pprint(corpus_lengths)

print("\nStep 2: Preprocessing and Saving Complete.")

Processing and structuring data...
Successfully processed 49979 corpus words.
Successfully processed 2000 test words.

Saving processed files to Google Drive...
Saved corpus (grouped by length) to: /content/drive/My Drive/ml-hackathon/data/corpus_by_length.json
Saved test set (grouped by length) to: /content/drive/My Drive/ml-hackathon/data/test_by_length.json
Saved full corpus word set to: /content/drive/My Drive/ml-hackathon/data/all_corpus_words_set.json

--- Corpus Summary (Words per Length) ---
{1: 46,
 2: 84,
 3: 388,
 4: 1169,
 5: 2340,
 6: 3755,
 7: 5111,
 8: 6348,
 9: 6787,
 10: 6465,
 11: 5452,
 12: 4292,
 13: 3094,
 14: 2019,
 15: 1226,
 16: 698,
 17: 375,
 18: 174,
 19: 88,
 20: 40,
 21: 16,
 22: 8,
 23: 3,
 24: 1}

Step 2: Preprocessing and Saving Complete.


### Save Sorted Unique Word List

In [22]:
# Reload the saved unique-word list, sort it alphabetically, and write the
# sorted list to its own JSON file on Google Drive.
print("\nCreating sorted unique word list...")

try:
    # Define the new output path. This sits inside the try block so that an
    # undefined DATA_PATH is actually caught by the NameError handler below,
    # whose message tells the user to define it.
    SORTED_WORDS_PATH = os.path.join(DATA_PATH, 'corpus_sorted_unique.json')

    # 1. Load the unique word list written to 'all_corpus_words_set.json'
    with open(CORPUS_SET_PATH, 'r') as f:
        unique_words = json.load(f)
    print(f"Loaded {len(unique_words)} unique words from {CORPUS_SET_PATH}")

    # 2. Sort the list alphabetically (in place)
    unique_words.sort()
    print("Alphabetically sorted all unique words.")

    # 3. Save the new sorted list to Google Drive
    with open(SORTED_WORDS_PATH, 'w') as f:
        json.dump(unique_words, f)
    print(f"Successfully saved sorted list to: {SORTED_WORDS_PATH}")

    print("\n--- First 50 words of sorted list ---")
    print(unique_words[:50])

except NameError:
    print("Error: Make sure 'DATA_PATH' and 'CORPUS_SET_PATH' are defined.")
    print("Please re-run Cell 3 and Cell 6 first.")
except FileNotFoundError:
    print(f"ERROR: File not found at {CORPUS_SET_PATH}")
    print("Please make sure Cell 6 ran correctly.")
except Exception as e:
    print(f"An error occurred: {e}")
else:
    # Only announce completion when every step above succeeded; previously
    # this line was printed even after one of the error handlers had run.
    print("\nNotebook 1, Step 3: Sorted Word List Generation Complete.")


Creating sorted unique word list...
Loaded 49397 unique words from /content/drive/My Drive/ml-hackathon/data/all_corpus_words_set.json
Alphabetically sorted all unique words.
Successfully saved sorted list to: /content/drive/My Drive/ml-hackathon/data/corpus_sorted_unique.json

--- First 50 words of sorted list ---
['A', 'AARON', 'AARONIC', 'AARONITIC', 'ABA', 'ABACUS', 'ABAFF', 'ABALIENATION', 'ABANDONER', 'ABARIS', 'ABARTHROSIS', 'ABAS', 'ABASER', 'ABASHEDNESS', 'ABASSIN', 'ABATABLE', 'ABATISED', 'ABATTOIR', 'ABBASSIDE', 'ABBEYSTEDE', 'ABBIE', 'ABBOTCY', 'ABBOTNULLIUS', 'ABBOTSHIP', 'ABBREVIATOR', 'ABDERIAN', 'ABDEST', 'ABDICANT', 'ABDICATE', 'ABDOMINOHYSTERECTOMY', 'ABDOMINOPOSTERIOR', 'ABDUCE', 'ABDUCT', 'ABDUCTION', 'ABEARANCE', 'ABED', 'ABEL', 'ABELITE', 'ABELMOSCHUS', 'ABENCERRAGES', 'ABERDONIAN', 'ABERIA', 'ABERRANT', 'ABERRATOR', 'ABERROSCOPE', 'ABERUNCATOR', 'ABET', 'ABEVACUATION', 'ABHISEKA', 'ABHOMINABLE']

Notebook 1, Step 3: Sorted Word List Generation Complete.
