# Notebook-2: HMM (Probabilistic Oracle) Training



### Mount Drive & Define Paths

In [1]:
import os
import json
import string
from google.colab import drive
from collections import defaultdict

# Make Google Drive visible to this Colab runtime.
drive.mount('/content/drive')

# Project root on Drive. Notebook 1 must use exactly the same location,
# otherwise the corpus it wrote will not be found here.
BASE_PATH = '/content/drive/My Drive/ml-hackathon'

# Sub-folders of the project root.
DATA_PATH = os.path.join(BASE_PATH, 'data')
MODEL_PATH = os.path.join(BASE_PATH, 'models')

# Input: the length-grouped corpus produced by Notebook 1.
CORPUS_JSON_PATH = os.path.join(DATA_PATH, 'corpus_by_length.json')

# Output: the probability tables that serve as our "trained HMM".
HMM_MODEL_PATH = os.path.join(MODEL_PATH, 'hmm_probabilities.json')

# Echo the resolved locations so a wrong path is spotted immediately.
print(
    f"Base path set to: {BASE_PATH}",
    f"Loading corpus from: {CORPUS_JSON_PATH}",
    f"Will save trained model to: {HMM_MODEL_PATH}",
    sep="\n",
)

Mounted at /content/drive
Base path set to: /content/drive/My Drive/ml-hackathon
Loading corpus from: /content/drive/My Drive/ml-hackathon/data/corpus_by_length.json
Will save trained model to: /content/drive/My Drive/ml-hackathon/models/hmm_probabilities.json


### Load Processed Corpus

In [2]:
# Load the corpus grouped by word length (the file Notebook 1 is expected
# to have written). Any failure is reported AND re-raised: every later cell
# needs corpus_by_length, so continuing after a failed load would only
# produce a confusing error further down.
try:
    with open(CORPUS_JSON_PATH, 'r', encoding='utf-8') as f:
        raw_corpus = json.load(f)
    print("Successfully loaded corpus_by_length.json.")

    # JSON object keys are always strings; convert them back to integers.
    # The conversion reads from a temporary name so that a failure here
    # cannot leave a half-processed, string-keyed corpus_by_length behind.
    corpus_by_length = {int(k): v for k, v in raw_corpus.items()}

    print(f"Loaded data for {len(corpus_by_length)} word lengths.")
except FileNotFoundError:
    print(f"ERROR: File not found at {CORPUS_JSON_PATH}")
    print("Please make sure Notebook 1 ran correctly and the file exists.")
    raise  # stop "Run All" here instead of failing later on a missing name
except Exception as e:
    print(f"An error occurred: {e}")
    raise  # malformed JSON / non-integer keys must not be silently ignored

Successfully loaded corpus_by_length.json.
Loaded data for 24 word lengths.


### HMM Probability Functions

In [4]:
def calculate_probabilities(word_list):
    """
    Estimate letter unigram and bigram probabilities from a list of words.

    This is the simplified "HMM" used by the notebook: plain letter
    frequencies plus first-order letter-to-letter transition frequencies,
    both with Laplace (add-1) smoothing so that unseen letters and unseen
    letter pairs never get probability zero.

    Words are upper-cased before counting and characters outside A-Z are
    ignored (the letters on either side of an ignored character are treated
    as adjacent). Only A-Z ever appear in the output, so counting anything
    else would inflate the denominators and the returned distributions
    would no longer sum to 1.

    Args:
        word_list: iterable of strings. May be empty, in which case every
            distribution is uniform (1/26 per letter).

    Returns:
        dict with two keys:
            'unigram': {letter: P(letter)} for the 26 letters A-Z.
            'bigram':  {prev: {letter: P(letter | prev)}} where prev is one
                       of A-Z or the pseudo-token 'START' (word beginning).
    """
    # A string (not a set) keeps iteration order fixed, so the key order of
    # the returned dicts is the same on every run.
    alphabet = string.ascii_uppercase
    valid_letters = frozenset(alphabet)
    n_letters = len(alphabet)

    unigram_counts = defaultdict(int)
    bigram_counts = defaultdict(lambda: defaultdict(int))
    total_letters = 0

    # 1. Count occurrences
    for word in word_list:
        # 'START' stands in for "no previous letter" on the first character.
        prev_char = 'START'
        for char in word.upper():
            if char not in valid_letters:
                # Not a letter we model (hyphen, digit, space, ...): skip it
                # so it cannot leak into the denominators below.
                continue
            unigram_counts[char] += 1
            bigram_counts[prev_char][char] += 1
            prev_char = char
            total_letters += 1

    # 2. Laplace (add-1) smoothing: +1 on every count, +26 on every total.

    # --- Unigram probabilities ---
    unigram_denominator = total_letters + n_letters
    unigram_probs = {
        char: (unigram_counts.get(char, 0) + 1) / unigram_denominator
        for char in alphabet
    }

    # --- Bigram probabilities ---
    # One conditional distribution per possible predecessor: A-Z and 'START'.
    bigram_probs = {}
    for prev_char in list(alphabet) + ['START']:
        following = bigram_counts.get(prev_char, {})
        bigram_denominator = sum(following.values()) + n_letters
        bigram_probs[prev_char] = {
            char: (following.get(char, 0) + 1) / bigram_denominator
            for char in alphabet
        }

    return {'unigram': unigram_probs, 'bigram': bigram_probs}

### Train "HMM" for Each Word Length

In [5]:
print("Training HMM (Probability Models) for each word length...")

# This will store all our models: {5: model_for_5, 6: model_for_6, ...}
hmm_models = {}

# Train in ascending length order so the progress log is easy to scan.
word_lengths = sorted(corpus_by_length.keys())

for length in word_lengths:
    words = corpus_by_length[length]
    if words:  # Only train if we have words of that length
        hmm_models[length] = calculate_probabilities(words)
        print(f" - Trained model for length {length} (based on {len(words)} words)")

print(f"\nTraining complete. Total models trained: {len(hmm_models)}")

# --- Save the combined model to Google Drive ---
try:
    # The target folder may not exist yet on a fresh Drive; create it
    # instead of letting open() fail.
    model_dir = os.path.dirname(HMM_MODEL_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    with open(HMM_MODEL_PATH, 'w', encoding='utf-8') as f:
        # We need to convert integer keys to strings for JSON
        json.dump({str(k): v for k, v in hmm_models.items()}, f)
    print(f"\nSuccessfully saved all HMM models to: {HMM_MODEL_PATH}")
except Exception as e:
    print(f"\nError saving model to Google Drive: {e}")
    # Re-raise so the failure is not followed by the "Complete" message
    # below and so that "Run All" stops at the real cause.
    raise

# Only reached when the save above succeeded.
print("\nNotebook 2: HMM Model Training and Saving Complete.")

Training HMM (Probability Models) for each word length...
 - Trained model for length 1 (based on 46 words)
 - Trained model for length 2 (based on 84 words)
 - Trained model for length 3 (based on 388 words)
 - Trained model for length 4 (based on 1169 words)
 - Trained model for length 5 (based on 2340 words)
 - Trained model for length 6 (based on 3755 words)
 - Trained model for length 7 (based on 5111 words)
 - Trained model for length 8 (based on 6348 words)
 - Trained model for length 9 (based on 6787 words)
 - Trained model for length 10 (based on 6465 words)
 - Trained model for length 11 (based on 5452 words)
 - Trained model for length 12 (based on 4292 words)
 - Trained model for length 13 (based on 3094 words)
 - Trained model for length 14 (based on 2019 words)
 - Trained model for length 15 (based on 1226 words)
 - Trained model for length 16 (based on 698 words)
 - Trained model for length 17 (based on 375 words)
 - Trained model for length 18 (based on 174 words)
 - Tr