In [1]:
import re
import math
import random
#import gradio as gr
import numpy as np
from collections import defaultdict

In [2]:
class BasicLanguageModel:
    """Holder for the data and count tables of a simple n-gram language model.

    The class itself only tokenizes text and loads/splits a corpus; the
    ``state`` tables start out empty and are expected to be filled by code
    outside the class.

    Attributes:
        n_params: Number of count tables kept in ``state``.
        state: List of ``n_params`` independent, initially empty dicts.
        train_data: Training tokens, ``None`` until assigned by the caller.
        test_data: Held-out tokens, ``None`` until assigned by the caller.
        num_train_tokens: Starts at 0; the caller is responsible for updating it.
    """

    def __init__(self, n_params=5):
        """Create an empty model with ``n_params`` count tables.

        Note: this seeds the *global* ``random`` module with 42 as a side
        effect, so constructing a model resets Python's random stream.
        """
        random.seed(42)
        self.n_params = n_params
        # One separate dict per table (a list comprehension, not ``[{}] * n``,
        # so the tables never alias each other).
        self.state = [{} for _ in range(n_params)]
        self.train_data = None
        self.test_data = None
        self.num_train_tokens = 0
        print(f"Model initialized with n_params={n_params}")

    def tokenize(self, text):
        """Lower-case ``text`` and split it into tokens.

        A token is either a run of ASCII letters/digits delimited by word
        boundaries, or a single period. Every other character (commas,
        exclamation marks, ...) is dropped, and a decimal such as ``"3.5"``
        becomes three tokens: ``"3"``, ``"."``, ``"5"``.

        Returns:
            list[str]: The tokens in order of appearance.
        """
        return re.findall(r"\b[a-zA-Z0-9]+\b|[.]", text.lower())

    def get_data(self, path='data/weather_data.txt', train_fraction=0.90):
        """Read a text corpus, tokenize it and split it into train/test parts.

        Args:
            path: Location of the UTF-8 text file to read. Defaults to the
                path that was previously hard-coded.
            train_fraction: Share of the tokens (from the start of the file)
                assigned to the training split; the remainder is the test
                split. Must lie in ``[0, 1]``.

        Returns:
            tuple[list[str], list[str]]: ``(train_tokens, test_tokens)``.

        Raises:
            ValueError: If ``train_fraction`` is outside ``[0, 1]``.
            OSError: If the file cannot be opened.
        """
        if not 0.0 <= train_fraction <= 1.0:
            raise ValueError(
                f"train_fraction must be between 0 and 1, got {train_fraction!r}"
            )

        with open(path, 'r', encoding='utf-8') as file:
            corpus = file.read()
        tokens = self.tokenize(corpus)

        # Sequential (unshuffled) split: the test set is the tail of the corpus.
        split_index = int(len(tokens) * train_fraction)
        train_corpus = tokens[:split_index]
        test_corpus = tokens[split_index:]

        print(f"Data loaded: {len(tokens)} total tokens")
        print(f"Training: {len(train_corpus)} tokens")
        print(f"Testing: {len(test_corpus)} tokens")
        return train_corpus, test_corpus

In [13]:
# Instantiate the model with three count tables (n_params=3).
model = BasicLanguageModel(n_params=3)

# Show every instance attribute right after construction, before any data is attached.
for attr_name, attr_value in model.__dict__.items():
    print(f"{attr_name}: {attr_value}")

print("=" * 100)

# Read and split the corpus, then attach both splits (and the training size) to the model.
train_split, test_split = model.get_data()
model.train_data = train_split
model.test_data = test_split
model.num_train_tokens = len(train_split)

# Peek at the first few tokens of each split as a sanity check.
print(f"Training data sample: {model.train_data[:10]}")
print(f"Test data sample: {model.test_data[:5]}")

Model initialized with n_params=3
n_params: 3
state: [{}, {}, {}]
train_data: None
test_data: None
num_train_tokens: 0
Data loaded: 2445 total tokens
Training: 2200 tokens
Testing: 245 tokens
Training data sample: ['daily', 'weather', 'reports', 'for', 'major', 'indian', 'cities', 'mumbai', 'weather', 'report']
Test data sample: ['conditions', 'in', 'delhi', 'to', 'continue']


In [4]:
def train_model(model):
    """Fill ``model.state`` with n-gram counts taken from ``model.train_data``.

    For every order ``k`` from 1 to ``model.n_params`` the table
    ``model.state[k - 1]`` maps a context (a tuple of the ``k - 1`` preceding
    tokens; the empty tuple for unigrams) to a ``defaultdict(int)`` of
    ``next_token -> count``.

    Counts are *added* to whatever the tables already contain, so calling this
    twice on the same model doubles every count. Create a fresh model if a
    clean retrain is needed.

    Args:
        model: Object exposing ``train_data`` (sequence of tokens),
            ``n_params`` (int) and ``state`` (list of at least ``n_params`` dicts).

    Raises:
        ValueError: If ``model.train_data`` is ``None`` (data not loaded yet).
    """
    tokens = model.train_data
    if tokens is None:
        raise ValueError(
            "model.train_data is None; load the corpus before calling train_model"
        )

    for order in range(1, model.n_params + 1):  # order runs 1 .. n_params inclusive
        counts = model.state[order - 1]
        ngram_count = 0

        # Slide a window of `order` tokens over the corpus; the range is empty
        # when the corpus is shorter than the window.
        for start in range(len(tokens) - order + 1):
            context = tuple(tokens[start:start + order - 1])
            next_token = tokens[start + order - 1]

            if context not in counts:
                counts[context] = defaultdict(int)
            counts[context][next_token] += 1
            ngram_count += 1

        print(f"Trained {order}-gram model: {ngram_count} n-grams processed")
        print(f"Unique contexts: {len(counts)}")

In [14]:
# Populate model.state with n-gram counts from model.train_data.
# Note: train_model adds to existing counts, so re-running this cell on the
# same model instance increments the tables again.
train_model(model)

Trained 1-gram model: 2200 n-grams processed
Unique contexts: 1
Trained 2-gram model: 2199 n-grams processed
Unique contexts: 150
Trained 3-gram model: 2198 n-grams processed
Unique contexts: 317


In [6]:
print("=== N-gram Model Statistics ===")

# Walk the count tables from the 1-gram table up to the n_params-gram table.
for n in range(1, model.n_params + 1):
    state = model.state[n - 1]
    print(f"\n{n}-gram model:")
    print(f"  Total contexts: {len(state)}")

    # Nothing to sample from an empty table.
    if not state:
        continue

    print("  Sample contexts and their next tokens:")
    # Preview only the first three contexts, in insertion order.
    preview = list(state.items())[:3]
    for context, next_tokens in preview:
        print(f"    {context} → {dict(next_tokens)}")

=== N-gram Model Statistics ===

1-gram model:
  Total contexts: 1
  Sample contexts and their next tokens:
    () → {'daily': 1, 'weather': 69, 'reports': 1, 'for': 43, 'major': 1, 'indian': 1, 'cities': 1, 'mumbai': 48, 'report': 42, 'monday': 6, 'the': 133, 'temperature': 43, 'in': 132, 'today': 86, 'is': 84, '32': 5, 'degrees': 43, 'celsius': 43, '.': 217, 'will': 43, 'experience': 42, 'high': 19, 'humidity': 21, 'expect': 44, 'light': 6, 'rainfall': 18, 'during': 16, 'evening': 6, 'hours': 4, 'wind': 42, 'speed': 42, '15': 4, 'kilometers': 42, 'per': 42, 'hour': 42, 'residents': 44, 'of': 43, 'should': 44, 'carry': 12, 'umbrellas': 12, 'delhi': 45, '38': 1, 'dry': 9, 'clear': 12, 'skies': 25, 'throughout': 25, 'day': 23, '10': 3, 'stay': 14, 'hydrated': 11, 'chennai': 42, '35': 3, 'partly': 12, 'cloudy': 12, 'afternoon': 9, '12': 5, 'use': 7, 'sunscreen': 7, 'kolkata': 42, '33': 5, 'moderate': 9, '14': 4, 'bengaluru': 42, '28': 3, 'pleasant': 11, '18': 3, 'enjoy': 6, 'hyderabad': 

In [None]:
def predict_next_token(model, context):
    """Return the highest-count continuation of ``context``, backing off as needed.

    The longest usable history is tried first (``model.n_params - 1`` tokens),
    then progressively shorter ones down to a single token. If none of those
    contexts is present in its table, the unigram table (empty-tuple context)
    decides. Ties go to the token that appears first in the table's iteration
    order. Returns ``None`` when even the unigram table has no entry.
    """
    def most_frequent(table):
        # max() keeps the first maximal key, matching dict iteration order.
        return max(table, key=table.get)

    # history = number of preceding tokens used as the lookup key.
    for history in range(model.n_params - 1, 0, -1):
        if len(context) < history:
            continue
        key = tuple(context[-history:])
        table = model.state[history].get(key)
        if table:
            return most_frequent(table)

    # No higher-order context matched: fall back to overall token frequency.
    unigram_table = model.state[0].get(())
    if not unigram_table:
        return None
    return most_frequent(unigram_table)

def generate_text(model, context, num_tokens=10):
    """Extend ``context`` greedily by up to ``num_tokens`` predicted tokens.

    ``context`` may be a raw string (tokenized with ``model.tokenize``) or an
    iterable of tokens. Each step asks ``predict_next_token`` for a continuation
    of the most recent tokens and appends it; generation stops early if no
    prediction is available. The result is the starting tokens plus the
    generated ones, joined by single spaces.
    """
    seed_tokens = model.tokenize(context) if isinstance(context, str) else context
    output = [*seed_tokens]

    for _ in range(num_tokens):
        # Pass only the trailing n_params - 1 tokens (when n_params == 1 the
        # slice start is -0, i.e. the whole list is passed).
        recent = output[-(model.n_params - 1):]
        candidate = predict_next_token(model, recent)
        if candidate is None:
            break
        output.append(candidate)

    return " ".join(output)

# Visible confirmation that this cell executed and the functions above are defined.
print("Text generation functions defined!")