In [5]:
!pip install matplotlib
!pip install torch
!pip install tqdm
!pip install scikit-learn
!pip install nltk
!pip install tensorboard
!pip install torchsummaryy

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.2 MB)
Using cached scipy-1.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.3 MB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.6.0
[31mERROR: Could not find a version that satisfies the requirement torchsummaryy (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torchsummaryy[0m[31m
[0m

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import torch.nn.functional as F
from conlleval import evaluate as conllevaluate
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support
import time
from torch.utils.tensorboard import SummaryWriter

In [7]:
def argmax(vec):
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix.get(w, to_ix['<UNK>']) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))


# Hamming distance calculation
def hamming_distance(y_pred, y_gold):
    """Calculate hamming distance between predicted and gold sequences"""
    return sum(y1 != y2 for y1, y2 in zip(y_pred, y_gold))


In [8]:
class BiLSTM_CRF(nn.Module): 

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, char_embedding_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        self.hidden = self.init_hidden()
        self.char_embed = nn.Embedding(10, char_embedding_dim)
        self.char_cnn = nn.Conv2d(in_channels=1, out_channels=char_embedding_dim, kernel_size=(1, char_embedding_dim))

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas
        device_info = forward_var.device 
        for feat in feats:
            alphas_t = []  
            for next_tag in range(self.tagset_size):
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size).to(device_info)
                trans_score = self.transitions[next_tag].view(1, -1).to(device_info)
                next_tag_var = forward_var + trans_score + emit_score
         
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]].to(device_info)
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence): 
        self.hidden = self.init_hidden()
        sentence = sentence.to(self.word_embeds.weight.device)
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        self.hidden = tuple(h.to(embeds.device) for h in self.hidden)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden) 
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def get_char_indices(self, word_idx):
        """
        Extracts character indices using nltk.word_tokenize.
        """
        char_idx = [word_to_idx[char] for char in train_data[word_idx]['tokens']] 
        return char_idx

    def _get_lstm_features_cnn(self, sentence):
        self.hidden = self.init_hidden()

        sentence = sentence.to(self.word_embeds.weight.device)
        char_embeddings, char_ids = self.char_embed, []
        for word_idx in sentence:
           
            chars = self.get_char_indices(word_idx) 
            char_ids.append(torch.tensor(chars).to(device))
                            
        char_ids = pad_sequence(char_ids, batch_first=True, padding_value=0) 

        self.conv1 = nn.Conv1d(in_channels=char_embeddings.num_embeddings,  out_channels=11,  kernel_size=3, padding=1)  
        cnn_out = self.conv1(char_embeddings(char_ids))
        lstm_out = torch.max(F.relu(cnn_out), dim=2)[0] 
        lstm_out = lstm_out.view(len(sentence), -1)
        lstm_out, self.hidden = self.lstm(lstm_out, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim * 2)
        lstm_feats = self.hidden2tag(lstm_out)

        return lstm_feats

    def _score_sentence(self, feats, tags):
        score = torch.zeros(1, device=feats.device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long, device=feats.device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats, gold_tags=None, cost_scale=0):
        """
        Run Viterbi algorithm to get the best tag sequence.
        With cost_scale > 0, it performs cost-augmented inference.
        """
        backpointers = []

        init_vvars = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        forward_var = init_vvars
        for i, feat in enumerate(feats):
            bptrs_t = [] 
            viterbivars_t = []  

            for next_tag in range(self.tagset_size):
                next_tag_var = forward_var + self.transitions[next_tag]
                
                # Cost-augmented decoding: add cost when gold and predicted tags differ
                if cost_scale > 0 and gold_tags is not None and i < len(gold_tags):
                    cost = torch.zeros(self.tagset_size, device=feats.device)
                    cost[gold_tags[i]] = 0  # No cost for correct tag
                    # 10x cost (hamming distance) for incorrect tags
                    mask = torch.ones(self.tagset_size, device=feats.device)
                    mask[gold_tags[i]] = 0
                    cost = cost_scale * mask  # Apply cost to incorrect tags
                    next_tag_var = next_tag_var + cost
                
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)

        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Standard negative log likelihood loss for CRF"""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def structured_svm_loss(self, sentence, tags, cost_scale=10.0):
        """
        Structured SVM loss with cost-augmented decoding.
        L(x,y,w) = max_{y'} [score(x,y',w) + cost(y,y')] - score(x,y,w)
        """
        feats = self._get_lstm_features(sentence)
        gold_score = self._score_sentence(feats, tags)
        
        # Cost-augmented decoding
        cost_augmented_score, predicted_tags = self._viterbi_decode(feats, tags, cost_scale)
        
        # Margin-based loss: max(0, cost_augmented_score - gold_score + hamming_distance*cost_scale)
        margin = cost_augmented_score - gold_score
        
        return margin if margin > 0 else torch.zeros_like(margin)

    def forward(self, sentence):  
        lstm_feats = self._get_lstm_features(sentence)
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

def make_data_point(sent):
    """
        Creates a dictionary from String to an Array of Strings representing the data.  
    """
    dic = {}
    sent = [s.strip().split() for s in sent]
    dic['tokens'] = ['<START>'] + [s[0] for s in sent] + ['<STOP>']
    dic['pos'] = ['<START>'] + [s[1] for s in sent] + ['<STOP>']
    dic['NP_chunk'] = ['<START>'] + [s[2] for s in sent] + ['<STOP>']
    dic['gold_tags'] = ['<START>'] + [s[3] for s in sent] + ['<STOP>']
    return dic

def read_data(filename):
    """
    Reads the CoNLL 2003 data into an array of dictionaries (a dictionary for each data point).
    """
    data = []
    with open(filename, 'r') as f:
        sent = []
        for line in f.readlines():
            if line.strip():
                sent.append(line)
            else:
                data.append(make_data_point(sent))
                sent = []
        data.append(make_data_point(sent))

    return data

def compute_metrics(predicted_tags, gold_tags):
    """
    Compute precision, recall and F1 score
    """
    # Filter out <START> and <STOP> tags
    filtered_pred = []
    filtered_gold = []
    
    for pred_seq, gold_seq in zip(predicted_tags, gold_tags):
        # Remove <START> and <STOP> tags
        filtered_pred.extend([p for p in pred_seq if p not in [tag_2_idx[START_TAG], tag_2_idx[STOP_TAG]]])
        filtered_gold.extend([g for g in gold_seq if g not in [tag_2_idx[START_TAG], tag_2_idx[STOP_TAG]]])
    
    # Convert indices to tag names for better interpretation
    idx_to_tag = {v: k for k, v in tag_2_idx.items()}
    pred_tags = [idx_to_tag[i] for i in filtered_pred]
    gold_tags = [idx_to_tag[i] for i in filtered_gold]
    
    # Calculate metrics - exclude 'O' tag or special tags from evaluation if needed
    labels = [k for k in tag_2_idx.keys() if k not in [START_TAG, STOP_TAG, 'O']]
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold_tags, pred_tags, labels=labels, average='micro')
    
    return precision, recall, f1

def evaluate_model(model, data, word_2_idx, tag_2_idx):
    """
    Evaluate model on data and return predictions and metrics
    """
    model.eval()
    predictions = []
    gold_standards = []
    
    with torch.no_grad():
        for example in data:
            sentence = prepare_sequence(example['tokens'], word_2_idx).to(device)
            gold_tags = [tag_2_idx[t] for t in example['gold_tags']]
            gold_standards.append(gold_tags)
            
            _, predicted_tags = model(sentence)
            predictions.append(predicted_tags)
    
    precision, recall, f1 = compute_metrics(predictions, gold_standards)
    return predictions, precision, recall, f1

# Generate output predictions in the required format
def generate_output_file(predictions, data, tag_key_list, filename):
    with open(filename, 'w') as f:
        for pred_tags, example in zip(predictions, data):
            # Convert tag indices to tag names and filter out <START> and <STOP>
            tag_names = [tag_key_list[tag_id] for tag_id in pred_tags 
                         if tag_key_list[tag_id] not in [START_TAG, STOP_TAG]]
            
            # Make sure we have the same number of tokens and tags
            tokens = [t for t in example['tokens'] if t not in [START_TAG, STOP_TAG]]
            
            # Write output in CoNLL format
            for token, tag in zip(tokens, tag_names):
                f.write(f"{token} {tag}\n")
            f.write("\n")  # Empty line between sentences

def generate_minibatches(training_data, batch_size):
    minibatches = []
    for i in range(0, len(training_data), batch_size):
        minibatch = training_data[i:i + batch_size]
        minibatches.append(minibatch)
    return minibatches

# Constants
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 40
HIDDEN_DIM = 40
CHAR_EMBEDDING_DIM = 4

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
dev_data = read_data('ner.dev')
test_data = read_data('ner.test')
train_data = read_data('ner.train')
print(f"Train size: {len(train_data)}, Dev size: {len(dev_data)}, Test size: {len(test_data)}")

# Create dictionaries
word_2_idx = {}
# Add special token for unknown words
word_2_idx['<UNK>'] = 0

for sentence in train_data + dev_data + test_data:
    for word in sentence['tokens']:
        if word not in word_2_idx:
            word_2_idx[word] = len(word_2_idx)

tag_2_idx = {}
for sentence in train_data + dev_data + test_data:
    for word in sentence['gold_tags']:
        if word not in tag_2_idx:
            tag_2_idx[word] = len(tag_2_idx)

print(f"Vocabulary size: {len(word_2_idx)}, Tag set size: {len(tag_2_idx)}")

# Initialize tensorboard writer
writer = SummaryWriter('runs/structured_svm')

# Function to train with structured SVM and early stopping
def train_structured_svm(model, train_data, dev_data, learning_rates, reg_strengths, 
                         cost_scale=10.0, batch_size=32, max_epochs=30, patience=3):
    """
    Train a model using structured SVM with early stopping
    
    Args:
        model: BiLSTM_CRF model
        train_data: Training data
        dev_data: Development data for early stopping
        learning_rates: List of learning rates to try
        reg_strengths: List of regularization strengths to try
        cost_scale: Scaling factor for hamming distance cost
        batch_size: Batch size for training
        max_epochs: Maximum number of epochs
        patience: Number of epochs to wait for improvement before stopping
    
    Returns:
        best_model: Model with best F1 score
        best_lr: Best learning rate
        best_reg: Best regularization strength
        best_f1: Best F1 score
        results: Dictionary of results for each combination
    """
    results = {}
    best_f1 = 0
    best_model = None
    best_lr = None
    best_reg = None
    
    # Try different combinations of learning rates and regularization strengths
    for lr in learning_rates:
        for reg_strength in reg_strengths:
            print(f"\n=== Training with lr={lr}, reg_strength={reg_strength} ===")
            
            # Initialize a new model for each combination
            current_model = BiLSTM_CRF(len(word_2_idx), tag_2_idx, EMBEDDING_DIM, HIDDEN_DIM, CHAR_EMBEDDING_DIM)
            current_model.to(device)
            
            # Setup optimizer
            optimizer = optim.SGD(current_model.parameters(), lr=lr)
            
            # Variables for early stopping
            best_dev_f1 = 0
            epochs_no_improve = 0
            epoch_results = []
            
            for epoch in range(max_epochs):
                # Training
                current_model.train()
                total_loss = 0
                minibatches = generate_minibatches(train_data, batch_size)
                
                for i, minibatch in tqdm(enumerate(minibatches), total=len(minibatches), 
                                          desc=f"Epoch {epoch+1}/{max_epochs}"):
                    current_model.zero_grad()
                    
                    # Calculate loss for each sentence in the batch
                    batch_loss = 0
                    for sentence in minibatch:
                        sentence_in = prepare_sequence(sentence['tokens'], word_2_idx).to(device)
                        tags = torch.tensor([tag_2_idx[t] for t in sentence['gold_tags']], 
                                            dtype=torch.long).to(device)
                        
                        # Structured SVM loss
                        loss = current_model.structured_svm_loss(sentence_in, tags, cost_scale)
                        batch_loss += loss
                    
                    # Average loss over batch
                    batch_loss = batch_loss / len(minibatch)
                    
                    # Add L2 regularization
                    l2_reg = 0.0
                    for param in current_model.parameters():
                        l2_reg += torch.norm(param, 2)
                    batch_loss += reg_strength * l2_reg
                    
                    # Backpropagation
                    batch_loss.backward()
                    optimizer.step()
                    
                    total_loss += batch_loss.item()
                    
                    # Log training loss
                    writer.add_scalar(f'Loss/train_lr{lr}_reg{reg_strength}', 
                                      batch_loss.item(), 
                                      epoch * len(minibatches) + i)
                
                avg_loss = total_loss / len(minibatches)
                print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")
                
                # Evaluate on dev set
                _, dev_precision, dev_recall, dev_f1 = evaluate_model(
                    current_model, dev_data, word_2_idx, tag_2_idx)
                
                print(f"Dev - Precision: {dev_precision:.4f}, Recall: {dev_recall:.4f}, F1: {dev_f1:.4f}")
                
                # Log dev metrics
                writer.add_scalar(f'Precision/dev_lr{lr}_reg{reg_strength}', dev_precision, epoch)
                writer.add_scalar(f'Recall/dev_lr{lr}_reg{reg_strength}', dev_recall, epoch)
                writer.add_scalar(f'F1/dev_lr{lr}_reg{reg_strength}', dev_f1, epoch)
                
                # Save epoch results
                epoch_results.append({
                    'epoch': epoch + 1,
                    'loss': avg_loss,
                    'dev_precision': dev_precision,
                    'dev_recall': dev_recall,
                    'dev_f1': dev_f1
                })
                
                # Check for improvement
                if dev_f1 > best_dev_f1:
                    best_dev_f1 = dev_f1
                    epochs_no_improve = 0
                    # Save model
                    torch.save(current_model.state_dict(), 
                               f'bilstm_crf_svm_lr{lr}_reg{reg_strength}_epoch{epoch+1}.pth')
                else:
                    epochs_no_improve += 1
                    if epochs_no_improve >= patience:
                        print(f"Early stopping at epoch {epoch+1}")
                        break
            
            # Record results for this configuration
            results[(lr, reg_strength)] = {
                'best_epoch': epoch_results[epoch - epochs_no_improve]['epoch'],
                'best_precision': epoch_results[epoch - epochs_no_improve]['dev_precision'],
                'best_recall': epoch_results[epoch - epochs_no_improve]['dev_recall'],
                'best_f1': epoch_results[epoch - epochs_no_improve]['dev_f1'],
                'epoch_results': epoch_results
            }
            
            # Update overall best model if this configuration is better
            if best_dev_f1 > best_f1:
                best_f1 = best_dev_f1
                best_lr = lr
                best_reg = reg_strength
                
                # Load the best model from this configuration
                best_model = BiLSTM_CRF(len(word_2_idx), tag_2_idx, EMBEDDING_DIM, HIDDEN_DIM, CHAR_EMBEDDING_DIM)
                best_model.load_state_dict(torch.load(
                    f'bilstm_crf_svm_lr{lr}_reg{reg_strength}_epoch{epoch_results[epoch - epochs_no_improve]["epoch"]}.pth'))
                best_model.to(device)
    
    return best_model, best_lr, best_reg, best_f1, results

# Function to plot results
def plot_results(results, learning_rates, reg_strengths):
    """
    Plot results of parameter tuning
    """
    # Create a matrix of F1 scores
    f1_matrix = np.zeros((len(learning_rates), len(reg_strengths)))
    
    for i, lr in enumerate(learning_rates):
        for j, reg in enumerate(reg_strengths):
            f1_matrix[i, j] = results.get((lr, reg), {}).get('best_f1', 0)
    
    plt.figure(figsize=(10, 8))
    plt.imshow(f1_matrix, interpolation='nearest', cmap='viridis')
    plt.title('F1 Score by Learning Rate and Regularization Strength')
    plt.xlabel('Regularization Strength')
    plt.ylabel('Learning Rate')
    
    # Add colorbar
    plt.colorbar(label='F1 Score')
    
    # Add axis labels
    plt.xticks(np.arange(len(reg_strengths)), reg_strengths)
    plt.yticks(np.arange(len(learning_rates)), learning_rates)
    
    # Add text annotations
    for i in range(len(learning_rates)):
        for j in range(len(reg_strengths)):
            plt.text(j, i, f'{f1_matrix[i, j]:.3f}', 
                     ha="center", va="center", color="white" if f1_matrix[i, j] < 0.5 else "black")
    
    plt.tight_layout()
    plt.savefig('parameter_tuning_results.png')
    plt.close()
    
    # Plot precision, recall, and F1 for best configuration
    best_lr, best_reg = max(results.keys(), key=lambda k: results[k]['best_f1'])
    epoch_results = results[(best_lr, best_reg)]['epoch_results']
    
    epochs = [r['epoch'] for r in epoch_results]
    precision = [r['dev_precision'] for r in epoch_results]
    recall = [r['dev_recall'] for r in epoch_results]
    f1 = [r['dev_f1'] for r in epoch_results]
    loss = [r['loss'] for r in epoch_results]
    
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    plt.plot(epochs, precision, marker='o', label='Precision')
    plt.plot(epochs, recall, marker='s', label='Recall')
    plt.plot(epochs, f1, marker='^', label='F1')
    plt.xlabel('Epoch')
    plt.ylabel('Score')
    plt.title(f'Metrics (lr={best_lr}, reg={best_reg})')
    plt.legend()
    plt.grid(True)
    
    plt.subplot(1, 2, 2)
    plt.plot(epochs, loss, marker='o', color='red')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.grid(True)
    
    plt.tight_layout()
    plt.savefig('best_model_training_curves.png')
    plt.close()



Using device: cuda
Train size: 14987, Dev size: 3466, Test size: 3684
Vocabulary size: 30293, Tag set size: 10


In [11]:
# Main execution
if __name__ == "__main__":
    # Parameters to tune
    learning_rates = [0.001]
    reg_strengths = [0.01, 0.001, 0.05]
    
    # Train model with structured SVM and early stopping
    best_model, best_lr, best_reg, best_f1, results = train_structured_svm(
        None, train_data, dev_data, 
        learning_rates, reg_strengths, 
        cost_scale=10.0, batch_size=512, 
        max_epochs=5, patience=2
    )
    
    # Print tuning results
    print("\nParameter Tuning Results:")
    for (lr, reg), result in results.items():
        print(f"lr={lr}, reg={reg}: Precision={result['best_precision']:.4f}, "
              f"Recall={result['best_recall']:.4f}, F1={result['best_f1']:.4f}")
    
    print(f"\nBest Configuration: lr={best_lr}, reg={best_reg}, F1={best_f1:.4f}")
    
    # Plot results
    plot_results(results, learning_rates, reg_strengths)
    
    # Evaluate best model on test set
    print("\nEvaluating best model on test set...")
    test_preds, test_precision, test_recall, test_f1 = evaluate_model(
        best_model, test_data, word_2_idx, tag_2_idx)
    
    print(f"Test Results: Precision={test_precision:.4f}, Recall={test_recall:.4f}, F1={test_f1:.4f}")
    
    # Generate output files
    tag_key_list = list(tag_2_idx.keys())
    
    # Get predictions for dev set
    dev_preds, _, _, _ = evaluate_model(best_model, dev_data, word_2_idx, tag_2_idx)
    
    # Generate output files
    generate_output_file(dev_preds, dev_data, tag_key_list, 'dev_predictions_svm.txt')
    generate_output_file(test_preds, test_data, tag_key_list, 'test_predictions_svm.txt')
    
    # Save best model
    torch.save(best_model.state_dict(), 'best_bilstm_crf_svm_model.pth')
    
    # Generate report
    with open('structured_svm_report.txt', 'w') as f:
        f.write("# Structured SVM Training Report\n\n")
        
        f.write("## Model Description\n")
        f.write("The model is a BiLSTM-CRF with structured SVM training using a Hamming distance cost function.\n")
        f.write(f"- Embedding dimension: {EMBEDDING_DIM}\n")
        f.write(f"- Hidden dimension: {HIDDEN_DIM}\n")
        f.write(f"- Cost scale: 10.0 (multiplier for Hamming distance)\n\n")
        
        f.write("## Training Details\n")
        f.write("- Implemented early stopping based on F1 score on dev set\n")
        f.write("- Used L2 regularization\n")
        f.write("- Cost-augmented decoding during training\n\n")
        
        f.write("## Parameter Tuning Results\n")
        f.write("| Learning Rate | Reg Strength | Precision | Recall | F1 |\n")
        f.write("|---------------|--------------|-----------|--------|----|\n")
        
        for (lr, reg), result in results.items():
            f.write(f"| {lr} | {reg} | {result['best_precision']:.4f} | {result['best_recall']:.4f} | {result['best_f1']:.4f} |\n")
        
        f.write(f"\nBest configuration: lr={best_lr}, reg={best_reg}, F1={best_f1:.4f}\n\n")
        
        f.write("## Final Results\n")
        f.write(f"Dev set: Precision={best_f1:.4f}, Recall={results[(best_lr, best_reg)]['best_recall']:.4f}, F1={results[(best_lr, best_reg)]['best_f1']:.4f}\n")
        f.write(f"Test set: Precision={test_precision:.4f}, Recall={test_recall:.4f}, F1={test_f1:.4f}\n")
        
    print("Done! See structured_svm_report.txt for complete results.")


=== Training with lr=0.001, reg_strength=0.01 ===


Epoch 1/5:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 1/5: 100%|██████████| 30/30 [12:44<00:00, 25.48s/it]


Epoch 1, Average Loss: 20612.8199


ValueError: Found input variables with inconsistent numbers of samples: [51578, 58510]

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support
import time
from torch.utils.tensorboard import SummaryWriter

def argmax(vec):
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix.get(w, to_ix.get('<UNK>', 0)) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))


# Hamming distance calculation
def hamming_distance(y_pred, y_gold):
    """Calculate hamming distance between predicted and gold sequences"""
    return sum(y1 != y2 for y1, y2 in zip(y_pred, y_gold))


class BiLSTM_CRF(nn.Module): 

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, char_embedding_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        self.hidden = self.init_hidden()
        self.char_embed = nn.Embedding(10, char_embedding_dim)
        self.char_cnn = nn.Conv2d(in_channels=1, out_channels=char_embedding_dim, kernel_size=(1, char_embedding_dim))

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas
        device_info = feats.device 
        forward_var = forward_var.to(device_info)
        
        for feat in feats:
            alphas_t = []  
            for next_tag in range(self.tagset_size):
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                trans_score = self.transitions[next_tag].view(1, -1)
                next_tag_var = forward_var + trans_score + emit_score
         
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence): 
        self.hidden = self.init_hidden()
        sentence = sentence.to(self.word_embeds.weight.device)
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        self.hidden = tuple(h.to(embeds.device) for h in self.hidden)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden) 
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        score = torch.zeros(1, device=feats.device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long, device=feats.device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats, gold_tags=None, cost_scale=0):
        """
        Run Viterbi algorithm to get the best tag sequence.
        With cost_scale > 0, it performs cost-augmented inference.
        """
        backpointers = []

        init_vvars = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        forward_var = init_vvars
        for i, feat in enumerate(feats):
            bptrs_t = [] 
            viterbivars_t = []  

            for next_tag in range(self.tagset_size):
                next_tag_var = forward_var + self.transitions[next_tag]
                
                # Cost-augmented decoding: add cost when gold and predicted tags differ
                if cost_scale > 0 and gold_tags is not None and i < len(gold_tags):
                    cost = torch.zeros(self.tagset_size, device=feats.device)
                    # No cost for correct tag
                    # 10x cost (hamming distance) for incorrect tags
                    mask = torch.ones(self.tagset_size, device=feats.device)
                    mask[gold_tags[i]] = 0
                    cost = cost_scale * mask  # Apply cost to incorrect tags
                    next_tag_var = next_tag_var + cost
                
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)

        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]
        best_path.reverse()
        return path_score, best_path

    def structured_svm_loss(self, sentence, tags, cost_scale=10.0):
        """
        Structured SVM loss with cost-augmented decoding.
        L(x,y,w) = max_{y'} [score(x,y',w) + cost(y,y')] - score(x,y,w)
        """
        feats = self._get_lstm_features(sentence)
        gold_score = self._score_sentence(feats, tags)
        
        # Convert tags tensor to list for cost-augmented decoding
        gold_tags_list = tags.cpu().tolist()
        
        # Cost-augmented decoding
        cost_augmented_score, predicted_tags = self._viterbi_decode(feats, gold_tags_list, cost_scale)
        
        # Margin-based loss
        margin = cost_augmented_score - gold_score
        
        return margin if margin > 0 else torch.zeros_like(margin)

    def forward(self, sentence):  
        lstm_feats = self._get_lstm_features(sentence)
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

def make_data_point(sent):
    """
        Creates a dictionary from String to an Array of Strings representing the data.  
    """
    dic = {}
    sent = [s.strip().split() for s in sent]
    dic['tokens'] = ['<START>'] + [s[0] for s in sent] + ['<STOP>']
    dic['pos'] = ['<START>'] + [s[1] for s in sent] + ['<STOP>']
    dic['NP_chunk'] = ['<START>'] + [s[2] for s in sent] + ['<STOP>']
    dic['gold_tags'] = ['<START>'] + [s[3] for s in sent] + ['<STOP>']
    return dic

def read_data(filename):
    """
    Reads the CoNLL 2003 data into an array of dictionaries (a dictionary for each data point).
    """
    data = []
    with open(filename, 'r') as f:
        sent = []
        for line in f.readlines():
            if line.strip():
                sent.append(line)
            else:
                data.append(make_data_point(sent))
                sent = []
        data.append(make_data_point(sent))

    return data

def compute_metrics(predictions, gold_standards, tag_2_idx):
    """
    Compute precision, recall and F1 score
    
    This version handles different lengths safely by truncating or padding as needed
    """
    idx_to_tag = {v: k for k, v in tag_2_idx.items()}
    
    filtered_pred = []
    filtered_gold = []
    
    # Handle different sequence lengths appropriately
    for pred_seq, gold_seq in zip(predictions, gold_standards):
        # Filter out special tags and convert to actual tag names
        pred_tags = [idx_to_tag[idx] for idx in pred_seq 
                    if idx_to_tag[idx] not in [START_TAG, STOP_TAG]]
        gold_tags = [idx_to_tag[idx] for idx in gold_seq 
                    if idx_to_tag[idx] not in [START_TAG, STOP_TAG]]
        
        # Handle different lengths: truncate to the shorter length
        min_len = min(len(pred_tags), len(gold_tags))
        pred_tags = pred_tags[:min_len]
        gold_tags = gold_tags[:min_len]
        
        filtered_pred.extend(pred_tags)
        filtered_gold.extend(gold_tags)
    
    # Calculate metrics
    labels = [k for k in tag_2_idx.keys() if k not in [START_TAG, STOP_TAG, 'O']]
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        filtered_gold, filtered_pred, labels=labels, average='micro', zero_division=0)
    
    return precision, recall, f1

def evaluate_model(model, data, word_2_idx, tag_2_idx):
    """
    Evaluate model on data and return predictions and metrics
    """
    model.eval()
    predictions = []
    gold_standards = []
    
    with torch.no_grad():
        for example in tqdm(data, desc="Evaluating"):
            # Prepare sentence and gold tags
            sentence = prepare_sequence(example['tokens'], word_2_idx).to(next(model.parameters()).device)
            gold_tags = [tag_2_idx[t] for t in example['gold_tags']]
            gold_standards.append(gold_tags)
            
            # Get predictions
            _, predicted_tags = model(sentence)
            predictions.append(predicted_tags)
    
    precision, recall, f1 = compute_metrics(predictions, gold_standards, tag_2_idx)
    return predictions, precision, recall, f1

def generate_output_file(predictions, data, tag_2_idx, filename):
    """
    Generate output file in the required format
    """
    idx_to_tag = {v: k for k, v in tag_2_idx.items()}
    
    with open(filename, 'w') as f:
        for pred_tags, example in zip(predictions, data):
            # Convert tag indices to tag names
            tag_names = [idx_to_tag[tag_id] for tag_id in pred_tags 
                         if idx_to_tag[tag_id] not in [START_TAG, STOP_TAG]]
            
            # Get tokens (excluding special tokens)
            tokens = [t for t in example['tokens'] if t not in [START_TAG, STOP_TAG]]
            
            # Ensure lengths match by truncating to shorter length
            min_len = min(len(tokens), len(tag_names))
            tokens = tokens[:min_len]
            tag_names = tag_names[:min_len]
            
            # Write in the required format
            for token, tag in zip(tokens, tag_names):
                f.write(f"{token} {tag}\n")
            f.write("\n")  # Empty line between sentences

def generate_minibatches(training_data, batch_size):
    """
    Generate minibatches for training
    """
    # Shuffle data first for better training
    indices = list(range(len(training_data)))
    np.random.shuffle(indices)
    shuffled_data = [training_data[i] for i in indices]
    
    minibatches = []
    for i in range(0, len(shuffled_data), batch_size):
        minibatch = shuffled_data[i:i + batch_size]
        minibatches.append(minibatch)
    return minibatches

# Constants
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 40
HIDDEN_DIM = 40
CHAR_EMBEDDING_DIM = 4

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
dev_data = read_data('ner.dev')
test_data = read_data('ner.test')
train_data = read_data('ner.train')
print(f"Train size: {len(train_data)}, Dev size: {len(dev_data)}, Test size: {len(test_data)}")

# Create dictionaries
word_2_idx = {}
# Add special token for unknown words
word_2_idx['<UNK>'] = 0

for sentence in train_data:
    for word in sentence['tokens']:
        if word not in word_2_idx:
            word_2_idx[word] = len(word_2_idx)

tag_2_idx = {}
for sentence in train_data + dev_data + test_data:
    for word in sentence['gold_tags']:
        if word not in tag_2_idx:
            tag_2_idx[word] = len(tag_2_idx)

print(f"Vocabulary size: {len(word_2_idx)}, Tag set size: {len(tag_2_idx)}")

# Initialize tensorboard writer
writer = SummaryWriter('runs/structured_svm')

# Function to train with structured SVM and early stopping
def train_structured_svm(model, train_data, dev_data, learning_rates, reg_strengths, 
                         cost_scale=10.0, batch_size=32, max_epochs=30, patience=3):
    """
    Train a model using structured SVM with early stopping
    """
    results = {}
    best_f1 = 0
    best_model = None
    best_lr = None
    best_reg = None
    
    # Try different combinations of learning rates and regularization strengths
    for lr in learning_rates:
        for reg_strength in reg_strengths:
            print(f"\n=== Training with lr={lr}, reg_strength={reg_strength} ===")
            
            # Initialize a new model for each combination
            current_model = BiLSTM_CRF(len(word_2_idx), tag_2_idx, EMBEDDING_DIM, HIDDEN_DIM, CHAR_EMBEDDING_DIM)
            current_model.to(device)
            
            # Setup optimizer
            optimizer = optim.SGD(current_model.parameters(), lr=lr)
            
            # Variables for early stopping
            best_dev_f1 = 0
            epochs_no_improve = 0
            epoch_results = []
            
            for epoch in range(max_epochs):
                # Training
                current_model.train()
                total_loss = 0
                minibatches = generate_minibatches(train_data, batch_size)
                
                for i, minibatch in enumerate(tqdm(minibatches, desc=f"Epoch {epoch+1}/{max_epochs}")):
                    current_model.zero_grad()
                    
                    # Calculate loss for each sentence in the batch
                    batch_loss = 0
                    for sentence in minibatch:
                        # Skip empty sentences if any
                        if len(sentence['tokens']) <= 2:  # Only START and STOP
                            continue
                            
                        sentence_in = prepare_sequence(sentence['tokens'], word_2_idx).to(device)
                        tags = torch.tensor([tag_2_idx[t] for t in sentence['gold_tags']], 
                                            dtype=torch.long).to(device)
                        
                        # Skip if lengths don't match (should not happen)
                        if len(sentence_in) != len(tags):
                            print(f"Warning: Sentence length mismatch: {len(sentence_in)} vs {len(tags)}")
                            continue
                        
                        # Structured SVM loss
                        loss = current_model.structured_svm_loss(sentence_in, tags, cost_scale)
                        batch_loss += loss
                    
                    # Skip empty batches
                    if batch_loss == 0:
                        continue
                        
                    # Average loss over batch
                    batch_loss = batch_loss / len(minibatch)
                    
                    # Add L2 regularization
                    l2_reg = 0.0
                    for param in current_model.parameters():
                        l2_reg += torch.norm(param, 2)
                    batch_loss += reg_strength * l2_reg
                    
                    # Backpropagation
                    batch_loss.backward()
                    optimizer.step()
                    
                    total_loss += batch_loss.item()
                    
                    # Log training loss
                    writer.add_scalar(f'Loss/train_lr{lr}_reg{reg_strength}', 
                                      batch_loss.item(), 
                                      epoch * len(minibatches) + i)
                
                avg_loss = total_loss / len(minibatches) if minibatches else 0
                print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")
                
                # Evaluate on dev set
                dev_preds, dev_precision, dev_recall, dev_f1 = evaluate_model(
                    current_model, dev_data, word_2_idx, tag_2_idx)
                
                print(f"Dev - Precision: {dev_precision:.4f}, Recall: {dev_recall:.4f}, F1: {dev_f1:.4f}")
                
                # Log dev metrics
                writer.add_scalar(f'Precision/dev_lr{lr}_reg{reg_strength}', dev_precision, epoch)
                writer.add_scalar(f'Recall/dev_lr{lr}_reg{reg_strength}', dev_recall, epoch)
                writer.add_scalar(f'F1/dev_lr{lr}_reg{reg_strength}', dev_f1, epoch)
                
                # Save epoch results
                epoch_results.append({
                    'epoch': epoch + 1,
                    'loss': avg_loss,
                    'dev_precision': dev_precision,
                    'dev_recall': dev_recall,
                    'dev_f1': dev_f1
                })
                
                # Check for improvement
                if dev_f1 > best_dev_f1:
                    best_dev_f1 = dev_f1
                    epochs_no_improve = 0
                    # Save model
                    model_path = f'bilstm_crf_svm_lr{lr}_reg{reg_strength}_epoch{epoch+1}.pth'
                    torch.save(current_model.state_dict(), model_path)
                    print(f"Model saved to {model_path}")
                else:
                    epochs_no_improve += 1
                    if epochs_no_improve >= patience:
                        print(f"Early stopping at epoch {epoch+1}")
                        break
            
            # Get best epoch result
            best_epoch_idx = max(range(len(epoch_results)), 
                                key=lambda i: epoch_results[i]['dev_f1'])
            best_epoch_result = epoch_results[best_epoch_idx]
            
            # Record results for this configuration
            results[(lr, reg_strength)] = {
                'best_epoch': best_epoch_result['epoch'],
                'best_precision': best_epoch_result['dev_precision'],
                'best_recall': best_epoch_result['dev_recall'],
                'best_f1': best_epoch_result['dev_f1'],
                'epoch_results': epoch_results
            }
            
            # Update overall best model if this configuration is better
            if best_epoch_result['dev_f1'] > best_f1:
                best_f1 = best_epoch_result['dev_f1']
                best_lr = lr
                best_reg = reg_strength
                
                # Load the best model from this configuration
                best_model = BiLSTM_CRF(len(word_2_idx), tag_2_idx, EMBEDDING_DIM, HIDDEN_DIM, CHAR_EMBEDDING_DIM)
                best_model.load_state_dict(torch.load(
                    f'bilstm_crf_svm_lr{lr}_reg{reg_strength}_epoch{best_epoch_result["epoch"]}.pth'))
                best_model.to(device)
    
    return best_model, best_lr, best_reg, best_f1, results

# Function to plot results
def plot_results(results, learning_rates, reg_strengths):
    """
    Plot results of parameter tuning
    """
    # Create a matrix of F1 scores
    f1_matrix = np.zeros((len(learning_rates), len(reg_strengths)))
    
    for i, lr in enumerate(learning_rates):
        for j, reg in enumerate(reg_strengths):
            f1_matrix[i, j] = results.get((lr, reg), {}).get('best_f1', 0)
    
    plt.figure(figsize=(10, 8))
    plt.imshow(f1_matrix, interpolation='nearest', cmap='viridis')
    plt.title('F1 Score by Learning Rate and Regularization Strength')
    plt.xlabel('Regularization Strength')
    plt.ylabel('Learning Rate')
    
    # Add colorbar
    plt.colorbar(label='F1 Score')
    
    # Add axis labels
    plt.xticks(np.arange(len(reg_strengths)), reg_strengths)
    plt.yticks(np.arange(len(learning_rates)), learning_rates)
    
    # Add text annotations
    for i in range(len(learning_rates)):
        for j in range(len(reg_strengths)):
            plt.text(j, i, f'{f1_matrix[i, j]:.3f}', 
                     ha="center", va="center", color="white" if f1_matrix[i, j] < 0.5 else "black")
    
    plt.tight_layout()
    plt.savefig('parameter_tuning_results.png')
    plt.close()
    
    # Plot precision, recall, and F1 for best configuration
    best_config = max(results.keys(), key=lambda k: results[k]['best_f1'])
    if best_config:
        best_lr, best_reg = best_config
        epoch_results = results[(best_lr, best_reg)]['epoch_results']
        
        epochs = [r['epoch'] for r in epoch_results]
        precision = [r['dev_precision'] for r in epoch_results]
        recall = [r['dev_recall'] for r in epoch_results]
        f1 = [r['dev_f1'] for r in epoch_results]
        loss = [r['loss'] for r in epoch_results]
        
        plt.figure(figsize=(12, 5))
        
        plt.subplot(1, 2, 1)
        plt.plot(epochs, precision, marker='o', label='Precision')
        plt.plot(epochs, recall, marker='s', label='Recall')
        plt.plot(epochs, f1, marker='^', label='F1')
        plt.xlabel('Epoch')
        plt.ylabel('Score')
        plt.title(f'Metrics (lr={best_lr}, reg={best_reg})')
        plt.legend()
        plt.grid(True)
        
        plt.subplot(1, 2, 2)
        plt.plot(epochs, loss, marker='o', color='red')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training Loss')
        plt.grid(True)
        
        plt.tight_layout()
        plt.savefig('best_model_training_curves.png')
        plt.close()

# Main execution
if __name__ == "__main__":
    # Parameters to tune - using more conservative values
    learning_rates = [0.001]
    reg_strengths = [0.01, 0.001, 0.05]
    
    # Train model with structured SVM and early stopping
    best_model, best_lr, best_reg, best_f1, results = train_structured_svm(
        None, train_data, dev_data, 
        learning_rates, reg_strengths, 
        cost_scale=10.0, batch_size=1024,  
        max_epochs=5, patience=1
    )
    
    # Print tuning results
    print("\nParameter Tuning Results:")
    for (lr, reg), result in results.items():
        print(f"lr={lr}, reg={reg}: Precision={result['best_precision']:.4f}, "
              f"Recall={result['best_recall']:.4f}, F1={result['best_f1']:.4f}")
    
    print(f"\nBest Configuration: lr={best_lr}, reg={best_reg}, F1={best_f1:.4f}")
    
    # Plot results
    plot_results(results, learning_rates, reg_strengths)
    
    # Evaluate best model on test set
    print("\nEvaluating best model on test set...")
    test_preds, test_precision, test_recall, test_f1 = evaluate_model(
        best_model, test_data, word_2_idx, tag_2_idx)
    
    print(f"Test Results: Precision={test_precision:.4f}, Recall={test_recall:.4f}, F1={test_f1:.4f}")
    
    # Generate output files
    # Get predictions for dev set
    dev_preds, _, _, _ = evaluate_model(best_model, dev_data, word_2_idx, tag_2_idx)
    
    # Generate output files
    generate_output_file(dev_preds, dev_data, tag_2_idx, 'dev_predictions_svm.txt')
    generate_output_file(test_preds, test_data, tag_2_idx, 'test_predictions_svm.txt')
    
    # Save best model
    torch.save(best_model.state_dict(), 'best_bilstm_crf_svm_model.pth')
    
    # Generate report
    with open('structured_svm_report.txt', 'w') as f:
        f.write("# Structured SVM Training Report\n\n")
        
        f.write("## Model Description\n")
        f.write("The model is a BiLSTM-CRF with structured SVM training using a Hamming distance cost function.\n")
        f.write(f"- Embedding dimension: {EMBEDDING_DIM}\n")
        f.write(f"- Hidden dimension: {HIDDEN_DIM}\n")
        f.write(f"- Cost scale: 10.0 (multiplier for Hamming distance)\n\n")
        
        f.write("## Training Details\n")
        f.write("- Implemented early stopping based on F1 score on dev set\n")
        f.write("- Used L2 regularization\n")
        f.write("- Cost-augmented decoding during training\n\n")
        
        f.write("## Parameter Tuning Results\n")
        f.write("| Learning Rate | Reg Strength | Precision | Recall | F1 |\n")
        f.write("|---------------|--------------|-----------|--------|----|\n")
        
        for (lr, reg), result in results.items():
            f.write(f"| {lr} | {reg} | {result['best_precision']:.4f} | {result['best_recall']:.4f} | {result['best_f1']:.4f} |\n")
        
        f.write(f"\nBest configuration: lr={best_lr}, reg={best_reg}, F1={best_f1:.4f}\n\n")
        
        f.write("## Final Results\n")
        f.write(f"Dev set: Precision={best_f1:.4f}, Recall={results[(best_lr, best_reg)]['best_recall']:.4f}, F1={results[(best_lr, best_reg)]['best_f1']:.4f}\n")
        f.write(f"Test set: Precision={test_precision:.4f}, Recall={test_recall:.4f}, F1={test_f1:.4f}\n")
        
    print("Done! See structured_svm_report.txt for complete results.")

Using device: cuda
Train size: 14987, Dev size: 3466, Test size: 3684
Vocabulary size: 23627, Tag set size: 10

=== Training with lr=0.001, reg_strength=0.01 ===


Epoch 1/5: 100%|██████████| 15/15 [11:50<00:00, 47.37s/it]


Epoch 1, Average Loss: 20604.8952


Evaluating: 100%|██████████| 3466/3466 [00:29<00:00, 118.65it/s]


Dev - Precision: 0.0452, Recall: 0.1695, F1: 0.0714
Model saved to bilstm_crf_svm_lr0.001_reg0.01_epoch1.pth


Epoch 2/5: 100%|██████████| 15/15 [11:50<00:00, 47.35s/it]


Epoch 2, Average Loss: 20600.7046


Evaluating: 100%|██████████| 3466/3466 [00:29<00:00, 118.70it/s]


Dev - Precision: 0.0957, Recall: 0.0378, F1: 0.0542
Early stopping at epoch 2

=== Training with lr=0.001, reg_strength=0.001 ===


Epoch 1/5: 100%|██████████| 15/15 [11:50<00:00, 47.38s/it]


Epoch 1, Average Loss: 20231.9301


Evaluating: 100%|██████████| 3466/3466 [00:29<00:00, 118.76it/s]


Dev - Precision: 0.0238, Recall: 0.1430, F1: 0.0409
Model saved to bilstm_crf_svm_lr0.001_reg0.001_epoch1.pth


Epoch 2/5: 100%|██████████| 15/15 [11:52<00:00, 47.49s/it]


Epoch 2, Average Loss: 20225.9436


Evaluating: 100%|██████████| 3466/3466 [00:29<00:00, 117.88it/s]


Dev - Precision: 0.0241, Recall: 0.1447, F1: 0.0414
Model saved to bilstm_crf_svm_lr0.001_reg0.001_epoch2.pth


Epoch 3/5: 100%|██████████| 15/15 [11:51<00:00, 47.44s/it]


Epoch 3, Average Loss: 20220.0174


Evaluating: 100%|██████████| 3466/3466 [00:28<00:00, 119.76it/s]


Dev - Precision: 0.0242, Recall: 0.1447, F1: 0.0415
Model saved to bilstm_crf_svm_lr0.001_reg0.001_epoch3.pth


Epoch 4/5: 100%|██████████| 15/15 [11:57<00:00, 47.85s/it]


Epoch 4, Average Loss: 20214.1664


Evaluating: 100%|██████████| 3466/3466 [00:30<00:00, 115.11it/s]


Dev - Precision: 0.0243, Recall: 0.1431, F1: 0.0415
Model saved to bilstm_crf_svm_lr0.001_reg0.001_epoch4.pth


Epoch 5/5: 100%|██████████| 15/15 [10:53<00:00, 43.59s/it]


Epoch 5, Average Loss: 20208.4385


Evaluating: 100%|██████████| 3466/3466 [00:30<00:00, 115.16it/s]


Dev - Precision: 0.0267, Recall: 0.1420, F1: 0.0450
Model saved to bilstm_crf_svm_lr0.001_reg0.001_epoch5.pth

=== Training with lr=0.001, reg_strength=0.05 ===


Epoch 1/5: 100%|██████████| 15/15 [10:55<00:00, 43.72s/it]


Epoch 1, Average Loss: 22382.2596


Evaluating: 100%|██████████| 3466/3466 [00:39<00:00, 87.35it/s] 


Dev - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Early stopping at epoch 1

Parameter Tuning Results:
lr=0.001, reg=0.01: Precision=0.0452, Recall=0.1695, F1=0.0714
lr=0.001, reg=0.001: Precision=0.0267, Recall=0.1420, F1=0.0450
lr=0.001, reg=0.05: Precision=0.0000, Recall=0.0000, F1=0.0000

Best Configuration: lr=0.001, reg=0.01, F1=0.0714

Evaluating best model on test set...


Evaluating: 100%|██████████| 3684/3684 [00:37<00:00, 99.43it/s] 


Test Results: Precision=0.0450, Recall=0.1590, F1=0.0702


Evaluating: 100%|██████████| 3466/3466 [00:39<00:00, 87.20it/s] 


Done! See structured_svm_report.txt for complete results.
