In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score

# Load data
def load_data(train_path='/content/hw2_train.csv', test_path='/content/hw2_test.csv'):
    """Read the training and test CSV files into DataFrames.

    Args:
        train_path: Location of the training CSV. Defaults to the path the
            script has always used.
        test_path: Location of the test CSV. Defaults to the path the script
            has always used.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.

    Raises:
        FileNotFoundError: If either file does not exist (raised by pandas).
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

train_df, test_df = load_data()

# Whitespace-tokenise every utterance and every tag string so that each row
# becomes a list of tokens (or a list of tags for the training labels).
train_utterances = [text.split() for text in train_df['utterances'].tolist()]
test_utterances = [text.split() for text in test_df['utterances'].tolist()]
train_tags = train_df['IOB Slot tags'].apply(lambda cell: cell.split()).tolist()

# Create vocabularies
def create_vocab(utterances, tags):
    """Build deterministic token and tag index maps with index 0 reserved.

    Index 0 is kept for the ``'<PAD>'`` entry in both maps, so that a padding
    value of 0 (and a loss that ignores target 0) never collides with a real
    token or tag. Real symbols are numbered from 1 in sorted order, which
    makes the mapping reproducible across runs instead of depending on set
    iteration order.

    Args:
        utterances: Iterable of token lists.
        tags: Iterable of tag lists.

    Returns:
        A ``(token_vocab, tag_vocab)`` tuple of ``dict[str, int]``. In each
        dict the insertion order matches the index order.
    """
    token_vocab = {'<PAD>': 0}
    for word in sorted({word for utterance in utterances for word in utterance}):
        # setdefault keeps index 0 for '<PAD>' even if that literal string
        # happens to occur in the data.
        token_vocab.setdefault(word, len(token_vocab))

    tag_vocab = {'<PAD>': 0}
    for tag in sorted({tag for tag_seq in tags for tag in tag_seq}):
        tag_vocab.setdefault(tag, len(tag_vocab))

    return token_vocab, tag_vocab

# Build the token and tag index maps from the training split only.
token_vocab, tag_vocab = create_vocab(train_utterances, train_tags)

# Dataset class
class SlotTaggingDataset(Dataset):
    """Dataset pairing tokenised utterances with their tag sequences.

    Each item is returned as two ``torch.long`` tensors: the token indices
    and the tag indices. Symbols missing from a vocabulary map to index 0.
    """

    def __init__(self, utterances, tags, token_vocab, tag_vocab):
        self.utterances = utterances
        self.tags = tags
        self.token_vocab = token_vocab
        self.tag_vocab = tag_vocab

    def __len__(self):
        """Return the number of utterances held by the dataset."""
        return len(self.utterances)

    @staticmethod
    def _encode(symbols, vocab):
        """Map symbols to indices (0 when unknown) as a long tensor."""
        indices = [vocab.get(symbol, 0) for symbol in symbols]
        return torch.tensor(indices, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(input_tensor, target_tensor)`` for the item at ``idx``."""
        words = self.utterances[idx]
        labels = self.tags[idx]
        return self._encode(words, self.token_vocab), self._encode(labels, self.tag_vocab)

# Padding function
def collate_fn(batch):
    """Pad a batch of ``(input, target)`` tensor pairs to a common length.

    Both the inputs and the targets are right-padded with 0 and stacked
    batch-first.

    Returns:
        A ``(padded_inputs, padded_targets)`` tuple.
    """
    # zip(*batch) transposes the list of pairs into one column per field.
    padded_columns = [
        pad_sequence(column, batch_first=True, padding_value=0)
        for column in zip(*batch)
    ]
    padded_inputs, padded_targets = padded_columns
    return padded_inputs, padded_targets

# Model definition
class SlotTaggingLSTM(nn.Module):
    """Embedding -> LSTM -> linear tagger producing per-token tag logits.

    Args:
        input_dim: Size of the token vocabulary (number of embedding rows).
        hidden_dim: Width of both the embeddings and the LSTM hidden state.
        output_dim: Number of tag classes.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout applied between stacked LSTM layers. It is only
            passed to the LSTM when ``num_layers > 1``, because ``nn.LSTM``
            never applies it after the last layer and warns when a non-zero
            value is combined with a single layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        # With a single layer the dropout value would be a no-op that only
        # triggers a UserWarning, so fall back to 0.0 in that case.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            hidden_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return tag logits for a batch-first tensor of token indices.

        The output keeps the leading dimensions of ``x`` and appends a final
        dimension of size ``output_dim``.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out)
        return output

# Initialize model
def initialize_model(input_dim, output_dim):
    """Create the tagger together with its loss, optimizer and device.

    The model is a single-layer ``SlotTaggingLSTM`` with a hidden size of 128,
    moved to CUDA when available and to the CPU otherwise. The loss is
    cross-entropy that skips targets equal to 0, and the optimizer is Adam
    with a learning rate of 0.001.

    Returns:
        A ``(model, criterion, optimizer, device)`` tuple.
    """
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)

    model = SlotTaggingLSTM(
        input_dim,
        hidden_dim=128,
        output_dim=output_dim,
        num_layers=1,
        dropout=0.1,
    )
    model.to(device)

    loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    adam = optim.Adam(model.parameters(), lr=0.001)
    return model, loss_fn, adam, device

# Training function
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=10):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    After each epoch the mean batch loss is printed.

    Args:
        model: Module mapping a batch of token indices to per-token logits
            whose last dimension is the number of classes.
        train_loader: Iterable yielding ``(input_tensor, target_tensor)``
            batches.
        criterion: Loss taking ``(N, C)`` logits and ``(N,)`` targets.
        optimizer: Optimizer over the model's parameters.
        device: Device the batches are moved to before the forward pass.
        num_epochs: Number of passes over the data.
    """
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        num_batches = 0
        for input_tensor, target_tensor in train_loader:
            input_tensor, target_tensor = input_tensor.to(device), target_tensor.to(device)

            optimizer.zero_grad()
            output = model(input_tensor)

            # Flatten to (tokens, classes) using the logits' own class
            # dimension rather than a module-level vocabulary, so the loop
            # works for any model passed in.
            output = output.view(-1, output.shape[-1])
            target_tensor = target_tensor.view(-1)

            loss = criterion(output, target_tensor)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # An empty loader has no mean loss; report NaN instead of dividing by zero.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")

# Evaluation function
def evaluate_model(model, test_loader, device):
    """Predict tags for every batch and report a weighted F1 score.

    Batches are expected to be right-padded with 0 in both the inputs and the
    tags. Padded positions are removed before scoring and before the
    predictions are returned, so the result holds exactly one prediction per
    real token, in loader order. Callers can therefore split the flat list
    back into utterances by their true lengths.

    A row's length is taken as the last position where either the input
    index or the tag index is non-zero. A trailing token whose input index
    and tag index are both 0 cannot be told apart from padding and is
    dropped.

    Args:
        model: Module mapping a batch of token indices to per-token logits.
        test_loader: Iterable yielding ``(input_tensor, true_tags)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A flat list of predicted tag indices with padding removed.
    """
    model.eval()
    all_preds = []
    all_true = []

    with torch.no_grad():
        for input_tensor, true_tags in test_loader:
            if input_tensor.numel() == 0:
                # Nothing to predict for an empty batch.
                continue

            input_tensor = input_tensor.to(device)
            true_tags = true_tags.to(device)

            outputs = model(input_tensor)
            pred_tags = torch.argmax(outputs, dim=-1).cpu().numpy()

            # A position is real if either tensor carries a non-zero value there.
            occupied = ((input_tensor != 0) | (true_tags != 0)).cpu().tolist()
            true_rows = true_tags.cpu().numpy()

            for row_preds, row_true, row_mask in zip(pred_tags, true_rows, occupied):
                # Padding is only ever appended, so the real length ends at
                # the last occupied position.
                length = max((i + 1 for i, flag in enumerate(row_mask) if flag), default=0)
                all_preds.extend(row_preds[:length])
                all_true.extend(row_true[:length])

    # Scoring an empty label list is undefined; report NaN in that case.
    f1 = f1_score(all_true, all_preds, average='weighted') if all_true else float('nan')
    print(f"F1 Score: {f1:.3f}")
    return all_preds

# Prepare submission
def prepare_submission(all_preds, test_utterances, ids=None, tag_to_idx=None,
                       output_path='/content/slot_tagging_submission.csv'):
    """Write predicted tag strings for each test utterance to a CSV file.

    ``all_preds`` is consumed sequentially: the first ``len(test_utterances[0])``
    entries belong to the first utterance, and so on. It is therefore assumed
    to contain exactly one prediction per token, with no padding entries.

    Args:
        all_preds: Flat sequence of predicted tag indices.
        test_utterances: Token lists, used only for their lengths.
        ids: Values for the ``ID`` column. Defaults to the module-level
            ``test_df['ID']``.
        tag_to_idx: Mapping from tag string to index. Defaults to the
            module-level ``tag_vocab``.
        output_path: Destination CSV path. Defaults to the path the script
            has always written to.

    Raises:
        KeyError: If a predicted index has no tag in ``tag_to_idx``.
    """
    import os

    if ids is None:
        ids = test_df['ID']
    if tag_to_idx is None:
        tag_to_idx = tag_vocab

    # Build the reverse lookup once instead of re-listing the keys per token.
    idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

    tag_strings = []
    start_idx = 0
    for utterance in test_utterances:
        seq_len = len(utterance)
        pred_seq = all_preds[start_idx:start_idx + seq_len]
        # int() normalises numpy integer scalars before the dict lookup.
        tag_strings.append(' '.join(idx_to_tag[int(pred)] for pred in pred_seq))
        start_idx += seq_len

    submission_df = pd.DataFrame({
        'ID': ids,
        'IOB Slot Tags': tag_strings,
    })

    submission_df.to_csv(output_path, index=False)
    print(f"Submission file saved as '{os.path.basename(output_path)}'")

# Main code
# Wrap the training split in a Dataset and batch it with zero-padding; the
# loader shuffles the examples.
train_dataset = SlotTaggingDataset(train_utterances, train_tags, token_vocab, tag_vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# The vocabulary sizes are passed as the model's input and output dimensions.
model, criterion, optimizer, device = initialize_model(len(token_vocab), len(tag_vocab))

train_model(model, train_loader, criterion, optimizer, device, num_epochs=10)

# No gold tags are read for the test split in this script, so every token is
# given a placeholder 'O' tag to satisfy the Dataset interface.
# NOTE(review): evaluate_model scores predictions against these placeholders,
# so the printed F1 is presumably not a real quality measure — confirm.
test_tags = [['O'] * len(utterance) for utterance in test_utterances]
test_dataset = SlotTaggingDataset(test_utterances, test_tags, token_vocab, tag_vocab)
# shuffle=False keeps the batches in dataset order; prepare_submission pairs
# predictions with test_utterances purely by position.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

all_preds = evaluate_model(model, test_loader, device)

prepare_submission(all_preds, test_utterances)




Epoch 1, Loss: 1.7570036641425557
Epoch 2, Loss: 0.7685506749484274
Epoch 3, Loss: 0.4977883601354228
Epoch 4, Loss: 0.3466450195345614
Epoch 5, Loss: 0.2507910542190075
Epoch 6, Loss: 0.19179138779226276
Epoch 7, Loss: 0.15101299481466413
Epoch 8, Loss: 0.12378427701898748
Epoch 9, Loss: 0.10121867852285504
Epoch 10, Loss: 0.08482914655986759
F1 Score: 0.276
Submission file saved as 'slot_tagging_submission.csv'
