In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook
from sklearn.model_selection import train_test_split
from seqeval.metrics import classification_report, f1_score
from seqeval.scheme import IOB2
import os
import zipfile
import requests

# Install GloVe if not already present
if not os.path.exists('glove.6B.zip'):
    print("Downloading GloVe embeddings...")
    url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    response = requests.get(url)
    with open('glove.6B.zip', 'wb') as f:
        f.write(response.content)
    with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
        zip_ref.extractall()

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path):
    embeddings = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Check if GloVe is already loaded
glove_path = '/content/glove.6B.300d.txt'
if not os.path.exists(glove_path):
    print("GloVe file not found. Please make sure it is downloaded and extracted.")
else:
    glove_embeddings = load_glove_embeddings(glove_path)

embedding_dim = 300  # Dimensionality of word embeddings (e.g., 300d for GloVe)

# VocabularyEmbedding Class to handle word indices and padding
class VocabularyEmbedding:
    def __init__(self, glove_embeddings, embedding_dim=300):
        self.embedding_dim = embedding_dim
        self.embeddings = glove_embeddings
        self.pad_index = 0
        self.unk_index = 1
        self.word2idx = {'<pad>': self.pad_index, '<unk>': self.unk_index}
        self.idx2word = {self.pad_index: '<pad>', self.unk_index: '<unk>'}
        self.vectors = [np.zeros(embedding_dim), np.random.uniform(-1, 1, embedding_dim)]

        for word, vector in glove_embeddings.items():
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word
            self.vectors.append(vector)

        self.vectors = torch.FloatTensor(self.vectors)

    def encode(self, sentence):
        return [self.word2idx.get(word, self.unk_index) for word in sentence.split()]

    def create_padded_tensor(self, sequences):
        lengths = [len(seq) for seq in sequences]
        max_len = max(lengths)
        padded = torch.full((len(sequences), max_len), self.pad_index, dtype=torch.long)
        for i, seq in enumerate(sequences):
            padded[i, :len(seq)] = torch.tensor(seq, dtype=torch.long)
        return padded, lengths

# BIOTagSequencer for encoding tags
class BIOTagSequencer:
    def __init__(self, tag_corpus):
        self.word2idx = {'O': 0}
        self.idx2word = {0: 'O'}
        for tags in tag_corpus:
            for tag in tags.split():
                if tag not in self.word2idx:
                    idx = len(self.word2idx)
                    self.word2idx[tag] = idx
                    self.idx2word[idx] = tag

    def encode(self, tags):
        return [self.word2idx[tag] for tag in tags.split()]

    def create_padded_tensor(self, sequences):
        lengths = [len(seq) for seq in sequences]
        max_len = max(lengths)
        padded = torch.full((len(sequences), max_len), 0, dtype=torch.long)
        for i, seq in enumerate(sequences):
            padded[i, :len(seq)] = torch.tensor(seq, dtype=torch.long)
        return padded, lengths

# NERDataset for data loading
class NERDataset(Dataset):
    def __init__(self, data, text_sequencer, tag_sequencer):
        self.data = data
        self.text_sequencer = text_sequencer
        self.tag_sequencer = tag_sequencer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text, tags = self.data[index]
        x = self.text_sequencer.encode(text)
        y = self.tag_sequencer.encode(tags)
        return x, y

# Custom collate function for padding
def collate_fn(batch):
    texts, tags = zip(*batch)
    padded_texts, text_lengths = text_sequencer.create_padded_tensor(texts)
    padded_tags, _ = tag_sequencer.create_padded_tensor(tags)
    return padded_texts, padded_tags, text_lengths

# BiLSTM Model for NER
class BiLSTMModel(nn.Module):
    def __init__(self, output_size, embedding_dim, hidden_dim, vocab_size, embeddings):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_size)

    def forward(self, x, lengths):
        x = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        return self.fc(output)

# Initialize model, data, and training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
text_sequencer = VocabularyEmbedding(glove_embeddings)
train_csv_path = "/content/hw2_train.csv"
df = pd.read_csv(train_csv_path)
df['IOB Slot tags'] = df['IOB Slot tags'].replace(np.nan, "O")
item_list = [(row['utterances'], row['IOB Slot tags']) for _, row in df.iterrows()]
train_data, validation_data = train_test_split(item_list, test_size=0.1, random_state=42)
tag_sequencer = BIOTagSequencer([tags for _, tags in train_data])

train_dataset = NERDataset(train_data, text_sequencer, tag_sequencer)
val_dataset = NERDataset(validation_data, text_sequencer, tag_sequencer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

output_size = len(tag_sequencer.idx2word)
hidden_dim = 128
model = BiLSTMModel(output_size, embedding_dim, hidden_dim, len(text_sequencer.word2idx), text_sequencer.vectors).to(device)

optimizer = optim.Adam(model.parameters())
loss_function = nn.CrossEntropyLoss()

# Training loop
for epoch in range(50):
    model.train()
    for x_batch, y_batch, lengths in tqdm_notebook(train_loader, desc=f"Epoch {epoch+1}", leave=False):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        logits = model(x_batch, lengths)
        logits_flat = logits.view(-1, logits.shape[-1])
        y_batch_flat = y_batch.view(-1)
        loss = loss_function(logits_flat, y_batch_flat)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} completed. Loss: {loss.item()}")

# Evaluate model on validation data
def evaluate_model(model, data_loader, device, tag_sequencer):
    model.eval()
    true_tags = []
    pred_tags = []

    with torch.no_grad():
        for x_batch, y_batch, lengths in data_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            logits = model(x_batch, lengths)
            predictions = torch.argmax(logits, dim=2)

            for i in range(len(y_batch)):
                true_tag_seq = [tag_sequencer.idx2word[idx.item()] for idx in y_batch[i][:lengths[i]]]
                pred_tag_seq = [tag_sequencer.idx2word[idx.item()] for idx in predictions[i][:lengths[i]]]

                true_tags.append(true_tag_seq)
                pred_tags.append(pred_tag_seq)

    print("Classification Report:")
    print(classification_report(true_tags, pred_tags, scheme=IOB2))
    f1 = f1_score(true_tags, pred_tags)
    print(f"F1 Score: {f1:.4f}")

    return f1

# Evaluate on validation set
evaluate_model(model, val_loader, device, tag_sequencer)

# Generate submission file
def generate_submission(model, test_data, text_sequencer, tag_sequencer, device, output_csv='submission.csv'):
    model.eval()
    results = []

    with torch.no_grad():
        for idx, (text, _) in enumerate(test_data, start = 1):
            encoded_sentence = text_sequencer.encode(text)
            x_tensor = torch.tensor(encoded_sentence).unsqueeze(0).to(device)
            logits = model(x_tensor, [len(encoded_sentence)])
            predicted_tags = [tag_sequencer.idx2word[idx] for idx in torch.argmax(logits, dim=2)[0].tolist()]
            results.append((idx, ' '.join(predicted_tags)))

    pd.DataFrame(results, columns=["ID", "IOB Slot tags"]).to_csv(output_csv, index=False)

# Example usage for test data
test_df = pd.read_csv("/content/hw2_test.csv")  # Replace with actual test file path
test_data = [(row['utterances'], "") for _, row in test_df.iterrows()]
generate_submission(model, test_data, text_sequencer, tag_sequencer, device)


Epoch 1:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 1 completed. Loss: 2.0732531547546387


Epoch 2:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 2 completed. Loss: 1.2607998847961426


Epoch 3:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 3 completed. Loss: 1.7335968017578125


Epoch 4:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 4 completed. Loss: 1.5457748174667358


Epoch 5:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 5 completed. Loss: 1.17100191116333


Epoch 6:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 6 completed. Loss: 1.1227459907531738


Epoch 7:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 7 completed. Loss: 1.1112638711929321


Epoch 8:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 8 completed. Loss: 1.2086373567581177


Epoch 9:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 9 completed. Loss: 0.7875458002090454


Epoch 10:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 10 completed. Loss: 0.8569512367248535


Epoch 11:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 11 completed. Loss: 0.9171021580696106


Epoch 12:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 12 completed. Loss: 0.8637276887893677


Epoch 13:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 13 completed. Loss: 0.9278011918067932


Epoch 14:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 14 completed. Loss: 0.7929292321205139


Epoch 15:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 15 completed. Loss: 0.6668012738227844


Epoch 16:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 16 completed. Loss: 0.7513063549995422


Epoch 17:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 17 completed. Loss: 0.6880166530609131


Epoch 18:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 18 completed. Loss: 0.7294324040412903


Epoch 19:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 19 completed. Loss: 0.49792107939720154


Epoch 20:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 20 completed. Loss: 0.6661807894706726


Epoch 21:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 21 completed. Loss: 0.5303353667259216


Epoch 22:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 22 completed. Loss: 0.602485716342926


Epoch 23:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 23 completed. Loss: 0.47347763180732727


Epoch 24:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 24 completed. Loss: 0.6072441935539246


Epoch 25:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 25 completed. Loss: 0.3601517677307129


Epoch 26:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 26 completed. Loss: 0.4169030785560608


Epoch 27:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 27 completed. Loss: 0.38032224774360657


Epoch 28:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 28 completed. Loss: 0.2944180369377136


Epoch 29:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 29 completed. Loss: 0.3328174650669098


Epoch 30:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 30 completed. Loss: 0.2730823755264282


Epoch 31:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 31 completed. Loss: 0.20806452631950378


Epoch 32:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 32 completed. Loss: 0.2811059057712555


Epoch 33:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 33 completed. Loss: 0.16735254228115082


Epoch 34:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 34 completed. Loss: 0.23613087832927704


Epoch 35:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 35 completed. Loss: 0.22169356048107147


Epoch 36:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 36 completed. Loss: 0.19205079972743988


Epoch 37:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 37 completed. Loss: 0.1549486517906189


Epoch 38:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 38 completed. Loss: 0.18188335001468658


Epoch 39:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 39 completed. Loss: 0.15719760954380035


Epoch 40:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 40 completed. Loss: 0.10787095129489899


Epoch 41:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 41 completed. Loss: 0.16116875410079956


Epoch 42:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 42 completed. Loss: 0.1080017015337944


Epoch 43:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 43 completed. Loss: 0.1785353422164917


Epoch 44:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 44 completed. Loss: 0.10723414272069931


Epoch 45:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 45 completed. Loss: 0.1650683432817459


Epoch 46:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 46 completed. Loss: 0.15238900482654572


Epoch 47:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 47 completed. Loss: 0.08530358225107193


Epoch 48:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 48 completed. Loss: 0.12168790400028229


Epoch 49:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 49 completed. Loss: 0.11712031066417694


Epoch 50:   0%|          | 0/65 [00:00<?, ?it/s]

Epoch 50 completed. Loss: 0.11191779375076294
Classification Report:
               precision    recall  f1-score   support

        _cast       0.62      0.62      0.62         8
        _char       1.00      1.00      1.00         1
     _country       1.00      0.92      0.96        12
    _director       0.94      1.00      0.97        15
       _genre       0.90      0.90      0.90        10
    _language       0.92      1.00      0.96        12
       _movie       0.89      0.91      0.90       116
 _mpaa_rating       1.00      1.00      1.00        13
      _person       0.87      0.76      0.81        17
    _producer       0.79      0.61      0.69        18
_release_year       0.00      0.00      0.00         1
     _subject       0.69      0.82      0.75        11

    micro avg       0.88      0.88      0.88       234
    macro avg       0.80      0.80      0.80       234
 weighted avg       0.88      0.88      0.87       234

F1 Score: 0.8779


  _warn_prf(average, modifier, msg_start, len(result))
