In [1]:
import warnings
import pandas as pd
import ast
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from sklearn.metrics import accuracy_score, classification_report

# Suppress all warnings
# NOTE(review): this blanket filter silences every warning raised for the rest
# of the session, including any emitted by the libraries used below, so
# diagnostics about data or metric problems will not be shown.
warnings.filterwarnings("ignore")

# Load and preprocess dataset
# The path is relative, so the CSV is resolved against the current working
# directory. The code below reads the columns 'Word', 'POS' and 'Tag' from it.
file_path = 'NER_Dataset.csv'
data = pd.read_csv(file_path)

def convert_string_to_list(row):
    """Parse the stringified list columns of one dataset row.

    The 'Word', 'POS' and 'Tag' entries of ``row`` are expected to hold
    Python literals stored as text; each is replaced in place by the value
    ``ast.literal_eval`` produces for it.

    Args:
        row: Mapping-like row (e.g. a pandas Series) supporting item get/set
            for the keys 'Word', 'POS' and 'Tag'.

    Returns:
        The same ``row`` object, after mutation.

    Raises:
        ValueError, SyntaxError: If an entry is not a valid Python literal.
    """
    for column in ('Word', 'POS', 'Tag'):
        row[column] = ast.literal_eval(row[column])
    return row

# Turn the stringified 'Word' / 'POS' / 'Tag' columns into real Python lists.
data = data.apply(convert_string_to_list, axis=1)

# Extract unique words and tags
unique_words = set()
unique_tags = set()
for _, row in data.iterrows():
    unique_words.update(row['Word'])
    unique_tags.update(row['Tag'])

# Assign indices in sorted order. Iterating a set of strings directly yields
# an order that depends on per-process hash randomisation, so enumerating the
# raw sets would give different word/tag indices on every run. That breaks
# reproducibility and makes saved weights unusable with a rebuilt vocabulary.
# key=str keeps the sort well-defined even if literal_eval produced a
# non-string token.
word_to_ix = {word: i for i, word in enumerate(sorted(unique_words, key=str), start=1)}
word_to_ix['<UNK>'] = 0  # Unknown words
tag_to_ix = {tag: i for i, tag in enumerate(sorted(unique_tags, key=str))}

# pad_collate pads tag sequences with the 'O' index, so that tag must exist
# even if the dataset never uses it.
if 'O' not in tag_to_ix:
    tag_to_ix['O'] = len(tag_to_ix)

# Reverse lookup used to turn predicted indices back into tag names.
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

class NERDataset(Dataset):
    """Index-encoded sentences paired with their index-encoded tag sequences.

    Encoding happens once, in the constructor; tensors are created lazily in
    ``__getitem__``.

    Args:
        sentences: Iterable of token sequences.
        tags: Iterable of tag sequences, parallel to ``sentences``.
        word_to_ix: Mapping from token to integer index. Tokens missing from
            the mapping are encoded as 0.
        tag_to_ix: Mapping from tag to integer index. A tag missing from the
            mapping raises ``KeyError``.
    """

    def __init__(self, sentences, tags, word_to_ix, tag_to_ix):
        encoded_sentences = []
        for sentence in sentences:
            # Out-of-vocabulary tokens fall back to index 0.
            encoded_sentences.append([word_to_ix.get(word, 0) for word in sentence])
        self.sentences = encoded_sentences

        encoded_tags = []
        for tag_seq in tags:
            encoded_tags.append([tag_to_ix[tag] for tag in tag_seq])
        self.tags = encoded_tags

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_indices, tag_indices)`` for sentence ``idx`` as long tensors."""
        word_ids = torch.tensor(self.sentences[idx], dtype=torch.long)
        tag_ids = torch.tensor(self.tags[idx], dtype=torch.long)
        return word_ids, tag_ids

def pad_collate(batch):
    """Collate ``(sentence, tags)`` tensor pairs into two padded batch tensors.

    Sentences are right-padded with index 0 and tag sequences with the index
    of the 'O' tag (read from the module-level ``tag_to_ix``), each to the
    length of the longest sequence in the batch.

    NOTE(review): padded tag positions are therefore indistinguishable from
    real 'O' labels downstream.

    Args:
        batch: Sequence of ``(sentence_tensor, tag_tensor)`` pairs.

    Returns:
        Tuple ``(sentences_padded, tags_padded)``, both batch-first.
    """
    word_seqs, tag_seqs = zip(*batch)
    return (
        pad_sequence(word_seqs, batch_first=True, padding_value=0),
        pad_sequence(tag_seqs, batch_first=True, padding_value=tag_to_ix['O']),
    )

class CNNForNER(nn.Module):
    """Per-token tagger: embedding -> 1-D convolution -> ReLU -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output tags per token.
        embedding_dim: Size of each word embedding.
        num_filters: Number of convolution output channels.
        kernel_size: Width of the convolution window, in tokens.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim=64, num_filters=128, kernel_size=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 'same' padding keeps the output sequence length equal to the input
        # length for every kernel size, so each token still gets exactly one
        # prediction. A fixed padding of 1 only achieves that for
        # kernel_size=3; for that default the two settings are identical.
        self.conv1d = nn.Conv1d(
            in_channels=embedding_dim,
            out_channels=num_filters,
            kernel_size=kernel_size,
            padding='same',
        )
        self.fc = nn.Linear(num_filters, tagset_size)

    def forward(self, sentences):
        """Return per-token log-probabilities over the tag set.

        Args:
            sentences: Long tensor of word indices, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, tagset_size) holding
            log-softmax scores along the last dimension.
        """
        x = self.embedding(sentences)
        x = x.permute(0, 2, 1)  # Conv1d expects (batch, channels, seq_len)
        x = self.conv1d(x)
        x = torch.relu(x)
        x = x.permute(0, 2, 1)  # Back to (batch, seq_len, channels)
        x = self.fc(x)
        return torch.log_softmax(x, dim=2)

# Prepare data for training and evaluation
# Read the columns directly instead of walking the frame twice with
# iterrows(); the resulting lists hold the same per-row list objects.
sentences = data['Word'].tolist()
tags = data['Tag'].tolist()

# Fixed random_state makes the 80/20 split repeatable across runs.
train_sentences, test_sentences, train_tags, test_tags = train_test_split(sentences, tags, test_size=0.2, random_state=42)

train_dataset = NERDataset(train_sentences, train_tags, word_to_ix, tag_to_ix)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)

test_dataset = NERDataset(test_sentences, test_tags, word_to_ix, tag_to_ix)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=pad_collate)

# Model, optimizer, and loss function
model = CNNForNER(len(word_to_ix), len(tag_to_ix))
optimizer = Adam(model.parameters())
# CrossEntropyLoss applies log-softmax itself. The model already returns
# log-probabilities, and log-softmax of log-probabilities leaves them
# unchanged, so the double application is redundant but does not alter the loss.
# NOTE(review): no ignore_index is set and pad_collate pads tags with the 'O'
# index, so padded positions contribute to the loss as if they were 'O' tokens.
loss_function = nn.CrossEntropyLoss()

# Function to evaluate the model
def evaluate_model(model, data_loader):
    """Compute token-level accuracy and a per-tag report for ``model``.

    The model is switched to eval mode and run without gradient tracking over
    every batch of ``data_loader``. Tag indices are translated to tag names
    through the module-level ``ix_to_tag`` before scoring.

    NOTE(review): every position of each padded batch is scored, so padded
    positions are included in the metrics.

    Args:
        model: Module mapping a (batch, seq_len) tensor of word indices to
            (batch, seq_len, n_tags) scores.
        data_loader: Iterable yielding ``(sentences, tags)`` tensor batches.

    Returns:
        Tuple ``(accuracy, report)``: a float and the text produced by
        ``sklearn.metrics.classification_report``.
    """
    model.eval()
    true_ix, pred_ix = [], []
    with torch.no_grad():
        for sentences, tags in data_loader:
            tag_scores = model(sentences)
            predictions = torch.argmax(tag_scores, dim=2)
            true_ix.extend(tags.reshape(-1).tolist())
            pred_ix.extend(predictions.reshape(-1).tolist())

    # Drop unknown indices pairwise. Filtering the two lists independently
    # could leave them with different lengths or misaligned positions.
    pairs = [
        (gold, pred)
        for gold, pred in zip(true_ix, pred_ix)
        if gold in ix_to_tag and pred in ix_to_tag
    ]
    true_tags = [ix_to_tag[gold] for gold, _ in pairs]
    pred_tags = [ix_to_tag[pred] for _, pred in pairs]

    accuracy = accuracy_score(true_tags, pred_tags)
    # The label set must use the same representation as the data being
    # scored. The sequences hold tag names, so pass names here; integer
    # indices would never match them.
    report = classification_report(true_tags, pred_tags, labels=list(tag_to_ix.keys()))

    return accuracy, report

# Training and evaluation loop
# Runs 10 epochs. Each epoch makes one optimisation pass over train_loader and
# then scores the model on test_loader.
# NOTE: the loop variables `sentences` and `tags` rebind the module-level
# lists of the same names created during data preparation.
for epoch in range(10):
    model.train()  # evaluate_model() leaves the model in eval mode
    total_loss = 0
    for sentences, tags in train_loader:
        optimizer.zero_grad()
        tag_scores = model(sentences)
        # Flatten (batch, seq_len, n_tags) scores and (batch, seq_len) targets
        # so the loss is computed per token.
        # NOTE(review): padded positions are included, because pad_collate
        # pads tags with the 'O' index and the loss has no ignore_index.
        loss = loss_function(tag_scores.view(-1, len(tag_to_ix)), tags.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # `report` is computed every epoch but only the accuracy is printed.
    accuracy, report = evaluate_model(model, test_loader)
    # The reported loss is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}, Accuracy: {accuracy}")


Epoch 1, Loss: 0.23542172638536296, Accuracy: 0.9646650997618852
Epoch 2, Loss: 0.10704171599772097, Accuracy: 0.9725115978323343
Epoch 3, Loss: 0.08113832480807767, Accuracy: 0.9754418466212332
Epoch 4, Loss: 0.06716599453821642, Accuracy: 0.9770250020527137
Epoch 5, Loss: 0.057652055732501956, Accuracy: 0.9779564208884145
Epoch 6, Loss: 0.050813708033372304, Accuracy: 0.978628684621069
Epoch 7, Loss: 0.045790899767881386, Accuracy: 0.9787621110107563
Epoch 8, Loss: 0.041367210593817136, Accuracy: 0.9787646769028656
Epoch 9, Loss: 0.0377495669393961, Accuracy: 0.9790315296822399
Epoch 10, Loss: 0.034818011379526545, Accuracy: 0.9794549018802857
