In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [2]:
data = pd.read_csv('/content/drive/MyDrive/Sem 8/NLP/Mammo.csv')
data.head()

Unnamed: 0,Features,Birads
0,Soft tissue mass lesion (23 x 20 mm) with spic...,4
1,Parenchyma is predominantly FATTY.No distinctl...,1
2,Parenchyma is predominantly GLANDULAR. No dist...,1
3,Parenchyma is predominantly GLANDULAR. No dist...,2
4,Parenchyma is GLANDULAR and FATTY. Small subce...,2


In [3]:
import re
def preprocess_generation(texts):
  corpus = []
  for i in range(len(texts)):
    sentence = texts.iloc[i]
    sentence = re.sub(r'([^\w\s])', r' \1 ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    corpus.append(sentence.lower())

  return corpus

texts = preprocess_generation(data['Features'])
len(texts)

107

In [4]:
train_text, temp_text = train_test_split(texts, test_size = 0.2, random_state = 42)
val_text, test_text = train_test_split(temp_text, test_size = 0.5, random_state = 42)

In [5]:
word_to_idx = {'<PAD>':0,'<UNK>':1}
for sentence in train_text:
  for word in sentence.split():
    if word not in word_to_idx.keys():
      word_to_idx[word] = len(word_to_idx.keys())

idx_to_word = {value:key for key,value in word_to_idx.items()}
vocab_size = len(word_to_idx.keys())
print(f'Vocab Size: {vocab_size}')

Vocab Size: 308


In [6]:
sequence_len = 2

def make_sequences(sentences, sequence_len = -1):
  sequences = []
  for sentence in sentences:
    sentence = sentence.split()
    if sequence_len == -1:
      for i in range(len(sentence)-1):
        sequences.append((sentence[:i+1], sentence[i+1]))

    else:
      sentence = ['<PAD>'] * sequence_len + sentence
      for i in range(len(sentence)-sequence_len):
        sequences.append((sentence[i:i+sequence_len], sentence[i+sequence_len]))

  return sequences

def pad_sequence(sequences, max_len):
  for i in range(len(sequences)):
    if len(sequences[i][0]) < max_len:
      sequences[i] = (['<PAD>'] * (max_len - len(sequences[i][0])) + sequences[i][0], sequences[i][1])
    else:
      sequences[i] = (sequences[i][0][-max_len:], sequences[i][1])

  return sequences


def encode(sequence):
  return [word_to_idx.get(word, word_to_idx['<UNK>']) for word in sequence]

def prepare_dataset(sequences):
  X = [encode(sequence) for sequence, _ in sequences]
  y = [word_to_idx.get(word, word_to_idx['<UNK>']) for _, word in sequences]
  X = torch.tensor(X, dtype=torch.long)
  y = torch.tensor(y, dtype=torch.long)

  return TensorDataset(X, y)


train_sequences = make_sequences(train_text, sequence_len)
max_len = max([len(sequence) for sequence,_ in train_sequences])
val_sequences = make_sequences(val_text, sequence_len)
test_sequences = make_sequences(test_text, sequence_len)

if sequence_len == -1:
  train_sequences = pad_sequence(train_sequences, max_len)
  val_sequences = pad_sequence(val_sequences, max_len)
  test_sequences = pad_sequence(test_sequences, max_len)

train_dataset = prepare_dataset(train_sequences)
val_dataset = prepare_dataset(val_sequences)
test_dataset = prepare_dataset(test_sequences)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [None]:
class NextWord(nn.Module):
  def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, rnn = True):
    super(NextWord, self).__init__()
    self.rnn = rnn
    self.Embedding = nn.Embedding(vocab_size, embedding_dim = embed_dim, padding_idx=0)
    if self.rnn:
      self.model = nn.RNN(embed_dim, hidden_dim, batch_first = True, num_layers = 1)
    else:
      self.model = nn.LSTM(embed_dim, hidden_dim, batch_first = True, num_layers = 1)

    self.linear = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    x = self.Embedding(x)
    if self.rnn:
      _, h_n = self.model(x)
    else:
      _, (h_n, _) = self.model(x)
    x = self.linear(h_n[-1])
    return x

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_rnn = NextWord(vocab_size, 100, 128, 1, True).to(device)

optim = torch.optim.Adam(model_rnn.parameters(), lr = 0.001)
loss = nn.CrossEntropyLoss()

In [None]:
def train(model, loader):
  model.train()
  total_loss = 0

  for X, y in loader:
    X = X.to(device)
    y = y.to(device)
    optim.zero_grad()

    output = model(X)
    losses = loss(output, y)
    total_loss += losses.item()
    losses.backward()
    optim.step()

  return total_loss / len(loader)


In [None]:
def evaluate(model, loader):
  model.eval()
  tot_loss = 0
  tot_correct = 0
  total = 0

  for X, y in loader:
    X = X.to(device)
    y = y.to(device)
    outputs = model(X)
    losses = loss(outputs, y)

    pred = outputs.argmax(axis = 1)
    tot_correct += (pred == y).sum().item()
    total += len(y)
    tot_loss += losses.item()

  return torch.math.exp(torch.tensor(tot_loss / len(loader))), tot_correct / total

In [None]:
for epoch in range(10):
  train_loss = train(model_rnn, train_loader)
  val_perp, val_acc = evaluate(model_rnn, val_loader)

  print(f'Epoch {epoch+1} => Val Perp = {val_perp:.4f} Val Accuracy = {val_acc:.4f}')

test_perp, test_acc = evaluate(model_rnn, test_loader)
print(f'Test Perp = {test_perp:.4f} Test Accuracy = {test_acc:.4f}')

Epoch 1 => Val Perp = 5.5975 Val Accuracy = 0.6597
Epoch 2 => Val Perp = 4.3715 Val Accuracy = 0.6930
Epoch 3 => Val Perp = 3.9895 Val Accuracy = 0.7250
Epoch 4 => Val Perp = 3.8743 Val Accuracy = 0.7078
Epoch 5 => Val Perp = 3.8378 Val Accuracy = 0.7139
Epoch 6 => Val Perp = 3.7858 Val Accuracy = 0.7016
Epoch 7 => Val Perp = 3.8500 Val Accuracy = 0.7115
Epoch 8 => Val Perp = 3.7441 Val Accuracy = 0.7164
Epoch 9 => Val Perp = 3.7785 Val Accuracy = 0.7115
Epoch 10 => Val Perp = 3.8880 Val Accuracy = 0.7102
Test Perp = 2.6236 Test Accuracy = 0.7457


In [None]:
model_rnn.eval()

seed_text = 'Parenchyma is a '
seed_seq = encode(seed_text.lower().split())
generation_len = 100

with torch.no_grad():
    for _ in range(generation_len):

        # LEFT PAD to max_len
        padded = [word_to_idx['<PAD>']] * (max_len - len(seed_seq)) + seed_seq
        padded = padded[-max_len:]

        seed_seq_t = torch.tensor(padded).unsqueeze(0).to(device)

        output = model_rnn(seed_seq_t)          # (1, vocab)
        next_word_idx = output.argmax(dim=1).item()

        seed_seq.append(next_word_idx)

sentence = ' '.join(idx_to_word[idx] for idx in seed_seq)
print(sentence)


parenchyma is a well defined radio opaque mass . no enlargement of axillary lymphnodes seen . normal . no enlargement of axillary lymphnodes seen . normal . no enlargement of axillary lymphnodes seen . normal . no enlargement of axillary lymphnodes seen . normal . no enlargement of axillary lymphnodes seen . normal . no enlargement of axillary lymphnodes seen . normal . no enlargement of axillary lymphnodes seen . normal . no enlargement of axillary lymphnodes seen . normal . no enlargement of axillary lymphnodes seen . normal . no enlargement of axillary lymphnodes seen . normal . no enlargement of axillary
