In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dropout, Conv1D, Bidirectional, LSTM, Dense
from tensorflow.keras.layers import Layer, MultiHeadAttention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.backend as K
import numpy as np

class CustomAttention(Layer):
    def __init__(self, **kwargs):
        super(CustomAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")        
        super(CustomAttention, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        return K.sum(output, axis=1)

# Load the dataset
data = pd.read_csv('MT.csv')
data.fillna('', inplace=True)

# Prepare text data for tokenization
texts = pd.concat([data['src'], data['hyp'], data['tgt']])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert text columns to sequences of integers
hyp_sequences = tokenizer.texts_to_sequences(data['hyp'])
tgt_sequences = tokenizer.texts_to_sequences(data['tgt'])
max_len_hyp = max(len(seq) for seq in hyp_sequences)
max_len_tgt = max(len(seq) for seq in tgt_sequences)
max_len = max(max_len_hyp, max_len_tgt)

hyp_padded = pad_sequences(hyp_sequences, maxlen=max_len)
tgt_padded = pad_sequences(tgt_sequences, maxlen=max_len)

# Combine hyp and tgt sequences
X_combined = np.concatenate([hyp_padded, tgt_padded], axis=1)

# Prepare labels
labels = data['label'].values

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_combined, labels, test_size=0.2, random_state=42)

# Model definition
input_text = Input(shape=(2 * max_len,))
x = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100)(input_text)
x = Conv1D(64, 5, activation='relu')(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = MultiHeadAttention(num_heads=2, key_dim=128)(x, x)
x = CustomAttention()(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=input_text, outputs=output)

# Compile the model
model.compile(optimizer='SGD', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)
model.summary()

Epoch 1/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 91ms/step - accuracy: 0.7346 - loss: 0.6709 - val_accuracy: 0.7191 - val_loss: 0.6316
Epoch 2/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 74ms/step - accuracy: 0.7657 - loss: 0.6093 - val_accuracy: 0.7191 - val_loss: 0.6041
Epoch 3/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 78ms/step - accuracy: 0.7846 - loss: 0.5640 - val_accuracy: 0.7191 - val_loss: 0.5920
Epoch 4/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 81ms/step - accuracy: 0.7821 - loss: 0.5492 - val_accuracy: 0.7191 - val_loss: 0.5877
Epoch 5/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 73ms/step - accuracy: 0.7771 - loss: 0.5483 - val_accuracy: 0.7191 - val_loss: 0.5869
Epoch 6/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 85ms/step - accuracy: 0.7566 - loss: 0.5583 - val_accuracy: 0.7191 - val_loss: 0.5874
Epoch 7/10
[1m34/34[0m [32m━━━━

In [13]:
# predict accuracy on x_val
y_pred = model.predict(X_val)

# print accuracy

print("Accuracy: ", np.mean((y_pred > 0.5).astype(int) == y_val))

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 98ms/step
Accuracy:  0.7191011235955056


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load csv file
import pandas as pd

data = pd.read_csv('MT.csv')

# train test split
from sklearn.model_selection import train_test_split

mt_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


# Data Preparation
def yield_tokens(data):
    for sentence in data:
        yield sentence.split()

# Assuming 'mt_data' is your loaded dataset containing 'src', 'hyp', and 'label'
vocab = build_vocab_from_iterator(yield_tokens(mt_data['tgt'].tolist() + mt_data['hyp'].tolist()), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
vocab.set_default_index(vocab["<unk>"])

def encode(text, vocab, max_len):
    tokens = ["<bos>"] + text.split() + ["<eos>"]
    enc = [vocab[token] for token in tokens][:max_len]
    return torch.tensor(enc, dtype=torch.long)

max_length = 50
src_encoded = [encode(text, vocab, max_length) for text in mt_data['tgt']]
hyp_encoded = [encode(text, vocab, max_length) for text in mt_data['hyp']]
src_padded = pad_sequence(src_encoded, batch_first=True, padding_value=vocab["<pad>"])
hyp_padded = pad_sequence(hyp_encoded, batch_first=True, padding_value=vocab["<pad>"])
inputs = torch.cat((src_padded, hyp_padded), dim=1)

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(mt_data['label'])
labels = torch.tensor(labels, dtype=torch.long)

train_inputs, val_inputs, train_labels, val_labels = train_test_split(inputs, labels, test_size=0.2, random_state=42)
train_data = TensorDataset(train_inputs, train_labels)
train_loader = DataLoader(train_data, shuffle=True, batch_size=32)
val_data = TensorDataset(val_inputs, val_labels)
val_loader = DataLoader(val_data, shuffle=False, batch_size=32)

# Model Definition
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size, 1)

    def forward(self, outputs):
        attn_weights = torch.softmax(self.attn(outputs).squeeze(2), dim=1)
        context_vector = torch.bmm(outputs.transpose(1, 2), attn_weights.unsqueeze(2)).squeeze(2)
        return context_vector

class LSTMWithAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.attention = Attention(hidden_size)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        attn_out = self.attention(lstm_out)
        return self.fc(attn_out)

# Initialization
vocab_size = len(vocab)
embedding_dim = 100
model = LSTMWithAttention(vocab_size, embedding_dim, hidden_size=128, num_layers=1, num_classes=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Epoch {epoch+1}, Loss: {val_loss/len(val_loader)}, Accuracy: {100 * correct / total}%')




Epoch 1, Loss: 0.47746960180146353, Accuracy: 82.24299065420561%
Epoch 2, Loss: 0.4812048247882298, Accuracy: 82.24299065420561%
Epoch 3, Loss: 0.46919757979256765, Accuracy: 82.71028037383178%
Epoch 4, Loss: 0.47248134868485586, Accuracy: 79.4392523364486%
Epoch 5, Loss: 0.47714718324797495, Accuracy: 78.97196261682242%
Epoch 6, Loss: 0.523866091455732, Accuracy: 77.10280373831776%
Epoch 7, Loss: 0.659660381930215, Accuracy: 76.6355140186916%
Epoch 8, Loss: 0.7461389473506382, Accuracy: 72.42990654205607%
Epoch 9, Loss: 0.7817022204399109, Accuracy: 77.10280373831776%
Epoch 10, Loss: 0.8525574164731162, Accuracy: 76.16822429906541%


In [18]:
# predict on test

test_src_encoded = [encode(text, vocab, max_length) for text in test_data['tgt']]
test_hyp_encoded = [encode(text, vocab, max_length) for text in test_data['hyp']]
test_src_padded = pad_sequence(test_src_encoded, batch_first=True, padding_value=vocab["<pad>"])
test_hyp_padded = pad_sequence(test_hyp_encoded, batch_first=True, padding_value=vocab["<pad>"])

test_inputs = torch.cat((test_src_padded, test_hyp_padded), dim=1)
test_labels = label_encoder.transform(test_data['label'])

test_data = TensorDataset(test_inputs, torch.tensor(test_labels, dtype=torch.long))

test_loader = DataLoader(test_data, shuffle=False, batch_size=32)

model.eval()

correct = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).sum().item()

print(f'Test Accuracy: {100 * correct / len(test_data)}%')


Test Accuracy: 68.53932584269663%


In [19]:
# layers of model

print(model)

LSTMWithAttention(
  (embedding): Embedding(2453, 100)
  (lstm): LSTM(100, 128, batch_first=True)
  (attention): Attention(
    (attn): Linear(in_features=128, out_features=1, bias=True)
  )
  (fc): Linear(in_features=128, out_features=2, bias=True)
)


In [3]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('MT.csv')  # Make sure to update the path accoradingly

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encodings = tokenizer(data['src'].tolist() + data['hyp'].tolist(), truncation=True, padding=True, max_length=128)

# Extracting input_ids and attention_masks for splitting
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']

# Split data into training and validation sets for both inputs and labels
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids, data['label'].tolist(), test_size=0.1, random_state=42
)
train_masks, val_masks, _, _ = train_test_split(
    attention_mask, data['label'].tolist(), test_size=0.1, random_state=42
)

# Prepare the dataset class
class TranslationDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.input_ids[idx]),
            'attention_mask': torch.tensor(self.attention_mask[idx]),
            'labels': torch.tensor(self.labels[idx])
        }
        return item

    def __len__(self):
        return len(self.labels)

# Prepare pytorch datasets
train_dataset = TranslationDataset(train_inputs, train_masks, train_labels)
val_dataset = TranslationDataset(val_inputs, val_masks, val_labels)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Training loop
model.train()
for epoch in range(3):  # Number of epochs
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss {loss.item()}")

# Validation
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=-1)
        predictions.extend(predicted_labels.cpu().numpy())
        true_labels.extend(batch['labels'].numpy())

accuracy = accuracy_score(true_labels, predictions)
print(f'Validation Accuracy: {accuracy}')


ValueError: Found input variables with inconsistent numbers of samples: [2668, 1334]

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dropout, Conv1D, Bidirectional, LSTM, Dense
from tensorflow.keras.layers import Layer, MultiHeadAttention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.backend as K
import numpy as np

class CustomAttention(Layer):
    def __init__(self, **kwargs):
        super(CustomAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")        
        super(CustomAttention, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        return K.sum(output, axis=1)

# Load the dataset
data = pd.read_csv('train.model-agnostic_labeled.csv')
data.fillna('', inplace=True)

# Prepare text data for tokenization
texts = pd.concat([data['src'], data['hyp'], data['tgt']])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert text columns to sequences of integers
hyp_sequences = tokenizer.texts_to_sequences(data['hyp'])
tgt_sequences = tokenizer.texts_to_sequences(data['tgt'])
src_sequences = tokenizer.texts_to_sequences(data['src'])

max_len_hyp = max(len(seq) for seq in hyp_sequences)
max_len_tgt = max(len(seq) for seq in tgt_sequences)
max_len_src = max(len(seq) for seq in src_sequences)
max_len = max(max_len_hyp, max_len_tgt, max_len_src)

hyp_padded = pad_sequences(hyp_sequences, maxlen=max_len)
tgt_padded = pad_sequences(tgt_sequences, maxlen=max_len)
src_padded = pad_sequences(src_sequences, maxlen=max_len)

# Combine hyp, tgt, and src sequences
X_combined = np.concatenate([hyp_padded, tgt_padded, src_padded], axis=1)

# Prepare labels
labels = data['label'].values

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_combined, labels, test_size=0.2, random_state=42)

# Model definition
input_text = Input(shape=(3 * max_len,))
x = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100)(input_text)
x = Conv1D(64, 5, activation='relu')(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = MultiHeadAttention(num_heads=2, key_dim=128)(x, x)
x = CustomAttention()(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=input_text, outputs=output)

# Compile the model
model.compile(optimizer='SGD', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)
model.summary()


  data.fillna('', inplace=True)


Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 476ms/step - accuracy: 0.5502 - loss: 0.6923 - val_accuracy: 0.5150 - val_loss: 0.6915
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 430ms/step - accuracy: 0.5749 - loss: 0.6895 - val_accuracy: 0.5150 - val_loss: 0.6906
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 435ms/step - accuracy: 0.5663 - loss: 0.6866 - val_accuracy: 0.5150 - val_loss: 0.6900
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 427ms/step - accuracy: 0.5434 - loss: 0.6901 - val_accuracy: 0.5150 - val_loss: 0.6899
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 420ms/step - accuracy: 0.5373 - loss: 0.6905 - val_accuracy: 0.5150 - val_loss: 0.6899
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 423ms/step - accuracy: 0.5545 - loss: 0.6870 - val_accuracy: 0.5150 - val_loss: 0.6901
Epoch 7/10
[1m25/25[

In [23]:
# predict accuracy on x_val
y_pred = model.predict(X_val)

# print accuracy

print("Accuracy: ", np.mean((y_pred > 0.5).astype(int) == y_val))

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 208ms/step
Accuracy:  0.515
