In [11]:
# This cell takes a trained model built for max_len = 500 and produces a model for new_max_len = 1000.
# The architecture is the same; only the sequence-length-dependent batch-norm layers change size.
# All other weights are copied from the original model; the resized batch-norm layers start freshly initialised.

#This is the architecture of the model

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from util import *

# Sequence length used to size the BatchNorm1d layers in Model.
max_len = 500
# Sequence length of the resized model (NewModel); also passed to extract_features
# when the datasets are prepared.
new_max_len = 1000

class Model(nn.Module):
    """Per-position sequence tagger: embedding -> (Conv1d, BiLSTM) stack -> dense head.

    ``forward`` maps a batch of integer-encoded sequences of shape
    ``(batch, seq_len)`` to per-position scores of shape ``(batch, seq_len, 15)``.

    Every ``BatchNorm1d`` in this network is applied to a tensor laid out as
    ``(batch, seq_len, channels)``. ``BatchNorm1d`` normalises over dimension 1,
    so its ``num_features`` is the sequence length, not the channel count.
    As a consequence the model only accepts inputs whose length equals the
    length it was built for.

    Args:
        seq_len: Sequence length used to size the batch-norm layers. When
            omitted, the module-level ``max_len`` is used (the original
            behaviour), so existing ``Model()`` calls keep working.
    """

    def __init__(self, seq_len=None):
        super().__init__()
        if seq_len is None:
            # Backward-compatible default: fall back to the notebook-level constant.
            seq_len = max_len

        self.embedding = nn.Embedding(num_embeddings=56, embedding_dim=25)

        # Conv block 1 (padding keeps the sequence length unchanged).
        self.conv1 = nn.Conv1d(in_channels=25, out_channels=256, kernel_size=11, padding=5)
        self.bn_conv1 = nn.BatchNorm1d(seq_len)
        self.dropout_conv1 = nn.Dropout(p=0.5)

        # Bidirectional LSTM: 2 * 256 = 512 output features per position.
        self.lstm1 = nn.LSTM(input_size=256, hidden_size=256, batch_first=True, bidirectional=True)
        self.bn1 = nn.BatchNorm1d(seq_len)
        self.dropout1 = nn.Dropout(p=0.5)

        self.conv2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=5, padding=2)
        self.bn_conv2 = nn.BatchNorm1d(seq_len)
        self.dropout_conv2 = nn.Dropout(p=0.5)

        self.lstm2 = nn.LSTM(input_size=512, hidden_size=256, batch_first=True, bidirectional=True)
        self.bn2 = nn.BatchNorm1d(seq_len)
        self.dropout2 = nn.Dropout(p=0.5)

        self.conv3 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1)
        self.bn_conv3 = nn.BatchNorm1d(seq_len)
        self.dropout_conv3 = nn.Dropout(p=0.5)

        self.dense1 = nn.Linear(in_features=512, out_features=512)
        self.bn3 = nn.BatchNorm1d(seq_len)

        self.dense2 = nn.Linear(in_features=512, out_features=512)
        self.bn4 = nn.BatchNorm1d(seq_len)

        self.dense3 = nn.Linear(in_features=512, out_features=15)

    @staticmethod
    def _conv_relu(conv, x):
        """Apply ``conv`` followed by ReLU to a (batch, seq_len, channels) tensor.

        Conv1d expects (batch, channels, seq_len), so the tensor is permuted
        before the convolution and permuted back afterwards.
        """
        return F.relu(conv(x.permute(0, 2, 1))).permute(0, 2, 1)

    def forward(self, x):
        """Return per-position scores of shape (batch, seq_len, 15) for token ids ``x``."""
        x = self.embedding(x)

        x = self._conv_relu(self.conv1, x)
        x = self.dropout_conv1(self.bn_conv1(x))

        x, _ = self.lstm1(x)
        x = self.dropout1(self.bn1(x))

        x = self._conv_relu(self.conv2, x)
        x = self.dropout_conv2(self.bn_conv2(x))

        x, _ = self.lstm2(x)
        x = self.dropout2(self.bn2(x))

        x = self._conv_relu(self.conv3, x)
        x = self.dropout_conv3(self.bn_conv3(x))

        x = F.relu(self.bn3(self.dense1(x)))
        x = F.relu(self.bn4(self.dense2(x)))

        return self.dense3(x)
    

class NewModel(Model):
    """``Model`` variant whose batch-norm layers are sized for ``new_max_len``.

    The embedding, convolution, LSTM and dense parameters are frozen
    (``requires_grad = False``); the batch-norm layers are the only modules
    left with trainable parameters. The forward pass is inherited from
    ``Model`` unchanged (the previous override was a line-for-line copy).

    Args:
        new_max_len: Sequence length the replacement batch-norm layers are built for.
        original_model_state_dict: State dict loaded into this model. Loading uses
            the default strict mode, so its batch-norm entries must already have
            ``new_max_len`` features.
    """

    # Batch-norm layers whose size depends on the sequence length.
    _BATCH_NORM_NAMES = ('bn_conv1', 'bn1', 'bn_conv2', 'bn2', 'bn_conv3', 'bn3', 'bn4')
    # Sub-modules whose parameters are excluded from training.
    _FROZEN_NAMES = (
        'embedding',
        'lstm1', 'lstm2',
        'dense1', 'dense2', 'dense3',
        'conv1', 'conv2', 'conv3',
    )

    def __init__(self, new_max_len, original_model_state_dict):
        super().__init__()

        # Override batch norm layers with new max_len
        for bn_name in self._BATCH_NORM_NAMES:
            setattr(self, bn_name, nn.BatchNorm1d(new_max_len))

        # Freeze everything that is not a batch-norm layer.
        for module_name in self._FROZEN_NAMES:
            for param in getattr(self, module_name).parameters():
                param.requires_grad = False

        # Initialize with the original model's state_dict
        self.load_state_dict(original_model_state_dict)

# Load the original model.
# NOTE(security): torch.load on a whole pickled nn.Module executes code stored in the
# file; only load checkpoints that come from a trusted source.
# NOTE: PyTorch >= 2.6 defaults torch.load to weights_only=True, which rejects such
# full-module checkpoints; pass weights_only=False there if loading fails.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# map_location makes a checkpoint that was saved on a GPU loadable on a CPU-only machine.
model = torch.load('model_conv_m.ckpt', map_location=device)

# Swap every sequence-length-dependent batch-norm layer for a freshly initialised one
# sized for new_max_len, so that model.state_dict() has the shapes NewModel expects.
# Earlier experiments that copied or extended the old statistics were left disabled,
# so the new layers keep PyTorch's default initialisation.
for bn_name in ('bn_conv1', 'bn1', 'bn_conv2', 'bn2', 'bn_conv3', 'bn3', 'bn4'):
    setattr(model, bn_name, nn.BatchNorm1d(new_max_len))

# Create the new model with the desired max_len and original model's state_dict
new_model = NewModel(new_max_len, model.state_dict())

# Set the new model to the device
new_model = new_model.to(device)

criterion = nn.CrossEntropyLoss()

# The previously saved 1000-length checkpoint replaces the freshly converted model for
# everything that follows. It is placed on `device` because batch_eval moves its inputs there.
model = torch.load('model_conv_1000.ckpt', map_location=device).to(device)

# Build the optimizer last, over the model that is actually used below. Previously it was
# bound to new_model, which is discarded once `model` is re-assigned, so training steps
# would not have updated the model being evaluated.
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-7)

In [12]:
# Locations of the cleaned corpus splits.
train_dir, val_dir, test_dir = (
    'Dataset/train_clean.txt',
    'Dataset/val_clean.txt',
    'Dataset/test_clean.txt',
)


def _read_corpus(path):
    """Return the whole content of the UTF-8 text file at ``path``."""
    with open(path, 'r', encoding='utf8') as handle:
        return handle.read()


train = _read_corpus(train_dir)
val = _read_corpus(val_dir)
test = _read_corpus(test_dir)

# extract_features comes from util; it is called with the raw text and new_max_len,
# and its result is unpacked as an (inputs, targets) pair.
X_train, y_train = extract_features(train, new_max_len)
X_val, y_val = extract_features(val, new_max_len)
X_test, y_test = extract_features(test, new_max_len)

In [13]:
# Evaluate on a reproducibly shuffled subset made of the first (up to) 50000 test rows.
# Clamping to len(X_test) avoids an IndexError when the test split has fewer rows.
# NOTE: this cell overwrites X_test / y_test, so running it twice permutes them twice.
test_len = min(50000, len(X_test))
np.random.seed(59)  # fixed seed -> same permutation on every fresh run
indices = np.arange(test_len)
np.random.shuffle(indices)
X_test = X_test[indices]
y_test = y_test[indices]

In [14]:
# Fit the symbol -> integer id mapping on the training inputs only.
sentence_encoder = LabelEncoder().fit(X_train.flatten())


def _encode_symbols(raw):
    """Map every entry of ``raw`` through ``sentence_encoder``, keeping its shape, as int16."""
    flat_ids = sentence_encoder.transform(raw.flatten())
    return flat_ids.reshape(raw.shape).astype(np.int16)


X_train = _encode_symbols(X_train)
X_val = _encode_symbols(X_val)
X_test = _encode_symbols(X_test)

In [15]:
def batch_eval(x_np, y_np):
    """Run the global ``model`` over ``x_np`` in mini-batches and return predicted class ids.

    Reads the module-level ``model``, ``device`` and ``batch_size``. ``y_np`` is
    only used to give the result its shape (and int32 dtype).

    Args:
        x_np: Array-like of integer-encoded inputs; converted to an int32 tensor.
        y_np: Array-like of targets whose shape the predictions should have.

    Returns:
        numpy int32 array shaped like ``y_np`` holding, for each position, the
        index of the largest model output along dimension 2.
    """
    features = torch.tensor(x_np, dtype=torch.int32)
    targets = torch.tensor(y_np, dtype=torch.int32)
    model.eval()  # switches the model (dropout / batch norm) to evaluation mode
    predictions = torch.zeros_like(targets)

    with torch.no_grad():
        for start in range(0, len(features), batch_size):
            stop = start + batch_size
            batch = features[start:stop].to(device)
            scores = model(batch)
            # torch.max over dim 2 returns (values, indices); keep the indices.
            predictions[start:stop] = torch.max(scores.data, 2)[1]
            # Drop the references right away so the batch tensors can be released.
            del batch, scores

    return predictions.cpu().numpy()
def compute_DER(X, y):
    """Compute the DER (presumably diacritic error rate) and raw accuracy on ``(X, y)``.

    Predictions come from ``batch_eval``. Positions of ``X`` that hold one of the
    ignored symbols (padding / start / end markers, punctuation, space) are
    excluded from the DER with an explicit mask. The previous version subtracted
    the number of ignored positions from the number of correct predictions,
    which is only right if every ignored position is predicted correctly.

    Args:
        X: Encoded inputs (ids produced by ``sentence_encoder``); assumed to have
            the same shape as ``y``.
        y: Target class ids.

    Returns:
        ``(der, acc_t)``: ``der`` is one minus the accuracy over the non-ignored
        positions; ``acc_t`` is the accuracy over all positions. Each is ``nan``
        when it has no position to score.
    """
    predictions = batch_eval(X, y)
    ignore = {PAD, SOS, EOS, '!', '«', ']', '[', '}', ':', '"', '-', '»', '؛', ')', '،', '؟', '(', '{', '/', ' '}

    # Encoded ids of the ignored symbols.
    ignored_ids = []
    for symbol in ignore:
        try:
            ignored_ids.append(sentence_encoder.transform(np.array([ord(symbol)]))[0])
        except ValueError:
            # The encoder has never seen this symbol, so it cannot occur in an
            # encoded X either; there is nothing to mask for it.
            continue

    matches = predictions == y
    scored = ~np.isin(X, ignored_ids)

    scored_total = int(np.sum(scored))
    if scored_total:
        accuracy = np.sum(matches & scored) / scored_total
    else:
        accuracy = float('nan')

    acc_t = np.sum(matches) / matches.size if matches.size else float('nan')
    return 1 - accuracy, acc_t

In [16]:
# Append the validation split to the training data (row-wise).
X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)

In [17]:
# Store every input / label array as int16.
X_train, y_train, X_test, y_test = (
    array.astype(np.int16) for array in (X_train, y_train, X_test, y_test)
)

In [18]:
# Mini-batch size read as a module-level global by batch_eval; the disabled
# training loop below also uses it.
batch_size = 50

In [19]:
# num_epochs = 20
# for epoch in range(num_epochs):
#     model.train()

#     X_train_tensor = torch.tensor(X_train, dtype=torch.int64).to(device)
#     y_train_tensor = torch.tensor(y_train, dtype=torch.int64).to(device)
#     train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
#     accuracy = 0
#     loss_cum_sum = 0
#     len_ = 0
#     for i, (x, y) in enumerate(train_loader):
#         optimizer.zero_grad()
#         outputs = model(x)
#         loss = criterion(outputs.view(-1, np.unique(y_train).shape[0]), y.view(-1))
#         loss.backward()
#         optimizer.step()
#         _, predicted = torch.max(outputs.data, 2)
#         accuracy += (predicted == y).sum().item()
#         loss_cum_sum += loss.item()
#         len_ += y.size(0) * y.size(1)
#         if (i + 1) % 1 == 0:
#             print('Step [{}/{}], Epoch [{}/{}], Loss: {:.4f}, Accuracy: {:.4f}'.format(i + 1, len(train_loader), epoch + 1, num_epochs, loss_cum_sum / (i + 1), accuracy / len_), end='\r')
#     print()
#     del X_train_tensor, y_train_tensor, train_dataset, train_loader
#     DER, acc_t = compute_DER(X_test, y_test)
#     print('Validation DER: {:.5f}, Validation Accuracy: {:.4f}'.format(DER, acc_t))

In [20]:
# torch.save(model, 'model_conv_1000.ckpt')

In [21]:
# Final evaluation on the test subset; batch_eval reads this batch size.
batch_size = 40
DER, acc_t = compute_DER(X_test, y_test)
for label, value in (('DER = ', DER), ('acc_t = ', acc_t)):
    print(label, value)