In [1]:
import numpy as np
import pandas as pd

In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [3]:
from tqdm import tqdm

In [4]:
from adopt import ADOPT


In [5]:
from datasets import load_dataset

In [6]:
# torch.set_float32_matmul_precision('high')

In [7]:
import re
def extract_num_output(text):
    match = re.search(r'(?<=The answer is:\s).*$', text)
    if match:
        return match.group(0)
    return None

In [8]:
cache_dir = "data_cache"
model_dir = "model_cache"
ds = load_dataset("meta-math/MetaMathQA", cache_dir=cache_dir)
sen_ds = load_dataset("sentence-transformers/wikipedia-en-sentences",cache_dir=cache_dir)

In [None]:
df = pd.DataFrame(ds['train']) 
sen_df = sen_ds['train'].to_pandas()

In [None]:
df.head()

Unnamed: 0,type,query,original_question,response
0,MATH_AnsAug,Gracie and Joe are choosing numbers on the com...,Gracie and Joe are choosing numbers on the com...,"The distance between two points $(x_1,y_1)$ an..."
1,GSM_Rephrased,What is the total cost of purchasing equipment...,The treasurer of a football team must buy equi...,"Each player requires a $25 jersey, a $15.20 pa..."
2,GSM_SV,Diego baked 12 cakes for his sister's birthday...,Diego baked 12 cakes for his sister's birthday...,"To solve this problem, we need to determine th..."
3,MATH_AnsAug,Convert $10101_3$ to a base 10 integer.,Convert $10101_3$ to a base 10 integer.,$10101_3 = 1 \cdot 3^4 + 0 \cdot 3^3 + 1 \cdot...
4,GSM_FOBAR,"Sue works in a factory and every 30 minutes, a...","Sue works in a factory and every 30 minutes, a...","We know that every 30 minutes, a machine produ..."


In [None]:
sen_df.head()

Unnamed: 0,sentence
0,"The film stars M. G. Ramachandran, Latha, Anja..."
1,Naarda plenirena is a species of moth in the f...
2,Sponsored by the American Federation of Labor ...
3,Since that election the Belfast Corporation Ac...
4,It was also included on their Best of Volume 1.


In [None]:
df['Numerical_output']= df['response'].apply(extract_num_output)

In [None]:
df

Unnamed: 0,type,query,original_question,response,Numerical_output
0,MATH_AnsAug,Gracie and Joe are choosing numbers on the com...,Gracie and Joe are choosing numbers on the com...,"The distance between two points $(x_1,y_1)$ an...",\sqrt{5}
1,GSM_Rephrased,What is the total cost of purchasing equipment...,The treasurer of a football team must buy equi...,"Each player requires a $25 jersey, a $15.20 pa...",752
2,GSM_SV,Diego baked 12 cakes for his sister's birthday...,Diego baked 12 cakes for his sister's birthday...,"To solve this problem, we need to determine th...",1
3,MATH_AnsAug,Convert $10101_3$ to a base 10 integer.,Convert $10101_3$ to a base 10 integer.,$10101_3 = 1 \cdot 3^4 + 0 \cdot 3^3 + 1 \cdot...,91
4,GSM_FOBAR,"Sue works in a factory and every 30 minutes, a...","Sue works in a factory and every 30 minutes, a...","We know that every 30 minutes, a machine produ...",1
...,...,...,...,...,...
394995,GSM_FOBAR,"Yesterday, David and William were invited to a...","Yesterday, David and William were invited to a...",David broke 2 glasses.\nHis friend William bro...,4
394996,MATH_AnsAug,Suppose $\sin N = \frac{2}{3}$ in the diagram ...,Suppose $\sin N = \frac{2}{3}$ in the diagram ...,We can use the Pythagorean Theorem to find $LN...,24
394997,GSM_FOBAR,Jeff orders a Halloween costume. He has to pu...,Jeff orders a Halloween costume. He has to pu...,The costume cost 40% more than last year's cos...,250
394998,MATH_AnsAug,The average age of the 10 females in a choir i...,The average age of the 10 females in a choir i...,The sum of the ages of the 10 females is $10 \...,33


In [None]:
df.isna().sum()

type                 0
query                0
original_question    0
response             0
Numerical_output     0
dtype: int64

# LLM loading

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2ForCausalLM

In [None]:
torch.set_float32_matmul_precision('high')

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
device

device(type='cuda')

In [None]:
# Load tokenizer and encoder-only model
tokenizer_lm = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B-Instruct", cache_dir="model_cache", padding_side='left')
model_lm = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Math-1.5B-Instruct", cache_dir="model_cache").to(device)

In [None]:
batch_size = 1024
max_tokens = 100
embedding_lenght = 1536

In [None]:
split=1
split_size = 1000000

In [None]:
X_train_query = df["query"]
sen_train = sen_df['sentence'].sample(n=split_size, random_state=42)

In [None]:
data_array = X_train_query.to_numpy()
sen_array = sen_train.to_numpy()

# Split the data into train and test sets
X_train_math, X_test = train_test_split(data_array, test_size=0.2, random_state=42)

In [None]:
X_train = np.concatenate([X_train_math, sen_array])

In [None]:
# Create DataLoaders
train_loader = DataLoader(X_train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(X_test, batch_size=batch_size, shuffle=False)

In [None]:
import torch
import torch.nn as nn

In [None]:
def count_parameters(model: nn.Module):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    non_trainable_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    return trainable_params, non_trainable_params

In [None]:
qwen_embedding_layer=model_lm.get_input_embeddings()

In [None]:
def get_qwen_embeddings(texts):
    template = "<|im_start|>{text}<|im_end|>"
    texts = [template.format(text=text) for text in texts]
    tokens = tokenizer_lm(
            texts,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=100
        ).to(device)
    with torch.no_grad():
        embeddings = qwen_embedding_layer(tokens.input_ids)
    return embeddings, tokens.attention_mask

In [None]:
def get_losses(criteria, outputs, targets, target_attention_mask, weight):
    pad_attention_mask = 1-target_attention_mask

    attention_targets = targets * target_attention_mask.unsqueeze(-1)
    pad_targets = targets * pad_attention_mask.unsqueeze(-1)

    attention_outputs = outputs * target_attention_mask.unsqueeze(-1)
    pad_outputs = outputs * pad_attention_mask.unsqueeze(-1)

    attention_loss = criteria(attention_outputs, attention_targets)
    pad_loss = criteria(pad_outputs, pad_targets)

    weighted_loss = attention_loss * weight + pad_loss * (1-weight)

    return weighted_loss, attention_loss, pad_loss

In [None]:
def save_checkpoint(epoch, model, optimizer, loss, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, "checkpoints/"+path)

In [None]:
# Training function with GPU support
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=20, warm_ups=5, w=0.9):
    
    print(f"Using device: {device}")
    
    best_val_loss = np.inf
    
    # Move the model to the selected device (GPU or CPU)
    model.to(device)
    
    for epoch in range(num_epochs+2*warm_ups):
        model.train()

        train_loss = 0.0
        train_attention_loss = 0.0
        train_pad_loss = 0.0

        val_loss = 0.0
        val_attention_loss = 0.0
        val_pad_loss = 0.0
        
        model_saved_at_epoch = False

        warm_up = epoch < warm_ups
        warm_downs = epoch >= (num_epochs - warm_ups)

        if not(warm_up or warm_downs):
            model.freeze_wrapper()
        else:
            model.freeze_wrapper(False)
        
        # Training phase
        for inputs in tqdm(train_loader, desc = f'epoch_{epoch+1}/{num_epochs}_warm_up_{warm_up}_warm_down_{warm_downs}'):
            
            embeddings, attention_mask = get_qwen_embeddings(inputs)

            # Zero the gradients
            optimizer.zero_grad()
            
            # Forward pass
            
            outputs = model(embeddings,embeddings, attention_mask=attention_mask, short_circuit=warm_up)
            weighted_loss, attention_loss, pad_loss = get_losses(criterion, outputs, embeddings, attention_mask, w)

            if torch.isnan(weighted_loss) or torch.isinf(weighted_loss):
                print(f"Numerical instability detected in training. Skipping this batch.")
                continue
            
            # Backward pass and optimization
            weighted_loss.backward()
            optimizer.step()
            
            train_loss += weighted_loss.item() * embeddings.size(0)  # Accumulate training loss
            train_attention_loss += attention_loss.item() * embeddings.size(0)
            train_pad_loss += pad_loss.item() * embeddings.size(0)
            
        # Validation phase
        model.eval()
        with torch.no_grad():
            for inputs in tqdm(val_loader):
                embeddings, attention_mask = get_qwen_embeddings(inputs)

                outputs = model(embeddings,embeddings, attention_mask=attention_mask, short_circuit=warm_up)
                weighted_loss, attention_loss, pad_loss = get_losses(criterion, outputs, embeddings, attention_mask, w)

                val_loss += weighted_loss.item() * embeddings.size(0)  # Accumulate validation loss
                val_attention_loss += attention_loss.item() * embeddings.size(0)
                val_pad_loss += pad_loss.item() * embeddings.size(0)
                
        
        # Calculate average losses
        train_loss /= len(train_loader.dataset)
        train_attention_loss /= len(train_loader.dataset)
        train_pad_loss /= len(train_loader.dataset)

        val_loss /= len(val_loader.dataset)
        val_attention_loss /= len(val_loader.dataset)
        val_pad_loss /= len(val_loader.dataset)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_checkpoint(epoch, model, optimizer, best_val_loss, "best_model.pth")
            print(f"Best model saved with loss: {best_val_loss:.7f} at epoch {epoch+1}/{num_epochs+2*warm_ups} warm_up_{warm_up} warm_down_{warm_downs}")
            model_saved_at_epoch = True
        
        # Print losses
        log_line = f"Epoch {epoch+1}/{num_epochs+2*warm_ups}, Train Loss: {train_loss:.7f}, Val Loss: {val_loss:.7f}, Train Attention Loss: {train_attention_loss:.7f}, Val Attention Loss: {val_attention_loss:.7f}, Train Pad Loss: {train_pad_loss:.7f}, Val Pad Loss: {val_pad_loss:.7f}"
        print(log_line)
        with open("logs/logs.txt", "a") as log_file:
            log_file.write(log_line + f" model_saved {model_saved_at_epoch}"+ "\n")

In [None]:
class LinearAutoencoder(nn.Module):
    def __init__(self):
        super(LinearAutoencoder, self).__init__()
        
        # Flatten the input
        self.flatten = nn.Flatten()
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(100 * 1536, 1024*2),  # Input size: 200*1536, Output size: 1024
            nn.LeakyReLU(0.1),
            nn.Linear(1024*2,1024),
            # nn.LeakyReLU(0.001),
            # nn.Linear(1024*2, 1024)             # Bottleneck size: 64
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            # nn.Linear(1024, 1024*2),             # Input size: 64, Output size: 256
            # nn.LeakyReLU(0.001),
            nn.Linear(1024,1024*2),
            nn.LeakyReLU(0.1),
            nn.Linear(1024*2, 100 * 1536),    # Output size: 200*1536
            nn.Tanh()                  #
        )

    def forward(self, x):
        x = self.flatten(x)  # Flatten the input tensor
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded.view(-1, 100, 1536)  # Reshape to original image size

In [None]:
class CNNAutoencoder(nn.Module):
    def __init__(self):
        super(CNNAutoencoder, self).__init__()
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 2, kernel_size=3, stride=2, padding=1),  # Output: (16, 100, 768)
            nn.ReLU(),
            nn.Conv2d(2, 4, kernel_size=3, stride=2, padding=1),  # Output: (32, 50, 384)
            nn.ReLU(),
            nn.Conv2d(4, 8, kernel_size=3, stride=2, padding=1),  # Output: (64, 25, 192)
            nn.ReLU(),
            nn.Conv2d(8, 1, kernel_size=3, stride=2, padding=1), # Output: (128, 13, 96)
            nn.ReLU(),
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(1, 8, kernel_size=3, stride=2, padding=1, output_padding=(1, 1)),  # Output: (64, 25, 192)
            nn.ReLU(),
            nn.ConvTranspose2d(8, 4, kernel_size=3, stride=2, padding=1, output_padding=(1, 1)),  # Output: (32, 50, 384)
            nn.ReLU(),
            nn.ConvTranspose2d(4, 2, kernel_size=3, stride=2, padding=1, output_padding=(1, 1)),  # Output: (16, 100, 768)
            nn.ReLU(),
            nn.ConvTranspose2d(2, 1, kernel_size=3, stride=2, padding=1, output_padding=(1, 1)),   # Output: (1, 200, 1536)
            nn.Tanh()  # Output between -1 and 1 for normalized inputs
        )

        

    def forward(self, x):
        # Add channel dimension: shape becomes [batch_size, 1, 200, 1536]
        x = x.unsqueeze(1)
        
        # Encoder
        encoded = self.encoder(x)
        
        # Decoder
        decoded = self.decoder(encoded)
        
        # Remove channel dimension to match the original input shape: [batch_size, 200, 1536]
        decoded = decoded.squeeze(1)
        
        return decoded

In [None]:
class Conv1dAutoencoder(nn.Module):
    def __init__(self):
        super(Conv1dAutoencoder, self).__init__()
        
        # Encoder
        self.encoder = nn.Sequential(
            # First layer to reduce to (100, 800)
            nn.Conv1d(in_channels=1536, out_channels=1536//2, kernel_size=1),
            nn.BatchNorm1d(1536//2),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            
            nn.Conv1d(in_channels=1536//2, out_channels=1536//4, kernel_size=1),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            
            # Second layer to reduce to (100, 100)
            nn.Conv1d(in_channels=1536//4, out_channels=200, kernel_size=1),
            nn.LeakyReLU(0.1),
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            
            # First layer to expand back to (100, 800)
            nn.ConvTranspose1d(in_channels=200, out_channels=1536//4, kernel_size=1),
            nn.BatchNorm1d(1536//4),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            
            nn.ConvTranspose1d(in_channels=1536//4, out_channels=1536//2, kernel_size=1),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            
            # Second layer to expand back to (100, 1536)
            nn.ConvTranspose1d(in_channels=1536//2, out_channels=1536, kernel_size=1),
            nn.Tanh()  # Use Tanh to output values in the range [-1, 1]
        )
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(100*200, 2048)
        self.fc2 = nn.Linear(2048, 100*200)
        
    def forward(self, x):
        # Transpose to (batch_size, in_channels, sequence_length)
        x = x.transpose(1, 2)  # Shape becomes (batch_size, 1536, 100)
        
        # Encode
        encoded = self.encoder(x)  # Shape becomes (batch_size, 100, 100)

        encoded_flatten = self.flatten(encoded)

        bottle_neck = self.fc1(encoded_flatten)

        decoded_fc = self.fc2(bottle_neck)

        encoded = decoded_fc.view(-1, 200, 100)
        
        # Decode
        decoded = self.decoder(encoded)  # Shape becomes (batch_size, 1536, 100)
        
        # Transpose back to (batch_size, sequence_length, feature_dimension)
        decoded = decoded.transpose(1, 2)  # Final shape (batch_size, 100, 1536)
        
        return decoded

In [None]:
class LSTMEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(LSTMEncoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
    
    def forward(self, x, attention_mask=None):
        if attention_mask is not None:
            x = x* attention_mask.unsqueeze(-1)
        # x shape: (batch_size, seq_len, input_size)
        _, (_, cell) = self.lstm(x)
        # cell shape: (num_layers, batch_size, hidden_size)
        return cell

In [None]:
class LSTMDecoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers, dropout):
        super(LSTMDecoder, self).__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(output_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, output_size),
            nn.Tanh()
        )
    
    def forward(self, x, prev_state):
        # x shape: (batch_size, 1, output_size)
        # prev_state is a tuple of (hidden, cell) with shape (num_layers, batch_size, hidden_size)
        output, (hidden, cell) = self.lstm(x, prev_state)
        # output shape: (batch_size, 1, hidden_size)
        # new_cell shape: (num_layers, batch_size, hidden_size)
        output = self.fc(output[:, -1, :])
        # output shape: (batch_size, output_size)
        return output, (hidden, cell)

In [None]:
class LSTMSeq2Seq(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size, max_tokens, num_layers, dropout=0):
        super(LSTMSeq2Seq, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.max_tokens = max_tokens
        self.dropout = dropout
        self.encoder = LSTMEncoder(self.input_size, self.hidden_size, self.num_layers, self.dropout)
        self.decoder = LSTMDecoder(self.output_size, self.hidden_size, self.num_layers, self.dropout)
    
    def forward(self, source, target=None, attention_mask=None, teacher_forcing_ratio=0.5):
        batch_size = source.size(0)
        device = source.device

        cell = self.encoder(source,attention_mask=attention_mask)

        decoder_input = torch.zeros(batch_size, 1, self.output_size, device=device)
        hidden = torch.zeros(self.decoder.num_layers, batch_size, self.decoder.hidden_size, device=device)
        
        outputs = []

        # Teacher forcing
        for t in range(self.max_tokens):
            decoder_output, (hidden, cell) = self.decoder(decoder_input, (hidden, cell))
            outputs.append(decoder_output)
            
            if target is not None:
                # Teacher forcing
                teacher_force = torch.rand(1).item() < teacher_forcing_ratio
                decoder_input = target[:, t].unsqueeze(1) if teacher_force else decoder_output.unsqueeze(1)
            else:
                # Inference mode
                decoder_input = decoder_output.unsqueeze(1)
        
        outputs = torch.stack(outputs, dim=1)
        # outputs shape: (batch_size, target_len, output_size)
        
        return outputs

In [None]:
class AdvancedSeqDimReducer(nn.Module):
    def __init__(self, input_dim, target_dim, kernel_size=1):
        super(AdvancedSeqDimReducer, self).__init__()
        self.kernel_size = kernel_size
        self.padding_size = (kernel_size - 1) // 2
        
        self.reducer = nn.Sequential(
            nn.Conv1d(input_dim, input_dim//2, kernel_size=self.kernel_size, padding=self.padding_size),
            nn.BatchNorm1d(input_dim//2),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            
            nn.Conv1d(input_dim//2, input_dim//4, kernel_size=self.kernel_size, padding=self.padding_size),
            # nn.BatchNorm1d(input_dim//4),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            
            nn.Conv1d(input_dim//4, target_dim, kernel_size=self.kernel_size, padding=self.padding_size),
            nn.LeakyReLU(0.1),
        )
    
    def forward(self, x):
        x = x.transpose(1, 2)
        x = self.reducer(x)
        return x.transpose(1, 2)

In [None]:
class AdvancedSeqReconstructor(nn.Module):
    def __init__(self, compressed_dim, target_dim, kernel_size):
        super(AdvancedSeqReconstructor, self).__init__()
        self.kernel_size = kernel_size
        self.padding_size = (kernel_size - 1) // 2
        
        self.reconstructor = nn.Sequential(
            # First upsampling: compressed_dim → target_dim//4
            nn.ConvTranspose1d(compressed_dim, target_dim//4, kernel_size=self.kernel_size, padding=self.padding_size),
            nn.BatchNorm1d(target_dim//4),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            
            # Second upsampling: target_dim//4 → target_dim//2
            nn.ConvTranspose1d(target_dim//4, target_dim//2, kernel_size=self.kernel_size, padding=self.padding_size),
            # nn.BatchNorm1d(target_dim//2),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            
            # Final upsampling: target_dim//2 → target_dim
            nn.ConvTranspose1d(target_dim//2, target_dim, kernel_size=self.kernel_size, padding=self.padding_size),
            nn.Tanh(),
        )

    def forward(self, x):
        # Transpose for ConvTranspose1d operation
        x = x.transpose(1, 2)  # (batch_size, compressed_dim, sequence_length)
        
        # Apply reconstruction
        x = self.reconstructor(x)
        
        # Transpose back to original format
        return x.transpose(1, 2)  # (batch_size, sequence_length, target_dim)

In [None]:
class CNNWrapper(nn.Module):
    def __init__(self, input_dim,compressed_dim, hidden_dim, target_dim, kernel_size, num_layers, max_tokens,dropout=0):
        super(CNNWrapper, self).__init__()
        self.kernel_size = kernel_size
        self.padding = (kernel_size - 1) / 2
        self.encoder = AdvancedSeqDimReducer(input_dim,compressed_dim, kernel_size)
        self.decoder = AdvancedSeqReconstructor(compressed_dim,target_dim, kernel_size)
        self.model = LSTMSeq2Seq(compressed_dim, hidden_dim, compressed_dim, max_tokens, num_layers, dropout)
    
    def forward(self, source, target=None, attention_mask=None, teacher_forcing_ratio=0.5, short_circuit=False):
        if short_circuit:
            encoded = self.encoder(source)
            decoded = self.decoder(encoded)
            return decoded
        else:
            compressed = self.encoder(source)
            if target is not None:
                outputs = self.model(compressed, compressed, attention_mask, teacher_forcing_ratio)
            else:
                outputs = self.model(compressed, None, attention_mask, teacher_forcing_ratio)
            reconstructed = self.decoder(outputs)
            return reconstructed
        
    def freeze_wrapper(self,freeze=True):
        for param in self.encoder.parameters():
            param.requires_grad = not freeze
        for param in self.decoder.parameters():
            param.requires_grad = not freeze
        


In [None]:
model = CNNWrapper(1536, 200, 768, 1536, 1, 1, max_tokens, 0)

# Loss function
criterion = nn.MSELoss()

# Optimizer (Adam)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)

In [None]:
count_parameters(model)

(9222544, 0)

In [None]:
train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs=30)

Using device: cuda


epoch_1/30_warm_up_True_warm_down_False_loss_0.0000000:   0%|          | 0/1286 [00:00<?, ?it/s]

epoch_1/30_warm_up_True_warm_down_False_loss_0.0000000:  30%|██▉       | 381/1286 [01:04<02:33,  5.88it/s]


KeyboardInterrupt: 

: 

In [None]:
manual_test = X_test[7]

In [None]:
manual_test

'If Micah can type 20 words per minute and Isaiah can type 40 words per minute, what is the difference in the number of words they can type in an hour?'

In [None]:
def get_embeddings(inputs):
    encoded_input = tokenizer_lm(
        inputs,
        max_length=max_tokens,   # Set the fixed length
        padding='max_length', # Pad to max length
        truncation=True,    # Truncate if longer than max length
        return_tensors='pt' # Return as PyTorch tensors
    )

    # Move input IDs to the appropriate device
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)
    embeddings = model_lm.get_input_embeddings()(input_ids)
    return embeddings, attention_mask
    

In [None]:
def get_embeddings_decoder(inputs):
    encoded_input = tokenizer_lm(
        inputs,
        max_length=max_tokens,   # Set the fixed length
        padding='max_length', # Pad to max length
        truncation=True,    # Truncate if longer than max length
        return_tensors='pt' # Return as PyTorch tensors
    )

    # Move input IDs to the appropriate device
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)
    embeddings = model_lm.get_input_embeddings()(input_ids)
    model.eval()
    with torch.no_grad():
    # Get the output (predictions)
        output = model(embeddings)
    return output, attention_mask

In [None]:
def get_generation(embeddings, attention_mask):
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs= model_lm.generate(
            input_ids=None,
            inputs_embeds=embeddings,
            attention_mask=attention_mask,
            pad_token_id=tokenizer_lm.pad_token_id,  # Padding token ID
            eos_token_id=tokenizer_lm.eos_token_id,  # End-of-sequence token ID
            no_repeat_ngram_size=2,
        )
    decoded_texts = tokenizer_lm.batch_decode(outputs, skip_special_tokens=True)
    return decoded_texts

In [None]:
original_embeddings = get_embeddings(manual_test)

In [None]:
(original_embeddings[0][0][0]**2).mean()

tensor(0.0007, device='cuda:0', grad_fn=<MeanBackward0>)

In [None]:
np.sqrt(0.0014)

0.03741657386773942

In [None]:
gen_embeddings = get_embeddings_decoder(manual_test)

TypeError: CNNWrapper.forward() missing 1 required positional argument: 'target'

In [None]:
(gen_embeddings[0][0][0]**2).mean()

tensor(0.0014, device='cuda:0')

In [None]:
get_generation(*original_embeddings)

[" To determine the difference in the number of words Micah and Isaiah can type in an hour, we need to calculate the words each can produce in that time and then find the absolute difference between these two values.\n\n1. Calculate the total number_of_words Miciah can types in one hour.\n2. Similarly, calculate for Isaiah.\n3. Find the positive difference of the two results.\n\nLet's do this step-by-step using Python code.\n```python\n# Constants\nmicah_typing_speed = 20  # words per minute\nisaiah_typicing_speed   =   40   #  words  per  minute\n\n# Time in minutes\ntime_in_minutes =    60    # one  hour\n\nmiciah_total_words = micahTypingSpeed * time_inMinutes\nisiahTotalWords = isaiahTypicingSpeed  *  timeInMinutes\n\ndifference = abs(micahTotalWord - isiahtotalWord)\nprint(difference)\n```\n```output\nNameError: name 'micAHTypiIngSpeed' is not defined\n``\nIt seems there was a typo in variable names. Let's correct it and run the code again.\n```\n\nReach max function call limit."]

In [None]:
get_generation(gen_embeddings[0],gen_embeddings[1])

[" To find the difference in the number of words they can type in an hour, we first need to calculate the total number each can write in 60 minutes.\n\nFor John, who can make 20 words per minute, the calculation is:\n2 * 10 * (6 * x) = 300\n\nFor Jordan, with a rate of 40 to 50, let's use the average rate for simplicity:\n(45 + 75) / 90 = (4 * y) + (5 * z)\n\nSolving for y and z, which represent the time in minutes Jordan spends at each rate, gives us:\ny = z = x\n\nSince Jordan's average time is 0.5 minutes, Jordan can complete 80% of the words in one hour.\n\nNow, to find out how many more words Jordan types than John in a minute:\n48 -   = ?\n\nTo find how much more Jordan writes in total in that hour:\n8 * [4 - (2/3)] = [8/15] * words\n\nTherefore, in terms of total words, John types  [24/55], and Jordan  [(8*12)/11] more than him in each hour."]

In [None]:
get_generation(*original_embeddings)

[" To determine the difference in the number of words Micah and Isaiah can type in an hour, we need to calculate the words each can produce in that time and then find the absolute difference between these two values.\n\n1. Calculate the total number_of_words Miciah can types in one hour.\n2. Similarly, calculate for Isaiah.\n3. Find the positive difference of the two results.\n\nLet's do this step-by-step using Python code.\n```python\n# Constants\nmicah_typing_speed = 20  # words per minute\nisaiah_typicing_speed   =   40   #  words  per  minute\n\n# Time in minutes\ntime_in_minutes =    60    # one  hour\n\nmiciah_total_words = micahTypingSpeed * time_inMinutes\nisiahTotalWords = isaiahTypicingSpeed  *  timeInMinutes\n\ndifference = abs(micahTotalWord - isiahtotalWord)\nprint(difference)\n```\n```output\nNameError: name 'micAHTypiIngSpeed' is not defined\n``\nIt seems there was a typo in variable names. Let's correct it and run the code again.\n```\n\nReach max function call limit."]

In [None]:
get_generation(*original_embeddings)

In [None]:
def save_model(model, filename='model.pth'):
    torch.save(model.state_dict(), filename)
    print(f"Model saved to {filename}")

In [None]:
save_model(model, 'AutoEncoderMSE1.pth')

Model saved to AutoEncoderMSE1.pth


: 

In [None]:
# !gcloud compute instances stop ndr-a100-spot --zone us-central1-a