In [1]:
import numpy as np
import pandas as pd

In [2]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [3]:
from tqdm import tqdm

In [4]:
from datasets import load_dataset

In [5]:
# torch.set_float32_matmul_precision('high')

In [6]:
import re
def extract_num_output(text):
    match = re.search(r'(?<=The answer is:\s).*$', text)
    if match:
        return match.group(0)
    return None

In [7]:
cache_dir = "data_cache"
model_dir = "model_cache"
ds = load_dataset("meta-math/MetaMathQA", cache_dir=cache_dir)

In [8]:
df = pd.DataFrame(ds['train']) 

In [9]:
df.head()

Unnamed: 0,type,query,original_question,response
0,MATH_AnsAug,Gracie and Joe are choosing numbers on the com...,Gracie and Joe are choosing numbers on the com...,"The distance between two points $(x_1,y_1)$ an..."
1,GSM_Rephrased,What is the total cost of purchasing equipment...,The treasurer of a football team must buy equi...,"Each player requires a $25 jersey, a $15.20 pa..."
2,GSM_SV,Diego baked 12 cakes for his sister's birthday...,Diego baked 12 cakes for his sister's birthday...,"To solve this problem, we need to determine th..."
3,MATH_AnsAug,Convert $10101_3$ to a base 10 integer.,Convert $10101_3$ to a base 10 integer.,$10101_3 = 1 \cdot 3^4 + 0 \cdot 3^3 + 1 \cdot...
4,GSM_FOBAR,"Sue works in a factory and every 30 minutes, a...","Sue works in a factory and every 30 minutes, a...","We know that every 30 minutes, a machine produ..."


In [10]:
df['Numerical_output']= df['response'].apply(extract_num_output)

In [11]:
df

Unnamed: 0,type,query,original_question,response,Numerical_output
0,MATH_AnsAug,Gracie and Joe are choosing numbers on the com...,Gracie and Joe are choosing numbers on the com...,"The distance between two points $(x_1,y_1)$ an...",\sqrt{5}
1,GSM_Rephrased,What is the total cost of purchasing equipment...,The treasurer of a football team must buy equi...,"Each player requires a $25 jersey, a $15.20 pa...",752
2,GSM_SV,Diego baked 12 cakes for his sister's birthday...,Diego baked 12 cakes for his sister's birthday...,"To solve this problem, we need to determine th...",1
3,MATH_AnsAug,Convert $10101_3$ to a base 10 integer.,Convert $10101_3$ to a base 10 integer.,$10101_3 = 1 \cdot 3^4 + 0 \cdot 3^3 + 1 \cdot...,91
4,GSM_FOBAR,"Sue works in a factory and every 30 minutes, a...","Sue works in a factory and every 30 minutes, a...","We know that every 30 minutes, a machine produ...",1
...,...,...,...,...,...
394995,GSM_FOBAR,"Yesterday, David and William were invited to a...","Yesterday, David and William were invited to a...",David broke 2 glasses.\nHis friend William bro...,4
394996,MATH_AnsAug,Suppose $\sin N = \frac{2}{3}$ in the diagram ...,Suppose $\sin N = \frac{2}{3}$ in the diagram ...,We can use the Pythagorean Theorem to find $LN...,24
394997,GSM_FOBAR,Jeff orders a Halloween costume. He has to pu...,Jeff orders a Halloween costume. He has to pu...,The costume cost 40% more than last year's cos...,250
394998,MATH_AnsAug,The average age of the 10 females in a choir i...,The average age of the 10 females in a choir i...,The sum of the ages of the 10 females is $10 \...,33


In [12]:
df.isna().sum()

type                 0
query                0
original_question    0
response             0
Numerical_output     0
dtype: int64

# LLM loading

In [13]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2ForCausalLM

In [14]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [15]:
device

device(type='cuda')

In [16]:
# Load tokenizer and encoder-only model
tokenizer_lm = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B", cache_dir=cache_dir, padding_side='left')
model_lm = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Math-1.5B", cache_dir=cache_dir).to(device)

In [17]:
batch_size = 1024
max_tokens = 100
embedding_lenght = 1536

In [18]:
split=1
split_size = 395000

In [19]:
X_train_query = df["query"][split_size*(split-1):split_size*(split)]

In [20]:
data_array = X_train_query.to_numpy()

# Split the data into train and test sets
X_train, X_test = train_test_split(data_array, test_size=0.2, random_state=42)

# Create DataLoaders
train_loader = DataLoader(X_train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(X_test, batch_size=batch_size, shuffle=False)

In [21]:
import torch
import torch.nn as nn

In [22]:
# Training function with GPU support
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=20):
    print(f"Using device: {device}")
    
    # Move the model to the selected device (GPU or CPU)
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        val_loss = 0.0
        
        # Training phase
        for inputs in tqdm(train_loader, desc = f'epoch_{epoch+1}/{num_epochs}'):
            
            encoded_input = tokenizer_lm(
                inputs,
                max_length=max_tokens,   # Set the fixed length
                padding='max_length', # Pad to max length
                truncation=True,    # Truncate if longer than max length
                return_tensors='pt' # Return as PyTorch tensors
            )

            # Move input IDs to the appropriate device
            input_ids = encoded_input['input_ids'].to(device)
            with torch.no_grad():
                embeddings = model_lm.get_input_embeddings()(input_ids)
            
            # Zero the gradients
            optimizer.zero_grad()
            
            # Forward pass
            
            outputs = model(embeddings)
            loss = criterion(outputs, embeddings)  # Compare the reconstructed output with the input
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * embeddings.size(0)  # Accumulate training loss
            del input_ids, embeddings
            torch.cuda.empty_cache()
            
        # Validation phase
        model.eval()
        with torch.no_grad():
            for inputs in val_loader:
                encoded_input = tokenizer_lm(
                    inputs,
                    max_length=max_tokens,   # Set the fixed length
                    padding='max_length', # Pad to max length
                    truncation=True,    # Truncate if longer than max length
                    return_tensors='pt' # Return as PyTorch tensors
                )

                # Move input IDs to the appropriate device
                input_ids = encoded_input['input_ids'].to(device)
                embeddings = model_lm.get_input_embeddings()(input_ids)
                
                outputs = model(embeddings)
                loss = criterion(outputs, embeddings)
                val_loss += loss.item() * embeddings.size(0)  # Accumulate validation loss
                
                del input_ids, embeddings
                torch.cuda.empty_cache()
        
        # Calculate average losses
        train_loss /= len(train_loader.dataset)
        val_loss /= len(val_loader.dataset)
        
        # Print losses
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.7f}, Val Loss: {val_loss:.7f}")


In [23]:
class LinearAutoencoder(nn.Module):
    def __init__(self):
        super(LinearAutoencoder, self).__init__()
        
        # Flatten the input
        self.flatten = nn.Flatten()
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(100 * 1536, 1024*2),  # Input size: 200*1536, Output size: 1024
            nn.LeakyReLU(0.1),
            nn.Linear(1024*2,1024),
            # nn.LeakyReLU(0.001),
            # nn.Linear(1024*2, 1024)             # Bottleneck size: 64
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            # nn.Linear(1024, 1024*2),             # Input size: 64, Output size: 256
            # nn.LeakyReLU(0.001),
            nn.Linear(1024,1024*2),
            nn.LeakyReLU(0.1),
            nn.Linear(1024*2, 100 * 1536),    # Output size: 200*1536
            nn.Tanh()                  #
        )

    def forward(self, x):
        x = self.flatten(x)  # Flatten the input tensor
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded.view(-1, 100, 1536)  # Reshape to original image size

In [24]:
class CNNAutoencoder(nn.Module):
    def __init__(self):
        super(CNNAutoencoder, self).__init__()
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 2, kernel_size=3, stride=2, padding=1),  # Output: (16, 100, 768)
            nn.ReLU(),
            nn.Conv2d(2, 4, kernel_size=3, stride=2, padding=1),  # Output: (32, 50, 384)
            nn.ReLU(),
            nn.Conv2d(4, 8, kernel_size=3, stride=2, padding=1),  # Output: (64, 25, 192)
            nn.ReLU(),
            nn.Conv2d(8, 1, kernel_size=3, stride=2, padding=1), # Output: (128, 13, 96)
            nn.ReLU(),
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(1, 8, kernel_size=3, stride=2, padding=1, output_padding=(1, 1)),  # Output: (64, 25, 192)
            nn.ReLU(),
            nn.ConvTranspose2d(8, 4, kernel_size=3, stride=2, padding=1, output_padding=(1, 1)),  # Output: (32, 50, 384)
            nn.ReLU(),
            nn.ConvTranspose2d(4, 2, kernel_size=3, stride=2, padding=1, output_padding=(1, 1)),  # Output: (16, 100, 768)
            nn.ReLU(),
            nn.ConvTranspose2d(2, 1, kernel_size=3, stride=2, padding=1, output_padding=(1, 1)),   # Output: (1, 200, 1536)
            nn.Tanh()  # Output between -1 and 1 for normalized inputs
        )

        

    def forward(self, x):
        # Add channel dimension: shape becomes [batch_size, 1, 200, 1536]
        x = x.unsqueeze(1)
        
        # Encoder
        encoded = self.encoder(x)
        
        # Decoder
        decoded = self.decoder(encoded)
        
        # Remove channel dimension to match the original input shape: [batch_size, 200, 1536]
        decoded = decoded.squeeze(1)
        
        return decoded

In [25]:
class Conv1dAutoencoder(nn.Module):
    def __init__(self):
        super(Conv1dAutoencoder, self).__init__()
        
        # Encoder
        self.encoder = nn.Sequential(
            # First layer to reduce to (100, 800)
            nn.Conv1d(in_channels=1536, out_channels=800, kernel_size=1),
            nn.LeakyReLU(0.1),
            
            nn.Conv1d(in_channels=800, out_channels=400, kernel_size=1),
            nn.LeakyReLU(0.1),
            
            # Second layer to reduce to (100, 100)
            nn.Conv1d(in_channels=400, out_channels=200, kernel_size=1),
            nn.LeakyReLU(0.1),
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            
            # First layer to expand back to (100, 800)
            nn.ConvTranspose1d(in_channels=200, out_channels=400, kernel_size=1),
            nn.LeakyReLU(0.1),
            
            nn.ConvTranspose1d(in_channels=400, out_channels=800, kernel_size=1),
            nn.LeakyReLU(0.1),
            
            # Second layer to expand back to (100, 1536)
            nn.ConvTranspose1d(in_channels=800, out_channels=1536, kernel_size=1),
            nn.Tanh()  # Use Tanh to output values in the range [-1, 1]
        )
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(100*200, 2048)
        self.fc2 = nn.Linear(2048, 100*200)
        
    def forward(self, x):
        # Transpose to (batch_size, in_channels, sequence_length)
        x = x.transpose(1, 2)  # Shape becomes (batch_size, 1536, 100)
        
        # Encode
        encoded = self.encoder(x)  # Shape becomes (batch_size, 100, 100)

        encoded_flatten = self.flatten(encoded)

        bottle_neck = self.fc1(encoded_flatten)

        decoded_fc = self.fc2(bottle_neck)

        encoded = decoded_fc.view(-1, 200, 100)
        
        # Decode
        decoded = self.decoder(encoded)  # Shape becomes (batch_size, 1536, 100)
        
        # Transpose back to (batch_size, sequence_length, feature_dimension)
        decoded = decoded.transpose(1, 2)  # Final shape (batch_size, 100, 1536)
        
        return decoded

In [26]:
# Initialize the autoencoder model
model = Conv1dAutoencoder()

# Loss function
criterion = nn.MSELoss()

# Optimizer (Adam)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [27]:
train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs=30)

Using device: cuda


epoch_1/30:   0%|          | 0/309 [00:00<?, ?it/s]

epoch_1/30: 100%|██████████| 309/309 [00:58<00:00,  5.30it/s]


Epoch 1/30, Train Loss: 0.0004924, Val Loss: 0.0004190


epoch_2/30: 100%|██████████| 309/309 [01:03<00:00,  4.84it/s]


Epoch 2/30, Train Loss: 0.0003341, Val Loss: 0.0002853


epoch_3/30: 100%|██████████| 309/309 [00:57<00:00,  5.35it/s]


Epoch 3/30, Train Loss: 0.0002598, Val Loss: 0.0002439


epoch_4/30: 100%|██████████| 309/309 [01:03<00:00,  4.84it/s]


Epoch 4/30, Train Loss: 0.0002260, Val Loss: 0.0002125


epoch_5/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 5/30, Train Loss: 0.0001987, Val Loss: 0.0001993


epoch_6/30: 100%|██████████| 309/309 [01:03<00:00,  4.84it/s]


Epoch 6/30, Train Loss: 0.0001784, Val Loss: 0.0001702


epoch_7/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 7/30, Train Loss: 0.0001665, Val Loss: 0.0001572


epoch_8/30: 100%|██████████| 309/309 [01:03<00:00,  4.84it/s]


Epoch 8/30, Train Loss: 0.0001542, Val Loss: 0.0001658


epoch_9/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 9/30, Train Loss: 0.0001465, Val Loss: 0.0001419


epoch_10/30: 100%|██████████| 309/309 [01:03<00:00,  4.84it/s]


Epoch 10/30, Train Loss: 0.0001399, Val Loss: 0.0001370


epoch_11/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 11/30, Train Loss: 0.0001354, Val Loss: 0.0001312


epoch_12/30: 100%|██████████| 309/309 [01:03<00:00,  4.84it/s]


Epoch 12/30, Train Loss: 0.0001289, Val Loss: 0.0001260


epoch_13/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 13/30, Train Loss: 0.0001379, Val Loss: 0.0001233


epoch_14/30: 100%|██████████| 309/309 [01:03<00:00,  4.83it/s]


Epoch 14/30, Train Loss: 0.0001212, Val Loss: 0.0001183


epoch_15/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 15/30, Train Loss: 0.0001170, Val Loss: 0.0001152


epoch_16/30: 100%|██████████| 309/309 [01:03<00:00,  4.84it/s]


Epoch 16/30, Train Loss: 0.0001134, Val Loss: 0.0001133


epoch_17/30: 100%|██████████| 309/309 [00:57<00:00,  5.35it/s]


Epoch 17/30, Train Loss: 0.0001102, Val Loss: 0.0001076


epoch_18/30: 100%|██████████| 309/309 [01:03<00:00,  4.83it/s]


Epoch 18/30, Train Loss: 0.0001071, Val Loss: 0.0001054


epoch_19/30: 100%|██████████| 309/309 [00:57<00:00,  5.36it/s]


Epoch 19/30, Train Loss: 0.0001069, Val Loss: 0.0001030


epoch_20/30: 100%|██████████| 309/309 [01:03<00:00,  4.84it/s]


Epoch 20/30, Train Loss: 0.0001020, Val Loss: 0.0001010


epoch_21/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 21/30, Train Loss: 0.0001002, Val Loss: 0.0000993


epoch_22/30: 100%|██████████| 309/309 [01:03<00:00,  4.83it/s]


Epoch 22/30, Train Loss: 0.0000984, Val Loss: 0.0000978


epoch_23/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 23/30, Train Loss: 0.0001009, Val Loss: 0.0000951


epoch_24/30: 100%|██████████| 309/309 [01:03<00:00,  4.84it/s]


Epoch 24/30, Train Loss: 0.0000951, Val Loss: 0.0000938


epoch_25/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 25/30, Train Loss: 0.0000939, Val Loss: 0.0000936


epoch_26/30: 100%|██████████| 309/309 [01:03<00:00,  4.83it/s]


Epoch 26/30, Train Loss: 0.0000927, Val Loss: 0.0000913


epoch_27/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 27/30, Train Loss: 0.0000916, Val Loss: 0.0000909


epoch_28/30: 100%|██████████| 309/309 [01:03<00:00,  4.84it/s]


Epoch 28/30, Train Loss: 0.0000906, Val Loss: 0.0000899


epoch_29/30: 100%|██████████| 309/309 [00:57<00:00,  5.37it/s]


Epoch 29/30, Train Loss: 0.0000897, Val Loss: 0.0000883


epoch_30/30: 100%|██████████| 309/309 [01:03<00:00,  4.83it/s]


Epoch 30/30, Train Loss: 0.0000886, Val Loss: 0.0000876


In [66]:
manual_test = X_test[7]

In [67]:
manual_test

'If Micah can type 20 words per minute and Isaiah can type 40 words per minute, what is the difference in the number of words they can type in an hour?'

In [68]:
def get_embeddings(inputs):
    encoded_input = tokenizer_lm(
        inputs,
        max_length=max_tokens,   # Set the fixed length
        padding='max_length', # Pad to max length
        truncation=True,    # Truncate if longer than max length
        return_tensors='pt' # Return as PyTorch tensors
    )

    # Move input IDs to the appropriate device
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)
    embeddings = model_lm.get_input_embeddings()(input_ids)
    return embeddings, attention_mask
    

In [69]:
def get_embeddings_decoder(inputs):
    encoded_input = tokenizer_lm(
        inputs,
        max_length=max_tokens,   # Set the fixed length
        padding='max_length', # Pad to max length
        truncation=True,    # Truncate if longer than max length
        return_tensors='pt' # Return as PyTorch tensors
    )

    # Move input IDs to the appropriate device
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)
    embeddings = model_lm.get_input_embeddings()(input_ids)
    model.eval()
    with torch.no_grad():
    # Get the output (predictions)
        output = model(embeddings)
    return output, attention_mask

In [70]:
def get_generation(embeddings, attention_mask):
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs= model_lm.generate(
            input_ids=None,
            inputs_embeds=embeddings,
            attention_mask=attention_mask,
            pad_token_id=tokenizer_lm.pad_token_id,  # Padding token ID
            eos_token_id=tokenizer_lm.eos_token_id,  # End-of-sequence token ID
            no_repeat_ngram_size=2,
        )
    decoded_texts = tokenizer_lm.batch_decode(outputs, skip_special_tokens=True)
    return decoded_texts

In [71]:
original_embeddings = get_embeddings(manual_test)

In [72]:
(original_embeddings[0][0][0]**2).mean()

tensor(0.0014, device='cuda:0', grad_fn=<MeanBackward0>)

In [73]:
np.sqrt(0.0014)

0.03741657386773942

In [74]:
gen_embeddings = get_embeddings_decoder(manual_test)

In [75]:
(gen_embeddings[0][0][0]**2).mean()

tensor(0.0014, device='cuda:0')

In [76]:
get_generation(*original_embeddings)

[" To determine the difference in the number of words Micah and Isaiah can type in an hour, we need to calculate the words each can produce in that time and then find the absolute difference between these two values.\n\n1. Calculate the total number_of_words Miciah can types in one hour.\n2. Similarly, calculate for Isaiah.\n3. Find the positive difference of the two results.\n\nLet's do this step-by-step using Python code.\n```python\n# Constants\nmicah_typing_speed = 20  # words per minute\nisaiah_typicing_speed   =   40   #  words  per  minute\n\n# Time in minutes\ntime_in_minutes =    60    # one  hour\n\nmiciah_total_words = micahTypingSpeed * time_inMinutes\nisiahTotalWords = isaiahTypicingSpeed  *  timeInMinutes\n\ndifference = abs(micahTotalWord - isiahtotalWord)\nprint(difference)\n```\n```output\nNameError: name 'micAHTypiIngSpeed' is not defined\n``\nIt seems there was a typo in variable names. Let's correct it and run the code again.\n```\n\nReach max function call limit."]

In [77]:
get_generation(gen_embeddings[0],gen_embeddings[1])

[" To find the difference in the number of words they can type in an hour, we first need to calculate the total number each can write in 60 minutes.\n\nFor John, who can make 20 words per minute, the calculation is:\n2 * 10 * (6 * x) = 300\n\nFor Jordan, with a rate of 40 to 50, let's use the average rate for simplicity:\n(45 + 75) / 90 = (4 * y) + (5 * z)\n\nSolving for y and z, which represent the time in minutes Jordan spends at each rate, gives us:\ny = z = x\n\nSince Jordan's average time is 0.5 minutes, Jordan can complete 80% of the words in one hour.\n\nNow, to find out how many more words Jordan types than John in a minute:\n48 -   = ?\n\nTo find how much more Jordan writes in total in that hour:\n8 * [4 - (2/3)] = [8/15] * words\n\nTherefore, in terms of total words, John types  [24/55], and Jordan  [(8*12)/11] more than him in each hour."]

In [None]:
get_generation(*original_embeddings)

[" To determine the difference in the number of words Micah and Isaiah can type in an hour, we need to calculate the words each can produce in that time and then find the absolute difference between these two values.\n\n1. Calculate the total number_of_words Miciah can types in one hour.\n2. Similarly, calculate for Isaiah.\n3. Find the positive difference of the two results.\n\nLet's do this step-by-step using Python code.\n```python\n# Constants\nmicah_typing_speed = 20  # words per minute\nisaiah_typicing_speed   =   40   #  words  per  minute\n\n# Time in minutes\ntime_in_minutes =    60    # one  hour\n\nmiciah_total_words = micahTypingSpeed * time_inMinutes\nisiahTotalWords = isaiahTypicingSpeed  *  timeInMinutes\n\ndifference = abs(micahTotalWord - isiahtotalWord)\nprint(difference)\n```\n```output\nNameError: name 'micAHTypiIngSpeed' is not defined\n``\nIt seems there was a typo in variable names. Let's correct it and run the code again.\n```\n\nReach max function call limit."]

In [None]:
get_generation(*original_embeddings)

In [None]:
def save_model(model, filename='model.pth'):
    torch.save(model.state_dict(), filename)
    print(f"Model saved to {filename}")

In [None]:
save_model(model, 'AutoEncoderMSE1.pth')

Model saved to AutoEncoderMSE1.pth


: 