<a href="https://colab.research.google.com/github/Narmathan56/AI-Math-Tutor/blob/main/Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import pandas as pd
import torch
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from torchtext.vocab import vocab as torch_vocab
import matplotlib.pyplot as plt


# Step 1: Text cleaning
# Before cleaning, turn any raw ``bytes`` cells into ``str`` across every split.
def _decode_if_bytes(value):
    """Return *value* decoded as UTF-8 when it is ``bytes``; otherwise unchanged."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


X_train = X_train.apply(_decode_if_bytes)
y_train = y_train.apply(_decode_if_bytes)
X_test = X_test.apply(_decode_if_bytes)
y_test = y_test.apply(_decode_if_bytes)

def clean_text(text):
    """Normalize a raw question/answer value into lowercase plain text.

    Processing steps, in order:

    1. Missing values (``None`` / NaN / ``pd.NA``) become ``""``; ``bytes`` are
       decoded as UTF-8; any other non-string scalar is converted with ``str``.
    2. Quoted label strings are collected from ``label(..., "...")`` calls,
       then every ``[asy]...[/asy]`` block is removed.
    3. ``$...$`` delimiters, ``\\cmd{...}`` wrappers and remaining backslashes
       are stripped.
    4. The operators ``+ - * / = ^`` are surrounded by single spaces,
       parentheses are made tight, whitespace is collapsed, and the text is
       lowercased.
    5. If any labels were collected, a "function machine" sentence built from
       them is attached to the text.

    Args:
        text: The raw cell value (``str``, ``bytes``, ``None``/NaN, or another
            scalar such as a number).

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    if text is None or pd.isna(text):
        return ""

    if isinstance(text, bytes):
        text = text.decode("utf-8")
    elif not isinstance(text, str):
        # Numbers and other scalars would otherwise make the regex calls raise.
        text = str(text)

    # Extract labels from the Asymptote block before it is removed.
    asy_labels = re.findall(r'label\([^,]+,"([^"]+)"', text)

    # Remove every [asy] block (DOTALL: blocks normally span several lines).
    text = re.sub(r'\[asy\].*?\[/asy\]', '', text, flags=re.DOTALL)

    # Remove LaTeX math delimiters ($...$), command wrappers such as \mbox{},
    # and any backslashes that are left over.
    text = re.sub(r'\$\\?([^$]+)\\?\$', r'\1', text)
    text = re.sub(r'\\\w+\{(.*?)\}', r'\1', text)
    text = text.replace('\\', '')

    # Space out math operators, then pull parentheses tight again.
    # NOTE(review): tightening also removes the space between a word and a
    # following "(" ("is (3" -> "is(3"); kept as in the original pipeline.
    text = re.sub(r'\s*([+\-*/=()^])\s*', r' \1 ', text)
    text = re.sub(r'\s*\(\s*', '(', text)
    text = re.sub(r'\s*\)\s*', ')', text)

    # Collapse whitespace only AFTER the operator pass: that pass can leave
    # doubled spaces between adjacent operators and a leading/trailing space
    # when the text starts or ends with an operator.
    text = re.sub(r'\s+', ' ', text).strip().lower()

    if not asy_labels:
        return text

    def _is_io_label(label):
        # Case-insensitive so "Input = 10" is recognised like "input = 10".
        lowered = label.lower()
        return "input" in lowered or "output" in lowered

    logic_steps = [label.lower() for label in asy_labels if not _is_io_label(label)]
    logic_sentence = (
        " the machine " + ", then ".join(logic_steps) + "." if logic_steps else ""
    )

    # Only labels of the form "input = <value>" carry a value; an input label
    # without "=" is skipped instead of raising IndexError.
    input_value = next(
        (
            label.split('=')[1].strip()
            for label in asy_labels
            if "input" in label.lower() and "=" in label
        ),
        None,
    )

    if input_value:
        return (
            f"in the function machine, the input is {input_value}. "
            f"{text}{logic_sentence} what is the output?"
        )
    return f"{text}{logic_sentence} what is the output?"

# Step 2: Clean datasets
# Run every split through the clean_text function defined above. The target
# columns are cast to str first so non-string answers are cleaned as text.
X_train_cleaned = X_train.apply(clean_text)
y_train_cleaned = y_train.astype(str).apply(clean_text)
X_test_cleaned = X_test.apply(clean_text)
y_test_cleaned = y_test.astype(str).apply(clean_text)

print("✅ Sample Cleaned Output:", X_train_cleaned.iloc[0 :100])
print("✅ Sample Cleaned Output:", y_train_cleaned.iloc[0])
# Report the type of the CLEANED value, matching the printed label (the
# original inspected the raw X_train column instead).
print("X_train_cleaned_type",type(X_train_cleaned.iloc[0]))

# Step 3: Tokenization
# Each cleaned string becomes a list of tokens via torchtext's
# "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english")
X_train_tokens = X_train_cleaned.apply(tokenizer)
y_train_tokens = y_train_cleaned.apply(tokenizer)
X_test_tokens = X_test_cleaned.apply(tokenizer)
y_test_tokens = y_test_cleaned.apply(tokenizer)

# Step 4: Build vocab from all tokens
def yield_tokens(*data_parts):
    """Yield every item of every given collection, in argument order.

    Each positional argument is an iterable (here: a column of token lists);
    its items are yielded one by one before moving on to the next argument.
    """
    for collection in data_parts:
        yield from collection

# Build one shared vocabulary over the source and target token lists, capped
# at 5000 entries, with the four special tokens registered.
# NOTE(review): the test splits also feed the vocabulary here — confirm this
# is intended rather than building it from the training splits only.
vocab = build_vocab_from_iterator(
    yield_tokens(
        X_train_tokens,
        y_train_tokens,
        X_test_tokens,
        y_test_tokens,
    ),
    max_tokens=5000,
    specials=["<unk>", "<pad>", "<bos>", "<eos>"],
)

# Tokens missing from the vocabulary resolve to the <unk> index.
vocab.set_default_index(vocab["<unk>"])

# Step 5: Numericalize
def numericalize(token_list, add_bos_eos=False):
    """Convert a list of tokens into a 1-D tensor of vocabulary ids.

    Args:
        token_list: List of string tokens to look up in the module-level
            ``vocab``.
        add_bos_eos: When true, ``<bos>`` is prepended and ``<eos>`` appended
            before the lookup.

    Returns:
        A 1-D ``torch.long`` tensor with one id per token.
    """
    if add_bos_eos:
        token_list = ['<bos>'] + token_list + ['<eos>']
    # The dtype is pinned because torch.tensor([]) would otherwise infer
    # float32 for an empty token list, producing an id tensor whose dtype
    # differs from every other sequence.
    return torch.tensor([vocab[token] for token in token_list], dtype=torch.long)

# Source sequences are numericalized as-is; target sequences additionally get
# the <bos>/<eos> markers.
X_train_ids = X_train_tokens.apply(numericalize)
y_train_ids = y_train_tokens.apply(lambda tokens: numericalize(tokens, add_bos_eos=True))
X_test_ids = X_test_tokens.apply(numericalize)
y_test_ids = y_test_tokens.apply(lambda tokens: numericalize(tokens, add_bos_eos=True))

# Step 6: Padding
def pad_sequences(sequences, max_len, pad_value=None):
    """Pad or truncate every sequence to exactly ``max_len`` elements.

    Args:
        sequences: Iterable of 1-D tensors (or objects accepted by
            ``torch.tensor``) holding token ids.
        max_len: Target length. Shorter sequences are right-padded, longer
            ones are cut to their first ``max_len`` elements.
        pad_value: Id used for padding. Defaults to ``vocab["<pad>"]`` from
            the module-level vocabulary when left as ``None``.

    Returns:
        A list of 1-D tensors, each of length ``max_len``, in input order.
    """
    # Resolve the pad id once instead of on every iteration.
    if pad_value is None:
        pad_value = vocab["<pad>"]

    padded_sequences = []
    for seq in sequences:
        # Ensure seq is a tensor before padding.
        if not isinstance(seq, torch.Tensor):
            seq = torch.tensor(seq)

        missing = max_len - seq.size(0)
        if missing > 0:
            # (0, missing): add nothing on the left, `missing` ids on the right.
            padded_seq = pad(seq, (0, missing), value=pad_value)
        else:
            padded_seq = seq[:max_len]
        padded_sequences.append(padded_seq)

    return padded_sequences


# Steps 6-7: pad/truncate each split to a fixed length (10 ids for sources,
# 30 for targets) and stack the per-example tensors into one 2-D tensor.
X_train_tensor = torch.stack(pad_sequences(X_train_ids.tolist(), max_len=10))
y_train_tensor = torch.stack(pad_sequences(y_train_ids.tolist(), max_len=30))
X_test_tensor = torch.stack(pad_sequences(X_test_ids.tolist(), max_len=10))
y_test_tensor = torch.stack(pad_sequences(y_test_ids.tolist(), max_len=30))


# Step 8: Report the resulting shapes, the vocabulary size and one example.
print("✅ Preprocessing Done!")
print("X_train_tensor shape:", X_train_tensor.shape)
print("y_train_tensor shape:", y_train_tensor.shape)
print("X_test_tensor shape:", X_test_tensor.shape)
print("y_test_tensor shape:", y_test_tensor.shape)
print("📚 Vocabulary size:", len(vocab))
print(X_train_tensor[0])
print(y_train_tensor[0])

✅ Sample Cleaned Output: 0                what is 8 + 8?
1                      7 plus 0
2              calculate 3 + 8.
3                      5 plus 1
4                         0 + 5
                 ...           
100        can you add 8 and 0?
101                add 5 and 3.
102                    9 plus 9
103                       9 + 1
104    find the sum of 4 and 5.
Name: question, Length: 100, dtype: object
✅ Sample Cleaned Output: step 1: add the ones → 8 + 8 = 16. so, the answer is 16. answer: 16
X_train_cleaned_type <class 'str'>
✅ Preprocessing Done!
X_train_tensor shape: torch.Size([647, 10])
y_train_tensor shape: torch.Size([647, 30])
X_test_tensor shape: torch.Size([647, 10])
y_test_tensor shape: torch.Size([647, 30])
📚 Vocabulary size: 51
tensor([31, 10, 17,  8, 17, 25,  1,  1,  1,  1])
tensor([ 2, 16,  7,  9,  5, 13, 15, 17,  8, 17, 12, 48,  4, 14, 11,  5,  6, 10,
        48,  4,  6, 48,  3,  1,  1,  1,  1,  1,  1,  1])
