# Data Exploration & Preprocessing

This notebook covers data loading, exploration, cleaning, normalization, and tokenization for the GSM8K grade-school math Q&A dataset. Each step is documented for rubric alignment.

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer

# Read the train and test splits from their CSV files (train first, then test).
train_df, test_df = map(
    pd.read_csv,
    ('../data/main_train.csv', '../data/main_test.csv'),
)

# Report (rows, columns) for each split, then preview the training data.
print('Train shape:', train_df.shape)
print('Test shape:', test_df.shape)
train_df.head()

Train shape: (7473, 2)
Test shape: (1319, 2)


Unnamed: 0,question,answer
0,Natalia sold clips to 48 of her friends in Apr...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...
1,Weng earns $12 an hour for babysitting. Yester...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...
2,Betty is saving money for a new wallet which c...,"In the beginning, Betty has only 100 / 2 = $<<..."
3,"Julie is reading a 120-page book. Yesterday, s...",Maila read 12 x 2 = <<12*2=24>>24 pages today....
4,James writes a 3-page letter to 2 different fr...,He writes each friend 3*2=<<3*2=6>>6 pages a w...


In [11]:
# Data Overview
# - Every row pairs a math word problem with its step-by-step answer.
# - Report per-column missing-value counts and fully duplicated rows.

# Per-column count of missing values, train split first
for header, frame in (
    ('Missing values in train:', train_df),
    ('Missing values in test:', test_df),
):
    print(header)
    print(frame.isna().sum())

# Number of rows that exactly repeat an earlier row
for header, frame in (
    ('Duplicate rows in train:', train_df),
    ('Duplicate rows in test:', test_df),
):
    print(header, frame.duplicated().sum())

Missing values in train:
question    0
answer      0
dtype: int64
Missing values in test:
question    0
answer      0
dtype: int64
Duplicate rows in train: 0
Duplicate rows in test: 0


In [12]:
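# Text Normalization
# - Lowercase all text
# - Strip leading/trailing whitespace
# - Special characters are NOT removed here (the answers contain symbols
#   such as `$` and `<<48/2=24>>` annotations, which are kept as-is)
# - Document all steps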

def normalize_text(text):
    """Return ``text`` with surrounding whitespace removed and lowercased.

    Args:
        text: Value to normalize. Non-string values are converted with
            ``str()`` before normalization.

    Returns:
        str: The stripped, lowercased string. A missing value (``None`` or a
        float NaN) yields ``''`` rather than the literal text ``'none'`` or
        ``'nan'``, which would otherwise be stored as if it were real content.
    """
    # NaN is the only float that compares unequal to itself, so this detects
    # float NaN without needing any import.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return str(text).strip().lower()

# Normalize both text columns of each split, overwriting the columns in place
# (train before test, question before answer).
for split_df in (train_df, test_df):
    for column in ('question', 'answer'):
        split_df[column] = split_df[column].apply(normalize_text)

# Preview the normalized training data
train_df.head()

Unnamed: 0,question,answer
0,natalia sold clips to 48 of her friends in apr...,natalia sold 48/2 = <<48/2=24>>24 clips in may...
1,weng earns $12 an hour for babysitting. yester...,weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...
2,betty is saving money for a new wallet which c...,"in the beginning, betty has only 100 / 2 = $<<..."
3,"julie is reading a 120-page book. yesterday, s...",maila read 12 x 2 = <<12*2=24>>24 pages today....
4,james writes a 3-page letter to 2 different fr...,he writes each friend 3*2=<<3*2=6>>6 pages a w...


In [13]:
# Tokenization
# - Load the pretrained 'distilgpt2' tokenizer through Hugging Face AutoTokenizer
# - Tokenize one question to illustrate the output

tokenizer = AutoTokenizer.from_pretrained('distilgpt2')

# Example: split the first training question into tokens, then map them to IDs
sample = train_df['question'].iloc[0]
tokens = tokenizer.tokenize(sample)
print('Sample question:', sample)
print('Tokens:', tokens)
print('Token IDs:', tokenizer.convert_tokens_to_ids(tokens))

Sample question: natalia sold clips to 48 of her friends in april, and then she sold half as many clips in may. how many clips did natalia sell altogether in april and may?
Tokens: ['nat', 'alia', 'Ġsold', 'Ġclips', 'Ġto', 'Ġ48', 'Ġof', 'Ġher', 'Ġfriends', 'Ġin', 'Ġapr', 'il', ',', 'Ġand', 'Ġthen', 'Ġshe', 'Ġsold', 'Ġhalf', 'Ġas', 'Ġmany', 'Ġclips', 'Ġin', 'Ġmay', '.', 'Ġhow', 'Ġmany', 'Ġclips', 'Ġdid', 'Ġnat', 'alia', 'Ġsell', 'Ġaltogether', 'Ġin', 'Ġapr', 'il', 'Ġand', 'Ġmay', '?']
Token IDs: [32353, 9752, 2702, 19166, 284, 4764, 286, 607, 2460, 287, 46593, 346, 11, 290, 788, 673, 2702, 2063, 355, 867, 19166, 287, 743, 13, 703, 867, 19166, 750, 34664, 9752, 3677, 13318, 287, 46593, 346, 290, 743, 30]


In [18]:
# Save Preprocessed Data
# - Write each split to its own CSV for model training; the DataFrame index
#   is not written (index=False).

for frame, destination in (
    (train_df, '../data/main_train_clean.csv'),
    (test_df, '../data/main_test_clean.csv'),
):
    frame.to_csv(destination, index=False)