# **Parameters**

In [None]:
# Directory that holds the raw dataset files (relative to this notebook).
datasetDir = "../datasets"
# Directory where trained model artifacts are stored.
modelDir = "saved_models"

# **Import Libraries**

In [None]:
# Import necessary libraries
import json
import os

import pandas as pd
from transformers import GPT2Tokenizer

# **Load Dataset**

In [None]:
# Load dataset (replace with your dataset path)
dataset_name = "path/to/dataset.csv"

# Resolve the file relative to the shared dataset directory; this needs the
# `os` module, which must be imported alongside the other libraries.
dataset_path = os.path.join(datasetDir, dataset_name)
df = pd.read_csv(dataset_path)

# Inspect the dataset
print("Dataset preview:")
print(df.head())

# **Cleaning and Processing Data**

In [None]:
# Data cleaning (remove NaN values, duplicates, etc.)
# df = df.dropna().drop_duplicates()

# Optional: normalize both text columns to lowercase
for text_column in ("input", "response"):
    df[text_column] = df[text_column].str.lower()

# Show the first rows after cleaning
print("Cleaned dataset preview:", df.head(), sep="\n")


# **Tokenize Data**

In [None]:
# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Adding a special token for input-response separation.
# NOTE: this grows the vocabulary, so any model trained with this tokenizer
# must call `model.resize_token_embeddings(len(tokenizer))`.
tokenizer.add_special_tokens({"sep_token": "<|sep|>"})

# GPT-2 ships without a padding token, so tokenizing with
# padding="max_length" would raise a ValueError. Reuse the end-of-sequence
# token for padding; it needs no new embedding row.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Function to tokenize input and response
def tokenize_data(data, tokenizer, max_length=128):
    """Tokenize every input/response pair of *data* into fixed-length sequences.

    Each row is rendered as ``"<input> <|sep|> <response>"`` and the whole
    list of texts is encoded in a single batched tokenizer call.

    Args:
        data: DataFrame with ``"input"`` and ``"response"`` columns.
        tokenizer: Callable that accepts a list of strings plus the keyword
            arguments ``max_length``, ``truncation`` and ``padding`` and
            returns a mapping with ``"input_ids"`` and ``"attention_mask"``
            (e.g. a Hugging Face tokenizer with a padding token configured).
        max_length: Length every sequence is truncated or padded to.

    Returns:
        A tuple ``(input_ids, attention_masks)``; each is a list with one
        list of ints per row of *data*.
    """
    combined_texts = [
        f"{input_text} <|sep|> {response_text}"
        for input_text, response_text in zip(data["input"], data["response"])
    ]

    # Nothing to encode: keep the empty-result contract without calling the
    # tokenizer on an empty batch.
    if not combined_texts:
        return [], []

    # No return_tensors here: plain Python lists are what the caller stores,
    # so building tensors only to squeeze and convert them back is wasted
    # work (and squeeze() would collapse a length-1 sequence to a scalar).
    encoded = tokenizer(
        combined_texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    input_ids = [list(ids) for ids in encoded["input_ids"]]
    attention_masks = [list(mask) for mask in encoded["attention_mask"]]
    return input_ids, attention_masks

# Encode the whole dataset to fixed-length sequences
max_length = 128
input_ids, attention_masks = tokenize_data(df, tokenizer, max_length)

# Collect the encoded sequences in a DataFrame, one row per example
tokenized_data = pd.DataFrame(
    dict(input_ids=input_ids, attention_mask=attention_masks)
)

# Show the first encoded rows
print("Tokenized data preview:", tokenized_data.head(), sep="\n")


# **Save Preprocessed Data**

In [None]:
# Save preprocessed and tokenized data to a JSON file for later use
# preprocessed_data_path = "preprocessed_data.json"

# tokenized_data.to_json(preprocessed_data_path, orient="records", lines=True)

# print(f"Preprocessed data saved to {preprocessed_data_path}")