In [None]:
import os
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Step 1: Load and prepare the data
df = pd.read_csv('random_data.csv')

# Show the first rows so the column layout can be checked by eye
print(df.head())

# Map the CSV column names to the names used by the rest of the pipeline.
# The lookup is case-sensitive, so the CSV must use exactly these keys.
column_renames = {
    'raw': 'input_text',
    'login': 'login_text',
    'password': 'password_text',
    'url': 'url_text',
}
missing_columns = [name for name in column_renames if name not in df.columns]
if missing_columns:
    # The message spells the columns exactly as they are checked (lowercase),
    # so it does not send the reader looking for differently-cased headers.
    print("Error: The expected columns 'raw', 'login', 'password', and 'url' were not found in the dataset.")
    print("Available columns: ", df.columns)
    raise KeyError("Expected columns not found.")
df = df.rename(columns=column_renames)

# Convert the dataframe to a Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Tokenizer and model initialization
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Move the model to the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

def preprocess_function(examples):
    """Tokenize a batch of raw strings and their structured target strings.

    Args:
        examples: A batch mapping column names to lists of values. The
            columns 'input_text', 'url_text', 'login_text' and
            'password_text' are read.

    Returns:
        The tokenizer output for the prefixed inputs, with an added 'labels'
        entry holding the target token ids. Padding positions in the labels
        are set to -100 so that the loss skips them.
    """
    def _as_text(value):
        # Missing cells arrive as None or as float NaN (NaN is the only value
        # for which `value != value`). Render them as an empty string rather
        # than letting the f-string produce the literal text "None"/"nan".
        if value is None or value != value:
            return ""
        return str(value)

    inputs = [f"clean: {_as_text(text)}" for text in examples['input_text']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')

    # Combine url, login, and password into a single target string
    targets = [
        f"url: {_as_text(url)} login: {_as_text(login)} password: {_as_text(password)}"
        for url, login, password in zip(
            examples['url_text'], examples['login_text'], examples['password_text']
        )
    ]
    label_ids = tokenizer(targets, max_length=128, truncation=True, padding='max_length')['input_ids']

    # The targets are padded to a fixed length; without masking, every pad
    # position would count towards the training loss. -100 is the index the
    # cross-entropy loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in label_ids
    ]
    return model_inputs

# Refuse to tokenize unless every column read by preprocess_function is present
_required_columns = ('input_text', 'login_text', 'password_text', 'url_text')
if not all(column in dataset.column_names for column in _required_columns):
    print("Error: The expected columns 'input_text', 'login_text', 'password_text', and 'url_text' were not found in the dataset.")
    print("Available columns: ", dataset.column_names)
    raise KeyError("Expected columns not found.")

# Tokenize in batches and drop the original text columns from the result
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# Hold out 10% of the examples; the result exposes 'train' and 'test' splits
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1)

# Collate a list of tokenized examples into one batch of PyTorch tensors
def data_collator(features):
    """Build a tensor batch from a list of per-example feature dicts.

    Each field (taken from the keys of the first example) is stacked into a
    tensor across examples, converted back to nested Python lists, and the
    resulting dict is handed to ``tokenizer.pad`` with ``return_tensors='pt'``.

    Args:
        features: List of dicts, one per example, all sharing the same keys.

    Returns:
        Whatever ``tokenizer.pad`` returns for the assembled batch.
    """
    # Stacking requires every example to have the same length for a field;
    # the result goes back to plain lists because that is what is passed on
    # to tokenizer.pad.
    columns_as_lists = {
        field: torch.stack([torch.tensor(example[field]) for example in features]).tolist()
        for field in features[0].keys()
    }
    return tokenizer.pad(columns_as_lists, return_tensors='pt')

# Directory handed to TrainingArguments as output_dir; created up front
output_dir = './training_results_t5'
os.makedirs(output_dir, exist_ok=True)

# Step 4: Define the training arguments and trainer
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
)

train_split = tokenized_datasets['train']
eval_split = tokenized_datasets['test']
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    data_collator=data_collator,
)

# Step 5: Train the model
trainer.train()

# Step 6: Save the model and the tokenizer side by side in one directory
save_dir = './cleaning_model_t5'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)



                      url     login      password  \
0  http://FYlF13wL3m.info  mBXoS0WR  TE8EcMCl7mIn   
1   http://dajcTiVFzQ.com  ZV8T2LpD  RLmvvLLyDZOZ   
2   http://n3YiTeUkap.org  5ELt50Vl  SVzO4K2nrTIj   
3                     NaN  dbrpO9rF  5f4WtPt3DHYm   
4  http://CJdxqiA5W0.info  8NbSixAI  d3v2L0Oc1To5   

                                            raw  
0  http://FYlF13wL3m.info|mBXoS0WR|TE8EcMCl7mIn  
1   ZV8T2LpD,RLmvvLLyDZOZ,http://dajcTiVFzQ.com  
2   http://n3YiTeUkap.org:5ELt50Vl:SVzO4K2nrTIj  
3                         dbrpO9rF-5f4WtPt3DHYm  
4  http://CJdxqiA5W0.info~8NbSixAI~d3v2L0Oc1To5  


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


cuda


Map:   0%|          | 0/100003 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
