In [None]:
import os
import re
import sys
from pathlib import Path

import torch
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, DataCollatorWithPadding, T5TokenizerFast


def _find_dir_containing_src(start):
    """Return the closest directory, from `start` upward, that has an entry named 'src'.

    Args:
        start: Directory (str or Path) where the upward search begins.

    Returns:
        pathlib.Path of the first directory whose listing contains 'src'.

    Raises:
        FileNotFoundError: If the filesystem root is reached without finding 'src'.
    """
    candidate = Path(start).resolve()
    while 'src' not in os.listdir(candidate):
        # The parent of the filesystem root is the root itself; stop instead of looping forever.
        if candidate.parent == candidate:
            raise FileNotFoundError(f"No directory containing 'src' found above {start}")
        candidate = candidate.parent
    return candidate


# Get the current working directory
ROOT_DIR = os.getcwd()

# Define the directory where the dataset is located
DATASET_DIR = (Path(ROOT_DIR).parent / 'data').resolve()

# Create the full path for the dataset file
file_path = (DATASET_DIR / 'filtered.tsv').resolve()

# Locate the directory that contains 'src' (searched once, exact name match)
current_location = _find_dir_containing_src(ROOT_DIR)

# Set the parent directory to the located directory
PARENT_DIRECTORY = current_location

# Kept as a separate name because the data paths below are derived from it
current = current_location

# Add the located directory to the system path for module imports (only once)
if str(current_location) not in sys.path:
    sys.path.append(str(current_location))

# Project-local import: must run after sys.path contains the directory holding 'src'
from src.models.train.model1.train2 import toxicity_classication as tc

DATA_FOLDER = os.path.join(Path(current).parent, 'data')
data_path = os.path.join(DATA_FOLDER, 'firstprocess.csv')

# Columns dropped after loading. The list is passed directly: list.reverse()
# works in place and returns None, and removal order is irrelevant anyway.
COLUMNS_TO_DROP = ['lenght_diff', 'source_tox', 'similarity', 'target_tox']

dataset = load_dataset("csv", data_files=data_path).remove_columns(COLUMNS_TO_DROP)


In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the model
MODEL_CHECKPOINT = 't5-small'

# Pick 'cuda' when torch reports an available GPU, otherwise fall back to 'cpu'
if torch.cuda.is_available():
    DEVICE_TYPE = 'cuda'
else:
    DEVICE_TYPE = 'cpu'

# Tokenizer built from the checkpoint above
TOKENIZER_INSTANCE = T5TokenizerFast.from_pretrained(MODEL_CHECKPOINT)

# Load the sequence-to-sequence model from the same checkpoint, then move it to the chosen device
SEQUENCE_TO_SEQUENCE_MODEL = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
SEQUENCE_TO_SEQUENCE_MODEL = SEQUENCE_TO_SEQUENCE_MODEL.to(DEVICE_TYPE)


In [None]:
# Task prefix prepended to every text before tokenization
TASK_PREFIX = 'summarize: '


def prepare_data(batch, split: str = 'source'):
    """Prefix every text of one column of `batch` and tokenize the result.

    Args:
        batch: Mapping from column name to a sequence of texts.
        split: Name of the column to read the texts from.

    Returns:
        Whatever TOKENIZER_INSTANCE returns for the prefixed texts when
        called with truncation enabled.
    """
    prefixed_texts = []
    for text in batch[split]:
        prefixed_texts.append(TASK_PREFIX + text)

    return TOKENIZER_INSTANCE(prefixed_texts, truncation=True)


In [None]:
# Number of shuffled training rows to keep for this run
SAMPLE_SIZE = 10

# Shuffle the 'train' split with a fixed seed and keep the first SAMPLE_SIZE rows.
# .select() keeps the result a Dataset; slicing with [:10] would return a plain
# dict of columns, which has no .map(). The size is clamped so a smaller split
# does not raise an out-of-range error.
shuffled_train = dataset['train'].shuffle(seed=69)
sampled_data = shuffled_train.select(range(min(SAMPLE_SIZE, len(shuffled_train))))

# Tokenize the 'source' text and drop the raw text columns.
# The column list is passed directly: list.reverse() returns None.
source_tokenized_data = sampled_data.map(prepare_data, batched=True).remove_columns(['source', 'target'])

# Tokenize the 'target' text and drop the raw text columns and the attention mask
target_tokenized_data = sampled_data.map(
    lambda x: prepare_data(x, split='target'), batched=True
).remove_columns(['source', 'target', 'attention_mask'])

# Display the first 10 samples of the tokenized 'source' data
print(source_tokenized_data[:10])

# Display the first 10 samples of the tokenized 'target' data
print(target_tokenized_data[:10])


In [None]:
# Create a data collator with padding using the specified tokenizer
collator_with_padding = DataCollatorWithPadding(tokenizer=TOKENIZER_INSTANCE)

# Both loaders share one batch size and keep shuffle=False so that, when they
# are zipped together, batch k of the sources holds the same rows as batch k
# of the targets. Differing batch sizes would pair unrelated rows and drop data.
BATCH_SIZE = 16

# DataLoader over the tokenized 'source' data
source_data_loader = DataLoader(
    dataset=source_tokenized_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collator_with_padding,
)

# DataLoader over the tokenized 'target' data
target_data_loader = DataLoader(
    dataset=target_tokenized_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collator_with_padding,
)


In [None]:
def _strip_task_prefix(text):
    """Remove TASK_PREFIX from the start of `text`; leave any other occurrence untouched."""
    if text.startswith(TASK_PREFIX):
        return text[len(TASK_PREFIX):]
    return text


# Define a function to build a dataset by generating summaries using the model
def build_dataset():
    """Yield one record per example with the model output and its toxicity result.

    Walks source_data_loader and target_data_loader in lockstep. For each pair
    of batches it generates output with SEQUENCE_TO_SEQUENCE_MODEL, decodes the
    generated, source and target token ids, and passes the decoded output to
    tc.toxic_classification.

    Yields:
        dict with keys "source" and "target" (decoded text without the leading
        task prefix), "sum" (decoded model output) and "sumtox" (the matching
        item returned by tc.toxic_classification).
    """
    for batch_index, (source_batch, target_batch) in enumerate(zip(source_data_loader, target_data_loader)):
        # Move the batch to the specified device
        model_batch = {k: v.to(DEVICE_TYPE) for k, v in source_batch.items()}

        # Generate summaries using the model
        output = SEQUENCE_TO_SEQUENCE_MODEL.generate(**model_batch)
        output_decoded = TOKENIZER_INSTANCE.batch_decode(output, skip_special_tokens=True)

        # Decode the source and target batches
        source = TOKENIZER_INSTANCE.batch_decode(source_batch['input_ids'], skip_special_tokens=True)
        target = TOKENIZER_INSTANCE.batch_decode(target_batch['input_ids'], skip_special_tokens=True)

        # Perform toxic classification on the generated summaries
        summary_toxicity = tc.toxic_classification(output_decoded)

        # Report progress only once the batch has actually been processed
        print("Finished iteration ", batch_index)

        # Only the leading prefix is stripped: a regex substitution would also
        # delete the same text wherever it appears inside a sentence.
        for summary, source_text, target_text, toxicity in zip(output_decoded, source, target, summary_toxicity):
            yield {
                "source": _strip_task_prefix(source_text),
                "target": _strip_task_prefix(target_text),
                "sum": summary,
                "sumtox": toxicity,
            }

# Turn the records produced by build_dataset into a Dataset
model1_dataset = Dataset.from_generator(build_dataset)

# Write the dataset as 'model1.csv' inside the data folder
model1_csv_path = os.path.join(DATA_FOLDER, 'model1.csv')
model1_dataset.to_csv(model1_csv_path)


In [None]:
import pandas as pd

# Load 'model1.csv' from the data folder into a pandas DataFrame
model1_csv_file = os.path.join(DATA_FOLDER, 'model1.csv')
model1_dataframe = pd.read_csv(model1_csv_file)

# Show the first rows of the DataFrame as the cell output
model1_dataframe.head()
