In [None]:
import torch
import pandas as pd
from torch.optim import AdamW
from transformers import get_scheduler
import torch.nn as nn
from src.load_dataloader import initial_dataloader
from src.load_dataloader import initial_dataloader_cv
from src.evaluation import evaluate
from src.evaluation import evaluate_process
from src.load_config import load_config
from src.evaluation import show_sentence
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
# Length limit shared by the notebook; it is forwarded as `max_length` when the
# base `model` is loaded with `from_pretrained` in the next cell.
max_length = 128

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Checkpoint identifier shared by the tokenizer and the base model.
model_name = 't5-small'

# The tokenizer is loaded first; it does not depend on the model object.
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Base model; `max_length` is forwarded to `from_pretrained` as an extra keyword.
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    max_length=max_length,
)

In [None]:
# Read the combined corpus from disk.
data = pd.read_csv('data_preprocess/datasets_combine.csv')

# Partition by the `label` column: rows with label 1 go to Shakespeare_data,
# rows with label 0 go to CNN_data.
Shakespeare_data = data.loc[data['label'] == 1]
CNN_data = data.loc[data['label'] == 0]

# Hold out 0.9% of each partition as a final test set (fixed seed for repeatability).
Shakespeare_data_normal, Shakespeare_data_test = train_test_split(
    Shakespeare_data,
    test_size=0.009,
    random_state=42,
)
CNN_data_normal, CNN_data_test = train_test_split(
    CNN_data,
    test_size=0.009,
    random_state=42,
)

# Final test frame: Shakespeare hold-out rows followed by CNN hold-out rows.
data_final_test = pd.concat([Shakespeare_data_test, CNN_data_test])


In [None]:
# Divide what remains after the test hold-out into two equal halves,
# one half for each of the two models trained below.
Shakespeare_data_model1, Shakespeare_data_model2 = train_test_split(
    Shakespeare_data_normal,
    test_size=0.5,
    random_state=42,
)
CNN_data_model1, CNN_data_model2 = train_test_split(
    CNN_data_normal,
    test_size=0.5,
    random_state=42,
)


In [None]:
from torch.utils.data import Dataset, DataLoader
class T5Dataset(Dataset):
    """Map-style dataset that serves one example at a time from a dict of encodings.

    Args:
        encodings: Mapping from field name (e.g. ``"input_ids"``) to an indexable
            batch of values. It must contain the key ``"input_ids"``, which
            defines the dataset length. Values may be tensors or nested lists.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the `input_ids` field.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return a dict holding the ``idx``-th entry of every field as a tensor."""
        item = {}
        for key, val in self.encodings.items():
            if isinstance(val, torch.Tensor):
                # `torch.tensor(existing_tensor)` emits a copy-construct UserWarning;
                # clone().detach() yields the same independent copy without it.
                item[key] = val[idx].clone().detach()
            else:
                item[key] = torch.tensor(val[idx])
        return item



In [None]:
def prepare_data(cnn_sentences, shakespeare_sentences, tokenizer, max_length=512):
    """Tokenize source/target sentences and wrap them in a ``T5Dataset``.

    The i-th entry of ``cnn_sentences`` is used as the model input and the i-th
    entry of ``shakespeare_sentences`` as its target. If the two sequences differ
    in length, both are cut to the shorter length (with a warning) so that every
    input has a label; otherwise indexing the dataset past the shorter side
    would fail.

    Args:
        cnn_sentences: Sequence of source sentences (strings).
        shakespeare_sentences: Sequence of target sentences (strings).
        tokenizer: Hugging Face tokenizer; must accept ``text_target`` and expose
            ``pad_token_id``.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        T5Dataset with ``input_ids``, ``attention_mask`` and ``labels`` tensors;
        padding positions in ``labels`` are set to -100.
    """
    import warnings

    cnn_sentences = list(cnn_sentences)
    shakespeare_sentences = list(shakespeare_sentences)

    n_pairs = min(len(cnn_sentences), len(shakespeare_sentences))
    if len(cnn_sentences) != len(shakespeare_sentences):
        warnings.warn(
            f"prepare_data received {len(cnn_sentences)} source and "
            f"{len(shakespeare_sentences)} target sentences; "
            f"using the first {n_pairs} of each."
        )
        cnn_sentences = cnn_sentences[:n_pairs]
        shakespeare_sentences = shakespeare_sentences[:n_pairs]

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager:
    # the tokenizer encodes the targets and returns their ids under "labels".
    model_inputs = tokenizer(
        cnn_sentences,
        text_target=shakespeare_sentences,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors="pt",
    )

    # Replace padding token ids in the labels by -100 so the loss ignores them.
    labels = model_inputs["labels"].clone()
    labels[labels == tokenizer.pad_token_id] = -100

    return T5Dataset({
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": labels,
    })


In [None]:
# Carve a 10% validation slice out of the first model's share of each corpus.
CNN_data_train, CNN_data_val = train_test_split(
    CNN_data_model1,
    test_size=0.1,
    random_state=42,
)
Shakespeare_data_train, Shakespeare_data_val = train_test_split(
    Shakespeare_data_model1,
    test_size=0.1,
    random_state=42,
)

# Build the tokenized datasets for the first model.
# NOTE(review): prepare_data is called without `max_length`, so its default of 512
# applies here rather than the notebook-level `max_length` — confirm this is intended.
model11_train_dataset = prepare_data(
    cnn_sentences=CNN_data_train['sentence'].tolist(),
    shakespeare_sentences=Shakespeare_data_train['sentence'].tolist(),
    tokenizer=tokenizer,
)
model11_val_dataset = prepare_data(
    cnn_sentences=CNN_data_val['sentence'].tolist(),
    shakespeare_sentences=Shakespeare_data_val['sentence'].tolist(),
    tokenizer=tokenizer,
)



In [None]:
# Carve a 10% validation slice out of the second model's share of each corpus.
CNN_data_train2, CNN_data_val2 = train_test_split(
    CNN_data_model2,
    test_size=0.1,
    random_state=42,
)
Shakespeare_data_train2, Shakespeare_data_val2 = train_test_split(
    Shakespeare_data_model2,
    test_size=0.1,
    random_state=42,
)

# Build the tokenized datasets for the second model.
# NOTE(review): prepare_data is called without `max_length`, so its default of 512
# applies here rather than the notebook-level `max_length` — confirm this is intended.
model22_train_dataset = prepare_data(
    cnn_sentences=CNN_data_train2['sentence'].tolist(),
    shakespeare_sentences=Shakespeare_data_train2['sentence'].tolist(),
    tokenizer=tokenizer,
)
model22_val_dataset = prepare_data(
    cnn_sentences=CNN_data_val2['sentence'].tolist(),
    shakespeare_sentences=Shakespeare_data_val2['sentence'].tolist(),
    tokenizer=tokenizer,
)


In [None]:
# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fresh pretrained checkpoint for the first model, moved onto the chosen device.
model11 = T5ForConditionalGeneration.from_pretrained('t5-small')
model11 = model11.to(device)

# Hyper-parameters and bookkeeping for the first training run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,                # learning-rate warmup steps
    weight_decay=0.01,
    # Run both training and evaluation; evaluate on a step schedule.
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    # Cadence of logging, evaluation and checkpointing (in optimisation steps).
    logging_steps=200,
    eval_steps=500,
    save_steps=500,
    save_total_limit=1,              # keep only the most recent checkpoint
)

# Wire model, arguments and datasets together.
# A compute_metrics callable could be supplied here to report metrics at evaluation time.
trainer = Trainer(
    model=model11,
    args=training_args,
    train_dataset=model11_train_dataset,
    eval_dataset=model11_val_dataset,
)

# Fine-tune, persist the result, then report validation metrics as the cell output.
trainer.train()
trainer.save_model('./saved_model')
trainer.evaluate()

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model22 = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

# Define training arguments for the second model.
# The output/log directories are distinct from the first run's ('./results', './logs')
# so this run's checkpoints and logs do not overwrite or mix with the first model's.
training_args2 = TrainingArguments(
    output_dir='./results_model22',  # output directory for model checkpoints
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs_model22',    # directory for storing logs
    logging_steps=200,
    do_train=True,                   # whether to run training
    do_eval=True,                    # whether to run eval on the dev set
    evaluation_strategy="steps",     # evaluation strategy to adopt during training
    eval_steps=500,                  # evaluation step
    save_steps=500,                  # save checkpoint every 500 steps
    save_total_limit=1,              # only keep the most recent checkpoint
)

# Initialize Trainer.
# The model must be model22: passing model11 here would keep training the first
# model on the second data split and leave model22 as an untouched pretrained checkpoint.
trainer2 = Trainer(
    model=model22,
    args=training_args2,
    train_dataset=model22_train_dataset,  # training split for the second model
    eval_dataset=model22_val_dataset,     # validation split for the second model
    # You can also include a compute_metrics function to calculate metrics during evaluation
)

# Train the model
trainer2.train()

# Save the model to its own directory so the first model saved in './saved_model' is kept.
trainer2.save_model('./saved_model22')

# Evaluate the model
trainer2.evaluate()

In [None]:
def generate_sentence(sentence, label, model1, model2, tokenizer, device, max_length=512):
    """Generate text for one sentence with the model selected by ``label``.

    Args:
        sentence: Input text to encode and feed to the model.
        label: Routing value; ``1`` selects ``model2``, any other value ``model1``.
        model1: Model used when ``label`` is not 1.
        model2: Model used when ``label`` is 1.
        tokenizer: Tokenizer providing ``encode`` and ``decode``.
        device: Device the encoded input is moved to before generation.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string with special tokens removed.
    """
    # Choose the model based on the label
    model = model2 if label == 1 else model1

    # Tokenize the input sentence
    inputs = tokenizer.encode(sentence, return_tensors="pt", add_special_tokens=True).to(device)

    # Generate in eval mode so dropout cannot perturb the output, then restore
    # whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(inputs, max_length=max_length)
    finally:
        if was_training:
            model.train()

    # Decode the generated sentence
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage for generating text with the models
def generate_from_test_set(test_set, model1, model2, tokenizer, device, max_sentences=10):
    """Run ``generate_sentence`` over the leading rows of a test DataFrame.

    Args:
        test_set: pandas DataFrame with a ``sentence`` and a ``label`` column.
        model1: Passed through to ``generate_sentence``.
        model2: Passed through to ``generate_sentence``.
        tokenizer: Passed through to ``generate_sentence``.
        device: Passed through to ``generate_sentence``.
        max_sentences: Maximum number of rows to process, taken from the top of
            ``test_set``. Defaults to 10; ``None`` processes every row.

    Returns:
        List of generated strings, one per processed row, in row order.
    """
    if max_sentences is None:
        rows = test_set
    elif max_sentences <= 0:
        # head() with a negative count would drop rows from the end instead of
        # returning nothing, so handle non-positive limits explicitly.
        return []
    else:
        rows = test_set.head(max_sentences)

    return [
        generate_sentence(sentence, label, model1, model2, tokenizer, device)
        for sentence, label in zip(rows['sentence'], rows['label'])
    ]

# data_final_test is expected to be a pandas DataFrame holding the test set.
# generate_from_test_set processes at most the first 10 rows of it.
generated_sentences = generate_from_test_set(
    test_set=data_final_test,
    model1=model11,
    model2=model22,
    tokenizer=tokenizer,
    device=device,
)

# Print each of the first 10 source sentences next to its generated counterpart.
for original, generated in zip(data_final_test['sentence'].iloc[:10], generated_sentences):
    print(f"Original: {original}")
    print(f"Generated: {generated}\n")