<a href="https://colab.research.google.com/github/LaraSaads/Projects/blob/main/text_summarization_text_generation_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub
# Fetch the DialogSum dataset through kagglehub; `path` is used below as the
# directory prefix for the CSV files.
path = kagglehub.dataset_download("marawanxmamdouh/dialogsum")

In [None]:
import pandas as pd

# Read the train / validation / test splits from the dataset's CSV folder.
train, val, test = (
    pd.read_csv(f"{path}/CSV/{split_name}.csv")
    for split_name in ("train", "validation", "test")
)


In [None]:
# Preview only the first rows instead of rendering the whole frame.
train.head()

In [None]:
# Print the column names, dtypes and non-null counts of the training split.
train.info()


In [None]:
# Drop the `id` column. Reassigning (rather than inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
train = train.drop(columns=["id"], errors="ignore")

In [None]:
import re

def clean_text(text):
    """Normalise a raw dialogue, summary or topic string.

    Values that ``pd.isna`` reports as missing become ``""``. Otherwise the
    following steps run in order: ``#PersonN#`` speaker tags are removed, the
    text is lowercased, URLs are removed, every character other than letters,
    digits, whitespace and ``. , ! ? '`` is removed, and whitespace runs are
    collapsed to a single space with both ends trimmed.
    """
    if pd.isna(text):
        return ""

    # The tag pattern is case-sensitive, so it must run before lowercasing.
    cleaned = re.sub(r"#Person\d+#", "", text).lower()

    # Remaining substitutions, applied in this order:
    # URLs, disallowed characters, whitespace runs.
    substitutions = (
        (r"http\S+|www\S+", ""),
        (r"[^a-zA-Z0-9.,!?'\s]", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


In [None]:
# Add a cleaned copy of every text column to each split
# (dialogue -> dialogue_clean, summary -> summary_clean, topic -> topic_clean).
for split_df in (train, val, test):
    for source_col in ("dialogue", "summary", "topic"):
        split_df[f"{source_col}_clean"] = split_df[source_col].apply(clean_text)


In [None]:
# Preview the first rows, now including the *_clean columns.
train.head()

In [None]:
# The cleaned dialogue/summary columns of `val` and `test` were already built
# in the cleaning cell above; recomputing them here was a verbatim duplicate.
# Verify the invariant instead of repeating the work.
for split_df in (val, test):
    missing = {"dialogue_clean", "summary_clean"} - set(split_df.columns)
    assert not missing, f"missing cleaned columns: {sorted(missing)}"

In [None]:

# Keep only the cleaned dialogue and summary columns for the summarisation task.
train_data, val_data = (
    split_df[["dialogue_clean", "summary_clean"]] for split_df in (train, val)
)


In [None]:
# Rename to the "text" / "summary" column names that tokenize_function reads.
train_data, val_data = (
    frame.rename(columns={"dialogue_clean": "text", "summary_clean": "summary"})
    for frame in (train_data, val_data)
)


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Tokenizer and seq2seq model are both loaded from the same checkpoint name.
model_name = "facebook/bart-large-cnn"

tokenizer, model = (
    BartTokenizer.from_pretrained(model_name),
    BartForConditionalGeneration.from_pretrained(model_name),
)


In [None]:
def tokenize_function(batch):
    """Tokenize a batch of dialogues and their reference summaries.

    Args:
        batch: mapping with a ``"text"`` list (dialogues) and a ``"summary"``
            list (targets), as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The encoder inputs (padded/truncated to 512 tokens) with an added
        ``"labels"`` entry holding the target ids (padded/truncated to 128
        tokens), where padding positions are replaced by ``-100``.
    """
    model_inputs = tokenizer(
        batch["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=batch["summary"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Because targets are padded to a fixed length, mask the padding with -100
    # so those positions are ignored by the loss instead of being trained on.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs


In [None]:
from datasets import Dataset

# Wrap each pandas frame as a Hugging Face dataset and tokenize it in batches,
# removing the raw "text" / "summary" columns afterwards.
hf_train = Dataset.from_pandas(train_data).map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "summary"],
)
hf_val = Dataset.from_pandas(val_data).map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "summary"],
)


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Output location and checkpoint retention.
    output_dir="./dialogsum_bart",
    save_steps=1000,
    save_total_limit=2,
    # Batch size 1 per device with 4 accumulation steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    # Optimisation schedule.
    learning_rate=2e-5,
    num_train_epochs=2,
    fp16=True,
    # Evaluation / logging cadence; no external experiment tracker.
    eval_strategy="steps",
    logging_steps=100,
    report_to="none",
)


In [None]:
# NOTE(review): no `compute_metrics` is supplied here, so evaluation with this
# trainer does not produce ROUGE scores.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=hf_train,
    eval_dataset=hf_val,
)


In [None]:
# Fine-tune BART on the tokenized training set using `training_args`.
trainer.train()


In [None]:
!pip install evaluate rouge_score
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE between decoded predictions and reference summaries.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. Predictions are expected
            to be token ids; if a tuple of model outputs is received, its
            first element is used.

    Returns:
        The result of ``rouge.compute`` on the decoded strings.

    NOTE(review): the ``Trainer`` created earlier in this notebook is not
    given this function, so it only takes effect once it is passed as
    ``compute_metrics=`` to a trainer that yields generated token ids.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    def _restore_padding(sequences):
        # -100 is an ignore marker, not a vocabulary id; map it to the pad
        # token so the sequences can be decoded.
        return [
            [(token_id if token_id != -100 else pad_id) for token_id in sequence]
            for sequence in sequences
        ]

    decoded_preds = tokenizer.batch_decode(
        _restore_padding(predictions), skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        _restore_padding(labels), skip_special_tokens=True
    )

    return rouge.compute(predictions=decoded_preds, references=decoded_labels)

In [None]:
# Evaluate on the validation set passed to the trainer.
# NOTE(review): `trainer` was constructed without `compute_metrics`, so the
# ROUGE function defined above is not used by this call.
trainer.evaluate()


In [None]:
model.save_pretrained("dialogsum_model")
tokenizer.save_pretrained("dialogsum_model")
!zip -r dialogsum_bart.zip /kaggle/working/dialogsum_bart



In [None]:
def summarize_dialogue(dialogue):
    """Summarise a single dialogue string with the fine-tuned BART ``model``.

    The input is first passed through ``clean_text`` so inference sees the
    same normalisation that was applied to the training dialogues.

    Args:
        dialogue: raw dialogue text.

    Returns:
        The decoded summary (special tokens removed) of the top beam.
    """
    encoded = tokenizer(
        clean_text(dialogue),
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    # Move every tensor to the device the model currently lives on.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Unpack the full encoding so the attention mask travels with the ids.
    summary_ids = model.generate(
        **encoded,
        max_length=150,
        num_beams=4,
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Smoke test: summarise a short, already lowercased example utterance.
summarize_dialogue(
    "hi, mr. smith. i'm doctor hawkins. why are you here today?"
)


In [None]:
# Build the topic-generation frames: cleaned dialogue as "text", cleaned topic
# as "topic".
topic_train, topic_val = (
    split_df[["dialogue_clean", "topic_clean"]].rename(
        columns={"dialogue_clean": "text", "topic_clean": "topic"}
    )
    for split_df in (train, val)
)


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer and model for topic generation share one checkpoint name.
t5_model_name = "t5-small"

t5_tokenizer, t5_model = (
    T5Tokenizer.from_pretrained(t5_model_name),
    T5ForConditionalGeneration.from_pretrained(t5_model_name),
)


In [None]:
def tokenize_topic(batch):
    """Tokenize a batch of dialogues and topic labels for T5.

    Args:
        batch: mapping with a ``"text"`` list (dialogues) and a ``"topic"``
            list (targets), as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        Encoder inputs for the prompts ``"generate topic: <text>"``
        (padded/truncated to 512 tokens) with an added ``"labels"`` entry
        holding the topic ids (padded/truncated to 16 tokens), where padding
        positions are replaced by ``-100``.
    """
    prompts = ["generate topic: " + text for text in batch["text"]]

    model_inputs = t5_tokenizer(
        prompts,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    labels = t5_tokenizer(
        batch["topic"],
        max_length=16,
        truncation=True,
        padding="max_length",
    )

    # Targets are padded to a fixed length; mask the padding with -100 so the
    # loss ignores those positions instead of training on them.
    pad_id = t5_tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs


In [None]:
from datasets import Dataset

# Wrap the topic frames as Hugging Face datasets and tokenize them in batches,
# removing the raw "text" / "topic" columns afterwards.
hf_topic_train = Dataset.from_pandas(topic_train).map(
    tokenize_topic,
    batched=True,
    remove_columns=["text", "topic"],
)
hf_topic_val = Dataset.from_pandas(topic_val).map(
    tokenize_topic,
    batched=True,
    remove_columns=["text", "topic"],
)


In [None]:
from transformers import TrainingArguments, Trainer

topic_args = TrainingArguments(
    # Output location and checkpoint retention.
    output_dir="./dialogsum_t5_topic",
    save_total_limit=2,
    # Batch sizes.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Optimisation schedule.
    learning_rate=3e-4,
    num_train_epochs=3,
    fp16=True,
    # Logging cadence; no external experiment tracker.
    logging_steps=100,
    report_to="none",
)

topic_trainer = Trainer(
    args=topic_args,
    model=t5_model,
    tokenizer=t5_tokenizer,
    train_dataset=hf_topic_train,
    eval_dataset=hf_topic_val,
)

# Fine-tune T5 on the topic-generation task.
topic_trainer.train()


In [None]:
# Save the fine-tuned T5 weights and tokenizer files into the same directory.
t5_model.save_pretrained("dialogsum_topic_model")
t5_tokenizer.save_pretrained("dialogsum_topic_model")



In [None]:
!ls /kaggle/working/


In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Same directory that save_pretrained("dialogsum_model") wrote to, expressed
# relative to the working directory so it is not tied to /kaggle/working.
bart_model_dir = "./dialogsum_model"

bart_model = BartForConditionalGeneration.from_pretrained(bart_model_dir).to(device)
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_dir)

# Keep the model config's pad id in sync with the tokenizer and switch to
# inference mode.
bart_model.config.pad_token_id = bart_tokenizer.pad_token_id
bart_model.eval()


In [None]:
def generate_summary(dialogue):
    """Summarise a single dialogue string with the reloaded ``bart_model``.

    The input is first passed through ``clean_text`` so inference sees the
    same normalisation that was applied to the training dialogues.

    Args:
        dialogue: raw dialogue text.

    Returns:
        The decoded summary (special tokens removed) of the top beam.
    """
    encoded = bart_tokenizer(
        clean_text(dialogue),
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)

    # Unpack the full encoding so the attention mask travels with the ids.
    summary_ids = bart_model.generate(
        **encoded,
        max_length=60,
        min_length=20,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )

    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Smoke test of the reloaded model on a short example utterance.
print(generate_summary(
    "hi, mr. smith. i'm doctor hawkins. why are you here today?"
))

In [None]:
!pip install gradio pyngrok


In [None]:
import gradio as gr

def generate_topic(dialogue):
    """Generate a short topic string for a dialogue with the fine-tuned T5 model.

    Mirrors the training setup in ``tokenize_topic``: the dialogue is cleaned
    with ``clean_text``, prefixed with ``"generate topic: "`` and truncated to
    512 tokens; generation is capped at 16 tokens, the label length used in
    training.
    """
    prompt = "generate topic: " + clean_text(dialogue)
    encoded = t5_tokenizer(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    ).to(t5_model.device)

    # Make sure dropout is disabled before generating.
    t5_model.eval()
    topic_ids = t5_model.generate(
        **encoded,
        max_length=16,
        num_beams=4,
        early_stopping=True,
    )
    return t5_tokenizer.decode(topic_ids[0], skip_special_tokens=True)


def chat(dialogue):
    """Gradio callback: return ``(summary, topic)`` for the given dialogue."""
    summary = generate_summary(dialogue)
    topic = generate_topic(dialogue)
    return summary, topic


In [None]:
# One multi-line text input feeds `chat`; its two return values fill the
# summary and topic output boxes, in that order.
interface = gr.Interface(
    title="Text Summarization and Generation",
    description="Transformer-based dialogue summarization and topic generation",
    fn=chat,
    inputs=gr.Textbox(lines=6, label="Enter Dialogue"),
    outputs=[
        gr.Textbox(label="Generated Summary"),
        gr.Textbox(label="Generated Topic"),
    ],
)


In [None]:
# Stop any ngrok processes that are still running before launching the app.
!killall ngrok

In [None]:
# Start the Gradio app with sharing enabled on a fixed local port.
interface.launch(
    share=True,
    server_port=7861  # optional: omit to use Gradio's default port selection
)


In [None]:
!zip -r dialogsum_bart.zip /kaggle/working/dialogsum_bart
!zip -r dialogsum_t5_topic.zip /kaggle/working/dialogsum_t5_topic
