In [1]:
import pandas as pd

# Load the dataset that the rest of the notebook refers to as `arxiv_df`.
# The path is relative, so the CSV must sit in the notebook's working directory.
arxiv_df = pd.read_csv('filtered_data.csv')



In [6]:
# Show the column labels of the loaded frame to check which fields are available.
print(arxiv_df.columns)


Index(['title', 'abstract'], dtype='object')


In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from transformers import BartForConditionalGeneration, BartTokenizer, AdamW, get_scheduler
import torch.nn as nn

# Load datasets
# Both file names are relative paths, so the CSVs are expected in the working directory.
compscholar_df = pd.read_csv(r"Brain Dead CompScholar Dataset.csv")
pubmed_df = pd.read_csv('pubmed_articles.csv')


# Tokenizer for the "facebook/bart-large" checkpoint; kept at module level so that
# preprocess_text and the evaluation cell can share the same instance.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

# Preprocessing function
def preprocess_text(text):
    """Tokenize ``text`` with the module-level ``tokenizer``.

    The tokenizer is asked to pad to ``max_length``, truncate, cap the sequence
    at 512 tokens and return PyTorch tensors ("pt"). Whatever the tokenizer
    returns is handed back unchanged.
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    return tokenizer(text, **tokenizer_options)


In [4]:
# Report missing values per column and then drop incomplete rows from each frame.
# A bare `frame.isnull().sum()` expression is only rendered when it is the last
# expression of a cell, so the counts are printed explicitly here; otherwise the
# check would run but show nothing.
for frame_name, frame in (
    ("compscholar_df", compscholar_df),
    ("pubmed_df", pubmed_df),
    ("arxiv_df", arxiv_df),
):
    print(f"{frame_name}: null values per column before dropna")
    print(frame.isnull().sum())
    # In-place so the module-level names keep pointing at the cleaned frames.
    frame.dropna(inplace=True)

In [5]:
# Give the title column the 'Title' label that SummarizationDataset reads.
compscholar_df.rename({'Paper Title': 'Title'}, axis="columns", inplace=True)


In [6]:
# Capitalise the title column so it matches the 'Title' label SummarizationDataset reads.
arxiv_df.rename({'title': 'Title'}, axis="columns", inplace=True)


In [7]:
# Capitalise the abstract column so it matches the 'Abstract' label SummarizationDataset reads.
arxiv_df.rename({'abstract': 'Abstract'}, axis="columns", inplace=True)


In [None]:
class SummarizationDataset(Dataset):
    """In-memory dataset of tokenized (abstract, title) pairs.

    Every row of ``dataframe`` is tokenized eagerly in ``__init__`` through the
    module-level ``preprocess_text`` helper:

    * the ``'Abstract'`` column provides ``input_ids`` and ``attention_mask``;
    * the ``'Title'`` column, wrapped as ``"stok <title> etok"``, provides
      ``labels``.

    Args:
        dataframe: a pandas DataFrame with ``'Abstract'`` and ``'Title'`` columns.
    """

    def __init__(self, dataframe):
        # Tokenize each abstract exactly once and take both tensors from the same
        # encoding, so the tokenizer does not run twice per row.
        abstract_encodings = [preprocess_text(abstract) for abstract in dataframe['Abstract']]
        # squeeze(0) drops the leading batch dimension of the single-text encoding.
        self.input_ids = [encoding['input_ids'].squeeze(0) for encoding in abstract_encodings]
        self.attention_mask = [encoding['attention_mask'].squeeze(0) for encoding in abstract_encodings]
        self.labels = [
            preprocess_text(f"stok {title} etok")['input_ids'].squeeze(0)
            for title in dataframe['Title']
        ]

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the tensors of example ``idx`` as a dict keyed like the model's arguments."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Combine datasets
# Training pool: the arXiv and PubMed frames, tokenized and concatenated in that order.
train_dataset = ConcatDataset(
    [SummarizationDataset(frame) for frame in (arxiv_df, pubmed_df)]
)
# The CompScholar frame is kept apart and used as the test set.
test_dataset = SummarizationDataset(compscholar_df)


In [15]:

class HybridSummarizer(nn.Module):
    """Thin ``nn.Module`` wrapper around a BART conditional-generation model.

    The wrapped network is stored as ``self.model``; callers that need
    generation use ``wrapper.model.generate(...)``.

    Args:
        model_name: checkpoint identifier passed to
            ``BartForConditionalGeneration.from_pretrained``. Defaults to
            ``"facebook/bart-large"``.
    """

    def __init__(self, model_name="facebook/bart-large"):
        super().__init__()
        self.model = BartForConditionalGeneration.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output unchanged.

        Args:
            input_ids: token ids of the source text.
            attention_mask: mask matching ``input_ids``.
            labels: optional target token ids. They are forwarded as-is, so the
                wrapper can also be called without targets (e.g. for inference).
        """
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HybridSummarizer().to(device)

# Training Setup
NUM_EPOCHS = 3  # single source of truth for the loop and the scheduler length
optimizer = AdamW(model.parameters(), lr=2e-5)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)
num_training_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training Loop
# `from_pretrained` hands back the network in evaluation mode (dropout disabled),
# so training mode has to be switched on explicitly before fine-tuning.
model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # The targets are padded to a fixed length; replace padding ids with -100 so
        # the cross-entropy loss ignores them instead of being dominated by padding.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        running_loss += loss.item()
    # max(..., 1) keeps the report well-defined for an empty loader.
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} - mean training loss: {running_loss / max(len(train_loader), 1):.4f}")
# Save the model
# Only the weights are stored; the keys belong to the HybridSummarizer wrapper.
torch.save(model.state_dict(), "hybrid_summarizer.pth")
print("Model training complete. Saved as 'hybrid_summarizer.pth'.")

















In [None]:
from datasets import load_metric


# Evaluation mode: disables dropout while generating.
model.eval()
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package - confirm it exists in the installed version.
rouge = load_metric("rouge")

generated_summaries = []
reference_summaries = []

with torch.no_grad():
    for batch in test_loader:
        # The DataLoader's default collation already stacks the per-example tensors
        # into one batch tensor, so they are moved to the device directly
        # (torch.stack expects a sequence of tensors and rejects a single tensor).
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']

        outputs = model.model.generate(input_ids, attention_mask=attention_mask, max_length=150)
        # skip_special_tokens drops padding and the BOS/EOS markers from the decoded text.
        generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        reference_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

        generated_summaries.extend(generated_texts)
        reference_summaries.extend(reference_texts)

# Compute ROUGE Score
rouge_scores = rouge.compute(predictions=generated_summaries, references=reference_summaries)
print("ROUGE Scores:", rouge_scores)

In [None]:
# import torch
# from transformers import BartForConditionalGeneration, BartTokenizer

# # Load tokenizer and model
# tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

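# # Initialize model and load trained weights
# # NOTE: "hybrid_summarizer.pth" is saved from the HybridSummarizer wrapper, whose
# # network lives under the attribute `model`, so every state-dict key carries an extra
# # "model." prefix. Strip that prefix (or load into HybridSummarizer) before using
# # load_state_dict on a bare BartForConditionalGeneration.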
# model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
# model.load_state_dict(torch.load("hybrid_summarizer.pth", map_location=device))
# model.to(device)
# model.eval()


In [None]:
# def generate_summary(text, max_length=100):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
#     summary_ids = model.generate(**inputs, max_length=max_length, num_beams=5, early_stopping=True)
#     return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# # Sample input abstracts
# test_abstracts = [
#     "Deep learning has revolutionized various AI applications. This paper explores novel techniques in transformer architectures.",
#     "A new approach to drug discovery using AI and quantum computing is proposed, significantly reducing time and cost."
# ]

# # Generate and print summaries
# for abstract in test_abstracts:
#     print("Abstract:", abstract)
#     print("Generated Summary:", generate_summary(abstract))
#     print("-" * 50)


In [None]:
# from rouge_score import rouge_scorer

# # Reference (Ground Truth) Summaries
# reference_summaries = [
#     "Deep learning transforms AI with improved transformer models.",
#     "AI and quantum computing accelerate drug discovery."
# ]

# # Compute ROUGE Scores
# scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# for i, abstract in enumerate(test_abstracts):
#     generated_summary = generate_summary(abstract)
#     scores = scorer.score(reference_summaries[i], generated_summary)

#     print(f"Sample {i+1} ROUGE Scores:")
#     print("ROUGE-1:", scores['rouge1'].fmeasure)
#     print("ROUGE-2:", scores['rouge2'].fmeasure)
#     print("ROUGE-L:", scores['rougeL'].fmeasure)
#     print("-" * 50)


In [None]:
# from rouge_score import rouge_scorer
# from sacrebleu import corpus_bleu
# import nltk

# # Download NLTK tokenizer
# nltk.download('punkt')

# def compute_rouge(reference_summaries, generated_summaries):
#     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
#     scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

#     for ref, gen in zip(reference_summaries, generated_summaries):
#         score = scorer.score(ref, gen)
#         scores['rouge1'].append(score['rouge1'].fmeasure)
#         scores['rouge2'].append(score['rouge2'].fmeasure)
#         scores['rougeL'].append(score['rougeL'].fmeasure)

#     return {metric: sum(values) / len(values) for metric, values in scores.items()}

# def compute_bleu(reference_summaries, generated_summaries):
#     references = [[nltk.word_tokenize(ref)] for ref in reference_summaries]
#     hypotheses = [nltk.word_tokenize(gen) for gen in generated_summaries]
#     return corpus_bleu(hypotheses, references).score


In [None]:
# def generate_summaries(model, test_loader, tokenizer):
#     model.eval()
#     generated_summaries = []
#     reference_summaries = []

#     with torch.no_grad():
#         for batch in test_loader:
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels']

#             outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=150)
#             generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
#             reference_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

#             generated_summaries.extend(generated_texts)
#             reference_summaries.extend(reference_texts)

#     return reference_summaries, generated_summaries


In [None]:
# reference_summaries, generated_summaries = generate_summaries(model, test_loader, tokenizer)

# # Compute ROUGE Scores
# rouge_scores = compute_rouge(reference_summaries, generated_summaries)
# print("ROUGE Scores:", rouge_scores)

# # Compute BLEU Score
# bleu_score = compute_bleu(reference_summaries, generated_summaries)
# print("BLEU Score:", bleu_score)
