In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount('/content/gdrive')
# Drive root as a relative path.
# NOTE(review): no later cell visible here reads `root_path`; they hard-code "gdrive/My Drive/..." instead.
root_path = 'gdrive/My Drive/'

Mounted at /content/gdrive


In [None]:
!pip install snorkel pandas numpy sentence-transformers transformers textattack yake scikit-learn


Collecting snorkel
  Downloading snorkel-0.10.0-py3-none-any.whl.metadata (9.5 kB)
Collecting textattack
  Downloading textattack-0.3.10-py3-none-any.whl.metadata (38 kB)
Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Collecting bert-score>=0.3.5 (from textattack)
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting flair (from textattack)
  Downloading flair-0.15.0-py3-none-any.whl.metadata (12 kB)
Collecting language-tool-python (from textattack)
  Downloading language_tool_python-2.8.2-py3-none-any.whl.metadata (14 kB)
Collecting lemminflect (from textattack)
  Downloading lemminflect-0.2.3-py3-none-any.whl.metadata (7.0 kB)
Collecting lru-dict (from textattack)
  Downloading lru_dict-1.3.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting da

In [None]:
import re
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from snorkel.labeling import LabelingFunction, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load labeled dataset.
# header=0 makes pandas consume the file's own header line and replace it with
# `names`; with `names=` alone that line is parsed as a training sample whose
# label is the literal string "label", which adds a bogus class to the classifier.
labeled_df = pd.read_csv(
    "gdrive/My Drive/PerAnsSumm_Combined.csv",
    names=["uri", "text", "label"],
    header=0,
)
# Rows without text or label can neither be embedded nor used as targets.
labeled_df = labeled_df.dropna(subset=["text", "label"]).reset_index(drop=True)
labeled_df["text"] = labeled_df["text"].astype(str)

# Define perspective categories
PERSPECTIVES = ["EXPERIENCE", "INFORMATION", "CAUSE", "SUGGESTION", "QUESTION"]

# Load a pre-trained sentence embedding model
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Convert dataset text to embeddings (one vector per labeled row)
X_embeddings = embedding_model.encode(labeled_df["text"].tolist(), convert_to_numpy=True)

# Train a Logistic Regression classifier (better than SVM for probability outputs)
logreg_classifier = LogisticRegression(max_iter=1000, solver="liblinear")
logreg_classifier.fit(X_embeddings, labeled_df["label"])

# TF-IDF vocabulary (50 terms, English stop words removed) used for keyword selection.
# Rows of tfidf_matrix are in the same order as the rows of labeled_df.
vectorizer = TfidfVectorizer(max_features=50, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(labeled_df["text"])
feature_names = vectorizer.get_feature_names_out()

def create_lf(label, keywords):
    """Create a Snorkel labeling function for keyword-based weak supervision.

    Args:
        label: Perspective to vote for, given either as its name (an entry of
            ``PERSPECTIVES``) or as its integer index in that list.
        keywords: Iterable of literal keywords. A whole-word, case-insensitive
            match of any keyword makes the function vote for ``label``.

    Returns:
        A ``LabelingFunction`` named ``lf_<label>`` that returns the integer
        class index on a match and ``-1`` (abstain) otherwise.
    """
    # The label matrix must hold integer class ids (downstream code indexes
    # PERSPECTIVES with the prediction), so resolve names to indices here.
    label_id = PERSPECTIVES.index(label) if isinstance(label, str) else int(label)

    keywords = [kw for kw in keywords if kw]
    # An empty alternation would match at every word boundary and label
    # everything, so an LF without keywords always abstains instead.
    matcher = (
        re.compile(r"\b(" + "|".join(map(re.escape, keywords)) + r")\b", re.IGNORECASE)
        if keywords
        else None
    )

    def lf(x):
        # PandasLFApplier hands over the whole DataFrame row, not the sentence,
        # so read its "text" field; plain strings/bytes are accepted as well.
        text = x if isinstance(x, (str, bytes)) else getattr(x, "text", "")
        if isinstance(text, bytes):
            text = text.decode("utf-8", errors="ignore")
        if matcher is None or not isinstance(text, str):
            return -1
        return label_id if matcher.search(text) else -1

    return LabelingFunction(name=f"lf_{label}", f=lf)

# Generate keyword-based patterns for Snorkel
regex_patterns = {}

for label in PERSPECTIVES:
    # Positional indices of this class's rows; tfidf_matrix rows follow labeled_df order.
    class_rows = np.flatnonzero((labeled_df["label"] == label).to_numpy())
    if len(class_rows) > 0:
        # Rank the vocabulary by summed TF-IDF weight inside the class and keep
        # the ten strongest terms that actually occur in it. Scanning raw
        # whitespace tokens instead returns duplicates and misses capitalised
        # or punctuated words.
        class_scores = np.asarray(tfidf_matrix[class_rows].sum(axis=0)).ravel()
        top_terms = class_scores.argsort()[::-1][:10]
        keywords = [str(feature_names[i]) for i in top_terms if class_scores[i] > 0]
        regex_patterns[label] = keywords

# Create Snorkel labeling functions (one per perspective that has samples)
labeling_functions = [create_lf(label, regex_patterns[label]) for label in regex_patterns]

# Apply Snorkel weak supervision
applier = PandasLFApplier(lfs=labeling_functions)
L_train = applier.apply(df=labeled_df)

# Train Snorkel Label Model
label_model = LabelModel(cardinality=len(PERSPECTIVES), verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=100)

# Zero-shot classification model, used as the last-resort fallback
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def zero_shot_classify(sentences):
    """Classify sentences with the zero-shot pipeline and return their top labels.

    Args:
        sentences: A list of sentences, or a single sentence string (treated
            as a one-element batch).

    Returns:
        A list with the highest-ranked entry of ``PERSPECTIVES`` for each
        input sentence, in input order. An empty input yields ``[]`` without
        calling the model.
    """
    if isinstance(sentences, str):
        sentences = [sentences]
    sentences = list(sentences)
    if not sentences:
        # Nothing to classify; skip the model call entirely.
        return []

    results = classifier(sentences, PERSPECTIVES, multi_label=False)
    # Normalise a single-result dict to a one-element list so the return shape
    # is always a list.
    if isinstance(results, dict):
        results = [results]
    return [res["labels"][0] for res in results]

# Load unlabeled dataset
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Process dataset and classify answers
results = []

for entry in data:
    uri = entry["uri"]
    categorized_spans = {perspective: [] for perspective in PERSPECTIVES}

    for ans in entry["answers"]:
        # Split into sentences; drop empty fragments so a blank answer does not
        # end up as an empty span in the output.
        sentences = [s for s in re.split(r"(?<=[.!?])\s+", ans.strip()) if s]
        if not sentences:
            continue

        # Apply Snorkel labeling (progress bar off: one bar per answer floods the output)
        L_test = applier.apply(pd.DataFrame({"text": sentences}), progress_bar=False)
        weak_labels = label_model.predict(L=L_test)

        # Apply Logistic Regression using sentence embeddings
        X_test_embeddings = embedding_model.encode(sentences, convert_to_numpy=True)
        logreg_preds = logreg_classifier.predict(X_test_embeddings)

        # Final classification logic
        for i, sentence in enumerate(sentences):
            weak_label = weak_labels[i]
            if weak_label != -1:
                final_label = PERSPECTIVES[weak_label]  # Use Snorkel if available
            else:
                final_label = str(logreg_preds[i])  # Otherwise, use Logistic Regression

            # Last resort: the prediction is not a known perspective
            if final_label not in PERSPECTIVES:
                final_label = zero_shot_classify([sentence])[0]

            categorized_spans[final_label].append(sentence)

    results.append({"uri": uri, "spans": categorized_spans})

# Save results to JSON file
output_path = "gdrive/My Drive/classified_perspectives_improvedLast.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

# Report the file that was actually written.
print(f"Classification completed! Results saved to {output_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 14513/14513 [00:00<00:00, 87749.53it/s]
100%|██████████| 500/500 [00:00<00:00, 711.18epoch/s]


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
100%|██████████| 6/6 [00:00<00:00, 4688.99it/s]
100%|██████████| 4/4 [00:00<00:00, 1796.08it/s]
100%|██████████| 2/2 [00:00<00:00, 1244.23it/s]
100%|██████████| 14/14 [00:00<00:00, 3759.06it/s]
100%|██████████| 13/13 [00:00<00:00, 7119.20it/s]
100%|██████████| 4/4 [00:00<00:00, 5438.32it/s]
100%|██████████| 3/3 [00:00<00:00, 3973.13it/s]
100%|██████████| 1/1 [00:00<00:00, 1239.82it/s]
100%|██████████| 4/4 [00:00<00:00, 3294.82it/s]
100%|██████████| 8/8 [00:00<00:00, 11602.50it/s]
100%|██████████| 8/8 [00:00<00:00, 6915.59it/s]
100%|██████████| 9/9 [00:00<00:00, 8784.90it/s]
100%|██████████| 1/1 [00:00<00:00, 1228.56it/s]
100%|██████████| 3/3 [00:00<00:00, 4275.54it/s]
100%|██████████| 3/3 [00:00<00:00, 1510.55it/s]
100%|██████████| 1/1 [00:00<00:00, 675.96it/s]
100%|██████████| 9/9 [00:00<00:00, 7965.55it/s]
100%|██████████| 3/3 [00:00<00:00, 1130.34it/s]
100%|██████████| 9/9 [00:00<00:00, 11938.25it/s]
100%|██████████| 2/2 [00:00<00:00, 1495.03it/s]
100%|████████

✅ Classification completed! Results saved to classified_perspectives_improved.json


In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load BART and Pegasus models
# Hugging Face Hub checkpoint ids for the two summarizers used by this cell.
bart_model_name = "facebook/bart-large-cnn"
pegasus_model_name = "google/pegasus-xsum"

# Initialize tokenizers and models
# Each model gets the tokenizer of its own checkpoint; these four module-level
# names are read by generate_extractive_summary() and refine_with_pegasus().
bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_model_name)

pegasus_tokenizer = AutoTokenizer.from_pretrained(pegasus_model_name)
pegasus_model = AutoModelForSeq2SeqLM.from_pretrained(pegasus_model_name)

# Function to generate an extractive summary using BART
def generate_extractive_summary(text_list):
    """Summarize the spans of one perspective with the BART checkpoint.

    Args:
        text_list: List of span strings; they are joined with single spaces.

    Returns:
        The decoded summary string, or ``""`` when there is no text to
        summarize (empty list or only whitespace).

    Note:
        Despite the name, the summary is generated by a seq2seq model
        (``bart_model.generate``), not selected from the input sentences.
    """
    if not text_list:
        return ""

    full_text = " ".join(text_list).strip()
    if not full_text:
        # Only blank spans: nothing to summarize, skip the model call.
        return ""

    # No "summarize: " prefix: that is a T5 convention; for this BART checkpoint
    # the prefix would simply become part of the text being summarized.
    inputs = bart_tokenizer(full_text, return_tensors="pt", truncation=True, max_length=1024)

    summary_ids = bart_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True
    )

    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to refine summary using Pegasus (Abstractive Summarization)
def refine_with_pegasus(summary_text):
    """Rewrite a first-pass summary with the Pegasus checkpoint.

    Args:
        summary_text: Summary string produced by the first stage.

    Returns:
        The decoded Pegasus output, or ``""`` when ``summary_text`` is empty,
        ``None`` or only whitespace.
    """
    if not summary_text or not summary_text.strip():
        return ""

    # No "summarize: " prefix: that is a T5 convention; here it would only be
    # treated as part of the input text.
    inputs = pegasus_tokenizer(summary_text.strip(), return_tensors="pt", truncation=True, max_length=512)

    summary_ids = pegasus_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=100, min_length=30, length_penalty=1.8, num_beams=6, early_stopping=True
    )

    return pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Read the raw test set and the spans classified by the earlier cell
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    data_file = json.load(f)

with open("gdrive/My Drive/classified_perspectives_improvedLast.json", "r", encoding="utf-8") as f:
    classified_perspectives = json.load(f)

# Index the raw entries by uri so question/context can be looked up per result
data_lookup = dict((raw["uri"], raw) for raw in data_file)

# Two-stage summarization per entry: BART first, then Pegasus on BART's output
output_data = []
for entry in classified_perspectives:
    uri = entry["uri"]
    spans = entry["spans"]

    # Entries that have no counterpart in the raw test set are left out
    if uri not in data_lookup:
        continue

    raw_entry = data_lookup[uri]
    question = raw_entry["question"]
    context = raw_entry["context"]

    # Stage 1: one BART summary per perspective
    summaries = {}
    for category, texts in spans.items():
        summaries[category] = generate_extractive_summary(texts)

    # Stage 2: Pegasus pass over every stage-1 summary
    refined_summaries = {}
    for category, summary in summaries.items():
        refined_summaries[category] = refine_with_pegasus(summary)

    output_data.append({
        "uri": uri,
        "question": question,
        "context": context,
        "spans": spans,
        "extractive_summaries": summaries,
        "refined_summaries": refined_summaries
    })

# Persist the combined records
output_path = "gdrive/My Drive/final_hybrid_summarized_output.json"
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, indent=4)

print(f"✅ Hybrid summarization (BART + Pegasus) completed! Output saved to {output_path}")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Hybrid summarization (BART + Pegasus) completed! Output saved to gdrive/My Drive/final_hybrid_summarized_output.json


In [None]:
import pandas as pd

# Load CSV file correctly: header=0 marks the file's first line as a header row
# that `names` replaces, so that line is not read as a data row (the other
# read_csv calls on this file in this notebook omit header=0).
labeled_df = pd.read_csv("gdrive/My Drive/PerAnsSumm_Combined.csv", names=["uri", "text", "label"], header=0)

# Display first few rows
labeled_df.head()


Unnamed: 0,uri,text,label
0,4367393,Parkinson's disease is one of the most common ...,INFORMATION
1,4367393,Parkinsonism describes the common symptoms of ...,INFORMATION
2,1504599,duck tape,SUGGESTION
3,1504599,e tell your husband to record you one day and ...,SUGGESTION
4,1504599,shove a sock in your mout,SUGGESTION


In [None]:
#Final Classification
import re
import json
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from snorkel.labeling import LabelingFunction, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Load labeled dataset.
# header=0 makes pandas consume the file's own header line and replace it with
# `names`; with `names=` alone that line is parsed as a training sample whose
# label is the literal string "label", which adds a bogus class to the SVM.
labeled_df = pd.read_csv(
    "gdrive/My Drive/PerAnsSumm_Combined.csv",
    names=["uri", "text", "label"],
    header=0,
)
# Rows without text or label can neither be embedded nor used as targets.
labeled_df = labeled_df.dropna(subset=["text", "label"]).reset_index(drop=True)
labeled_df["text"] = labeled_df["text"].astype(str)

# Define perspective categories
PERSPECTIVES = ["EXPERIENCE", "INFORMATION", "CAUSE", "SUGGESTION", "QUESTION"]

# Load a pre-trained sentence embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert dataset text to embeddings (one vector per labeled row)
X_embeddings = embedding_model.encode(labeled_df["text"].tolist(), convert_to_numpy=True)

# Train an SVM classifier using sentence embeddings
svm_classifier = SVC(kernel="linear", probability=True)
svm_classifier.fit(X_embeddings, labeled_df["label"])

# Snorkel Labeling Functions
def create_lf(label, pattern):
    """Create a Snorkel labeling function for regex-based weak supervision.

    Args:
        label: Perspective to vote for, given either as its name (an entry of
            ``PERSPECTIVES``) or as its integer index in that list.
        pattern: Regular expression searched case-insensitively in the text.

    Returns:
        A ``LabelingFunction`` named ``lf_<label>`` that returns the integer
        class index when ``pattern`` matches and ``-1`` (abstain) otherwise.
    """
    # The label matrix must hold integer class ids (downstream code indexes
    # PERSPECTIVES with the prediction), so resolve names to indices here.
    label_id = PERSPECTIVES.index(label) if isinstance(label, str) else int(label)

    # An empty pattern matches every string; treat it as "no rule" instead.
    matcher = re.compile(pattern, re.IGNORECASE) if pattern else None

    def lf(x):
        # PandasLFApplier hands over the whole DataFrame row, not the sentence,
        # so read its "text" field; plain strings/bytes are accepted as well.
        text = x if isinstance(x, (str, bytes)) else getattr(x, "text", "")
        if isinstance(text, bytes):
            text = text.decode("utf-8", errors="ignore")
        if matcher is None or not isinstance(text, str):
            return -1
        return label_id if matcher.search(text) else -1

    return LabelingFunction(name=f"lf_{label}", f=lf)

# Generate regex patterns based on dataset
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

regex_patterns = {}

for label in PERSPECTIVES:
    class_samples = labeled_df[labeled_df["label"] == label]["text"]
    if len(class_samples) > 0:
        # Count content words over the whole class. Taking the first ten
        # tokens of the concatenated text only reflects the first sample and
        # is dominated by stop words.
        class_text = " ".join(class_samples.astype(str)).lower()
        word_counts = Counter(
            word
            for word in re.findall(r"[a-z]+(?:'[a-z]+)?", class_text)
            if len(word) > 2 and word not in ENGLISH_STOP_WORDS
        )
        common_words = [word for word, _ in word_counts.most_common(10)]
        # Join with a bare "|" inside word boundaries: " | " would put literal
        # spaces into every alternative. "(?!)" never matches and stands in
        # for "no usable words".
        regex_patterns[label] = (
            r"\b(?:" + "|".join(map(re.escape, common_words)) + r")\b"
            if common_words
            else r"(?!)"
        )

# Create Snorkel labeling functions
labeling_functions = [create_lf(label, regex_patterns[label]) for label in regex_patterns]

# Apply Snorkel weak supervision
applier = PandasLFApplier(lfs=labeling_functions)
L_train = applier.apply(df=labeled_df)

# Train Snorkel Label Model
label_model = LabelModel(cardinality=len(PERSPECTIVES), verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=100)

# Zero-shot classification model, used as the last-resort fallback
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def zero_shot_classify(text):
    """Return the top-ranked perspective the zero-shot pipeline assigns to `text`."""
    # The pipeline result lists candidate labels best-first under "labels".
    ranked_labels = classifier(text, PERSPECTIVES)["labels"]
    return ranked_labels[0]

# Load unlabeled dataset
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Process dataset and classify answers
results = []

for entry in data:
    uri = entry["uri"]
    categorized_spans = {perspective: [] for perspective in PERSPECTIVES}

    for ans in entry["answers"]:
        # Split into sentences; drop empty fragments so a blank answer does not
        # end up as an empty span in the output.
        sentences = [s for s in re.split(r"(?<=[.!?])\s+", ans.strip()) if s]
        if not sentences:
            continue

        # Apply Snorkel labeling (progress bar off: one bar per answer floods the output)
        L_test = applier.apply(pd.DataFrame({"text": sentences}), progress_bar=False)
        weak_labels = label_model.predict(L=L_test)

        # Apply SVM for refined classification using sentence embeddings
        X_test_embeddings = embedding_model.encode(sentences, convert_to_numpy=True)
        svm_preds = svm_classifier.predict(X_test_embeddings)

        for i, sentence in enumerate(sentences):
            weak_label = weak_labels[i]
            if weak_label != -1:
                final_label = PERSPECTIVES[weak_label]  # Use Snorkel if available
            else:
                final_label = str(svm_preds[i])  # Otherwise, use SVM

            # Use Zero-Shot if the prediction is not a known perspective
            if final_label not in PERSPECTIVES:
                final_label = zero_shot_classify(sentence)

            categorized_spans[final_label].append(sentence)

    results.append({"uri": uri, "spans": categorized_spans})

# Save results to JSON file
output_path = "gdrive/My Drive/classified_perspectives_improvedSVM.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

# Report the file that was actually written.
print(f"✅ Classification completed! Results saved to {output_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 14513/14513 [00:00<00:00, 51259.89it/s]
100%|██████████| 500/500 [00:00<00:00, 581.46epoch/s]


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
100%|██████████| 6/6 [00:00<00:00, 6221.46it/s]
100%|██████████| 4/4 [00:00<00:00, 2318.90it/s]
100%|██████████| 2/2 [00:00<00:00, 2152.58it/s]
100%|██████████| 14/14 [00:00<00:00, 11264.20it/s]
100%|██████████| 13/13 [00:00<00:00, 11498.51it/s]
100%|██████████| 4/4 [00:00<00:00, 5959.93it/s]
100%|██████████| 3/3 [00:00<00:00, 1955.69it/s]
100%|██████████| 1/1 [00:00<00:00, 704.45it/s]
100%|██████████| 4/4 [00:00<00:00, 2643.75it/s]
100%|██████████| 8/8 [00:00<00:00, 7972.07it/s]
100%|██████████| 8/8 [00:00<00:00, 8093.21it/s]
100%|██████████| 9/9 [00:00<00:00, 8882.06it/s]
100%|██████████| 1/1 [00:00<00:00, 1267.93it/s]
100%|██████████| 3/3 [00:00<00:00, 3129.30it/s]
100%|██████████| 3/3 [00:00<00:00, 1954.48it/s]
100%|██████████| 1/1 [00:00<00:00, 1023.00it/s]
100%|██████████| 9/9 [00:00<00:00, 9348.37it/s]
100%|██████████| 3/3 [00:00<00:00, 3271.69it/s]
100%|██████████| 9/9 [00:00<00:00, 4742.30it/s]
100%|██████████| 2/2 [00:00<00:00, 573.15it/s]
100%|█████████

✅ Classification completed! Results saved to classified_perspectives_improved.json


In [None]:
#new:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load BART extractive summarization model
# `tokenizer` and `model` are module-level names read by generate_extractive_summary() below.
# NOTE(review): the checkpoint is loaded as a seq2seq LM, i.e. it generates text rather than
# selecting input sentences — confirm "extractive" is the intended wording.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Function to determine dynamic summarization length
def get_dynamic_lengths(text, max_cap=200, min_floor=30, short_text_max=10):
    """Derive summary length bounds from the word count of ``text``.

    The upper bound is half the word count, capped at ``max_cap``; the lower
    bound is a fifth of the word count, but at least ``min_floor``. For inputs
    of 60 to 1000 words this gives exactly the plain formula. Outside that
    range the plain formula yields ``min_length > max_length`` (or a zero
    ``max_length``), so the bounds are reconciled.

    Args:
        text: Input text; words are counted by whitespace splitting.
        max_cap: Largest allowed ``max_length``.
        min_floor: Preferred smallest ``min_length``.
        short_text_max: Smallest ``max_length`` ever returned, used for very
            short or empty inputs.

    Returns:
        ``(max_length, min_length)`` with ``1 <= min_length <= max_length``.
    """
    words = len(text.split())
    proportional_min = int(words * 0.2)  # 20% of the original

    max_length = max(short_text_max, min(max_cap, int(words * 0.5)))  # up to 50% of the original
    min_length = max(min_floor, proportional_min)

    if min_length > max_length:
        # The floor (short text) or the 20% ratio (very long text) cannot be
        # honoured; fall back to the ratio, clipped into [1, max_length].
        min_length = max(1, min(proportional_min, max_length))

    return max_length, min_length

# Function to generate extractive summary with dynamic length control
def generate_extractive_summary(texts):
    """Summarize the spans of one perspective with length bounds scaled to the input.

    Args:
        texts: List of span strings; they are joined with single spaces.

    Returns:
        The decoded summary string, or ``""`` when there is no text to
        summarize (empty list or only whitespace).
    """
    if not texts:
        return ""

    full_text = " ".join(texts).strip()
    if not full_text:
        # Only blank spans: nothing to summarize, skip the model call.
        return ""

    max_length, min_length = get_dynamic_lengths(full_text)
    # generate() needs 0 < min_length <= max_length; enforce it here as well so
    # short inputs cannot produce contradictory bounds.
    max_length = max(1, max_length)
    min_length = max(0, min(min_length, max_length))

    # No "summarize: " prefix (a T5 convention that would just be summarized as
    # text) and no padding="max_length": padding a single sequence to 1024
    # tokens only makes every call process a full-length input.
    inputs = tokenizer(
        full_text,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    )

    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=1.2,  # NOTE(review): values > 0 favour longer beams per the HF generation docs — confirm intent
        num_beams=6,  # Wider beam search
        repetition_penalty=1.1,  # Reduce redundancy
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Read the raw test set and the SVM-classified spans
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    data_file = json.load(f)

with open("gdrive/My Drive/classified_perspectives_improvedSVM.json", "r", encoding="utf-8") as f:
    classified_perspectives = json.load(f)

# Index the raw entries by uri for constant-time lookup
data_lookup = dict((raw["uri"], raw) for raw in data_file)

# Build one output record per classified entry
output_data = []
for classified in classified_perspectives:
    uri = classified["uri"]
    spans = classified.get("spans", {})

    # Missing uris fall back to an empty record, hence empty question/context
    source_entry = data_lookup.get(uri, {})

    # One summary per perspective
    summaries = {}
    for category, texts in spans.items():
        summaries[category] = generate_extractive_summary(texts)

    output_data.append({
        "uri": uri,
        "question": source_entry.get("question", ""),
        "context": source_entry.get("context", ""),
        "spans": spans,
        "summaries": summaries
    })

# Persist the records
output_path = "gdrive/My Drive/final_extractive_summarized_SVM.json"
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, indent=4)

print(f"✅ Extractive summarization completed! Output saved to {output_path}")


In [None]:
#Second Version SVM
import re
import json
import numpy as np
import pandas as pd
import time
from enum import Enum
from collections import Counter
from sklearn.svm import SVC
from snorkel.labeling import LabelingFunction, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Enum for Perspective Labels
class Perspective(Enum):
    """Perspective categories a sentence can be assigned to.

    Each member's value is the label string itself; the module-level
    ``PERSPECTIVES`` list is built from these values in definition order.
    """
    EXPERIENCE = "EXPERIENCE"
    INFORMATION = "INFORMATION"
    CAUSE = "CAUSE"
    SUGGESTION = "SUGGESTION"
    QUESTION = "QUESTION"

# Label strings in enum definition order
PERSPECTIVES = [p.value for p in Perspective]

# Load labeled dataset.
# header=0 makes pandas consume the file's own header line and replace it with
# `names`; with `names=` alone that line is parsed as a training sample whose
# label is the literal string "label", which adds a bogus class to the SVM.
labeled_df = pd.read_csv(
    "gdrive/My Drive/PerAnsSumm_Combined.csv",
    names=["uri", "text", "label"],
    header=0,
)
# Rows without text or label can neither be embedded nor used as targets.
labeled_df = labeled_df.dropna(subset=["text", "label"]).reset_index(drop=True)
labeled_df["text"] = labeled_df["text"].astype(str)

# Load Sentence Embedding Model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode text (one vector per labeled row)
X_embeddings = embedding_model.encode(labeled_df["text"].tolist(), convert_to_numpy=True)

# Train SVM Classifier
svm_classifier = SVC(kernel="linear", probability=True)
svm_classifier.fit(X_embeddings, labeled_df["label"])

# Snorkel Labeling Functions
def create_lf(label, pattern):
    """Create a Snorkel labeling function for regex-based weak supervision.

    Args:
        label: Perspective to vote for, given either as its name (an entry of
            ``PERSPECTIVES``) or as its integer index in that list.
        pattern: Regular expression searched case-insensitively in the text.

    Returns:
        A ``LabelingFunction`` named ``lf_<label>`` that returns the integer
        class index when ``pattern`` matches and ``-1`` (abstain) otherwise.
    """
    # The label matrix must hold integer class ids, so resolve names to indices here.
    label_id = PERSPECTIVES.index(label) if isinstance(label, str) else int(label)

    # An empty pattern matches every string; treat it as "no rule" instead.
    matcher = re.compile(pattern, re.IGNORECASE) if pattern else None

    def lf(x):
        # PandasLFApplier hands over the whole DataFrame row, not the sentence,
        # so read its "text" field; plain strings/bytes are accepted as well.
        text = x if isinstance(x, (str, bytes)) else getattr(x, "text", "")
        if isinstance(text, bytes):
            text = text.decode("utf-8", errors="ignore")
        if matcher is None or not isinstance(text, str):
            return -1
        return label_id if matcher.search(text) else -1

    return LabelingFunction(name=f"lf_{label}", f=lf)

# Precompute Regex Patterns
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

regex_patterns = {}
for label in PERSPECTIVES:
    # Count content words over the whole class. Taking the first ten tokens of
    # the concatenated text only reflects the first sample and is dominated by
    # stop words.
    class_text = " ".join(labeled_df.loc[labeled_df["label"] == label, "text"].astype(str)).lower()
    word_counts = Counter(
        word
        for word in re.findall(r"[a-z]+(?:'[a-z]+)?", class_text)
        if len(word) > 2 and word not in ENGLISH_STOP_WORDS
    )
    common_words = [word for word, _ in word_counts.most_common(10)]
    # Join with a bare "|" inside word boundaries: " | " would put literal
    # spaces into every alternative. A class without usable words gets "(?!)",
    # which never matches (an empty pattern would match every sentence).
    regex_patterns[label] = (
        r"\b(?:" + "|".join(map(re.escape, common_words)) + r")\b"
        if common_words
        else r"(?!)"
    )

# Create Snorkel Labeling Functions
labeling_functions = [create_lf(label, regex_patterns[label]) for label in regex_patterns]

# Apply Snorkel
applier = PandasLFApplier(lfs=labeling_functions)
L_train = applier.apply(df=labeled_df)

# Train Snorkel Label Model
label_model = LabelModel(cardinality=len(PERSPECTIVES), verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=100)

# Zero-Shot Classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def zero_shot_classify(text):
    """Run the zero-shot pipeline on `text` and return its best-scoring perspective."""
    prediction = classifier(text, PERSPECTIVES)
    best_label, *_ = prediction["labels"]  # labels arrive ranked best-first
    return best_label

# Load Unlabeled Dataset
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Processing Entries
start_time = time.time()
results = []

for entry in data:
    uri = entry["uri"]
    categorized_spans = {perspective: [] for perspective in PERSPECTIVES}

    # Extract Sentences (empty fragments from blank answers are dropped)
    sentences_list = [
        sentence
        for ans in entry["answers"]
        for sentence in re.split(r"(?<=[.!?])\s+", ans.strip())
        if sentence
    ]

    if sentences_list:
        # Apply Snorkel Labeling (progress bar off: one bar per entry floods the output)
        L_test = applier.apply(pd.DataFrame({"text": sentences_list}), progress_bar=False)
        weak_labels = label_model.predict(L=L_test)

        # Apply SVM for refined classification
        X_test_embeddings = embedding_model.encode(sentences_list, convert_to_numpy=True)
        svm_preds = svm_classifier.predict(X_test_embeddings)

        for i, sentence in enumerate(sentences_list):
            # Convert both votes to perspective names before counting. Snorkel
            # predicts an integer id (-1 = abstain) and the SVM a string, so
            # counting the raw values can never produce agreement.
            weak_label = PERSPECTIVES[weak_labels[i]] if weak_labels[i] != -1 else None
            svm_label = str(svm_preds[i])
            if svm_label not in PERSPECTIVES:
                svm_label = None

            # SVM vote first: Counter.most_common breaks ties by insertion order,
            # so a three-way disagreement resolves to the supervised model.
            votes = [vote for vote in (svm_label, weak_label) if vote is not None]

            if len(set(votes)) == 1:
                # Single valid vote, or both models agree
                final_label = votes[0]
            else:
                # No valid vote or a disagreement: zero-shot casts the deciding vote
                votes.append(zero_shot_classify(sentence))
                final_label = Counter(votes).most_common(1)[0][0]

            categorized_spans[final_label].append(sentence)

    results.append({"uri": uri, "spans": categorized_spans})

# Save Results
with open("gdrive/My Drive/classified_perspectives_improvedSVM2b.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

print(f"✅ Classification completed in {time.time() - start_time:.2f} seconds! Results saved.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 14513/14513 [00:00<00:00, 74475.02it/s]
100%|██████████| 500/500 [00:00<00:00, 538.16epoch/s]


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
100%|██████████| 26/26 [00:00<00:00, 4380.83it/s]
100%|██████████| 20/20 [00:00<00:00, 7446.61it/s]
100%|██████████| 13/13 [00:00<00:00, 6232.25it/s]
100%|██████████| 24/24 [00:00<00:00, 8599.29it/s]
100%|██████████| 22/22 [00:00<00:00, 5297.97it/s]
100%|██████████| 28/28 [00:00<00:00, 21213.97it/s]
100%|██████████| 22/22 [00:00<00:00, 13248.34it/s]
100%|██████████| 21/21 [00:00<00:00, 6949.14it/s]
100%|██████████| 48/48 [00:00<00:00, 26900.93it/s]
100%|██████████| 20/20 [00:00<00:00, 14428.29it/s]
100%|██████████| 27/27 [00:00<00:00, 8033.93it/s]
100%|██████████| 23/23 [00:00<00:00, 15945.29it/s]
100%|██████████| 27/27 [00:00<00:00, 14442.83it/s]
100%|██████████| 15/15 [00:00<00:00, 6799.37it/s]
100%|██████████| 21/21 [00:00<00:00, 8127.75it/s]
100%|██████████| 10/10 [00:00<00:00, 5113.76it/s]
100%|██████████| 12/12 [00:00<00:00, 5313.73it/s]
100%|██████████| 36/36 [00:00<00:00, 16455.42it/s]
100%|██████████| 15/15 [00:00<00:00, 6818.53it/s]
100%|██████████| 29/2

✅ Classification completed in 5180.46 seconds! Results saved.


In [None]:
import json
import csv

def extract_perspective_data(json_file, csv_file):
    """
    Flatten a perspective-annotated JSON file into a three-column CSV.

    For every item the span texts of all perspectives are merged into a single
    context string and all summaries into a single summary string.

    :param json_file: Path to the JSON input file
    :param csv_file: Path to the output CSV file
    """
    with open(json_file, "r", encoding="utf-8") as source:
        records = json.load(source)

    rows = []
    for record in records:
        span_groups = record.get("labelled_answer_spans", {}).values()
        # Span texts of every perspective, in file order, joined by spaces
        merged_context = " ".join(span["txt"] for group in span_groups for span in group)
        # All summaries of the item, joined by spaces
        merged_summary = " ".join(record.get("labelled_summaries", {}).values())
        rows.append([record.get("uri", ""), merged_context, merged_summary])

    # Header row first, then one row per item
    with open(csv_file, "w", newline="", encoding="utf-8") as sink:
        csv.writer(sink).writerows([["URI", "Context", "Summary"], *rows])

    print(f"CSV file saved: {csv_file}")

# Example usage: flatten the training JSON into PerAnsSummtrainOutput.csv.
# NOTE(review): a later cell in this notebook writes the same output path with a different column layout.
extract_perspective_data("gdrive/My Drive/PerAnsSummtrain.json", "gdrive/My Drive/PerAnsSummtrainOutput.csv")


CSV file saved: gdrive/My Drive/PerAnsSummtrainOutput.csv


In [None]:
import json
import csv

def extract_perspective_data(json_file, csv_file):
    """
    Flatten a perspective-annotated JSON file into one CSV row per
    (item, perspective) pair.

    Each row holds the item's URI, the perspective name, that perspective's
    span texts joined by spaces, and the summary stored under
    "<perspective>_SUMMARY" (empty when there is none).

    :param json_file: Path to the JSON input file
    :param csv_file: Path to the output CSV file
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    # Start with the header row; data rows are appended behind it
    rows = [["URI", "Perspective", "Context", "Summary"]]
    for record in records:
        record_uri = record.get("uri", "")
        spans_by_perspective = record.get("labelled_answer_spans", {})
        summaries = record.get("labelled_summaries", {})

        for name, spans in spans_by_perspective.items():
            joined_text = " ".join(span["txt"] for span in spans)
            rows.append([record_uri, name, joined_text, summaries.get(f"{name}_SUMMARY", "")])

    with open(csv_file, "w", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerows(rows)

    print(f"CSV file saved: {csv_file}")

# Example usage: write one row per (item, perspective) of the training JSON.
# This overwrites PerAnsSummtrainOutput.csv if an earlier cell already produced it.
extract_perspective_data("gdrive/My Drive/PerAnsSummtrain.json", "gdrive/My Drive/PerAnsSummtrainOutput.csv")


CSV file saved: gdrive/My Drive/PerAnsSummtrainOutput.csv


In [None]:
import json
import csv

def extract_perspective_data(json_file, csv_file):
    """
    Flatten a perspective-annotated JSON file into a four-column CSV.

    Every item contributes one row per key of its ``labelled_answer_spans``
    mapping. The row holds the item's ``uri``, the perspective name, the
    ``txt`` values of that perspective's spans joined with single spaces, and
    the ``<PERSPECTIVE>_SUMMARY`` entry of ``labelled_summaries`` (empty string
    when that entry is missing).

    :param json_file: Path to the JSON input file
    :param csv_file: Path to the output CSV file
    """
    with open(json_file, "r", encoding="utf-8") as source:
        records = json.load(source)

    rows = []
    for record in records:
        record_uri = record.get("uri", "")
        spans_by_perspective = record.get("labelled_answer_spans", {})
        summaries_by_key = record.get("labelled_summaries", {})

        rows.extend(
            [
                record_uri,
                perspective,
                " ".join(span["txt"] for span in spans),
                summaries_by_key.get(f"{perspective}_SUMMARY", ""),
            ]
            for perspective, spans in spans_by_perspective.items()
        )

    # newline="" leaves line-ending handling to the csv module.
    with open(csv_file, "w", newline="", encoding="utf-8") as target:
        out = csv.writer(target)
        out.writerow(["URI", "Perspective", "Context", "Summary"])
        for row in rows:
            out.writerow(row)

    print(f"CSV file saved: {csv_file}")

# Example usage: export the validation split (URI, Perspective, Context, Summary columns).
extract_perspective_data("gdrive/My Drive/PerAnsSummvalid.json", "gdrive/My Drive/PerAnsSummvalidOutput.csv")

CSV file saved: gdrive/My Drive/PerAnsSummvalidOutput.csv


In [None]:
import pandas as pd

# Column names shared by both exported CSV files.
OUTPUT_COLUMNS = ["URI", "Perspective", "Context", "Summary"]

# Load Training Dataset
# The exported CSVs already begin with a header line. Passing `names=` on its
# own makes pandas read that line as a data row (one literal
# "URI/Perspective/Context/Summary" record per file); `header=0` consumes the
# first line as the header and replaces it with `names`.
train_df = pd.read_csv("gdrive/My Drive/PerAnsSummtrainOutput.csv", header=0, names=OUTPUT_COLUMNS)

# Load Validation Dataset
valid_df = pd.read_csv("gdrive/My Drive/PerAnsSummvalidOutput.csv", header=0, names=OUTPUT_COLUMNS)

# Combine Training & Validation Data (fresh 0..n-1 index)
combined_df = pd.concat([train_df, valid_df], ignore_index=True)

# Check the dataset size
print(f"Total Samples: {combined_df.shape[0]}")
print(combined_df.head())  # Preview data

# Save the combined dataset (Optional)
combined_df.to_csv("gdrive/My Drive/PerAnsSummOutput_Combined.csv", index=False)

# Now, use `combined_df` for training instead of `labeled_df`


Total Samples: 6286
       URI  Perspective                                            Context  \
0      URI  Perspective                                            Context   
1  4367393  INFORMATION  Parkinson's disease is one of the most common ...   
2  1504599   SUGGESTION  duck tape e tell your husband to record you on...   
3  1504599        CAUSE  hink that you have a stress on your daily life...   
4  1504599  INFORMATION  magnesium at night.  It's a muscle and mind re...   

                                             Summary  
0                                            Summary  
1  Parkinson's disease is a prevalent neurologic ...  
2  It is suggested to use duct tape to record you...  
3  Shouting during sleep may be due to bad sleep ...  
4  Magnesium is a muscle and mind relaxer which c...  


In [None]:
# transformers is already installed by the install cell near the top of the
# notebook; this cell additionally makes sure torch is available.
!pip install transformers torch




In [None]:
# Alternative approach (experimental): BART summarization prompted with similar training examples
import json
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# Load BART model for summarization
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Place the model on the GPU when one is available, otherwise on the CPU.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Load sentence embedding model for similarity search
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Load training dataset
# NOTE: this rebinds `train_df`, which an earlier cell assigned from
# PerAnsSummtrainOutput.csv; here it is read from a tab-separated file.
# Presumably that file has "Context" and "Summary" columns (both are indexed
# below) — TODO confirm.
train_df = pd.read_csv("gdrive/My Drive/training_dataset.csv", delimiter="\t")

# Keep contexts and summaries as two parallel lists (same index = same row)
# and embed every context once, so each similarity lookup only has to encode
# the query text.
training_texts = train_df["Context"].tolist()
training_summaries = train_df["Summary"].tolist()
training_embeddings = embedder.encode(training_texts, convert_to_tensor=True)

# Load test dataset
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Load perspectives classified by SVM
with open("gdrive/My Drive/classified_perspectives_improvedSVM.json", "r", encoding="utf-8") as f:
    classified_perspectives = json.load(f)

# Create a lookup dictionary for test data
# Maps each entry's "uri" to the entry itself; a repeated uri keeps the last entry.
test_lookup = {entry["uri"]: entry for entry in test_data}


# Function to retrieve similar examples from training data
def get_few_shot_examples(test_text, num_examples=3):
    """Return the training examples most similar to ``test_text`` as prompt text.

    Similarity is the cosine similarity between the embedding of ``test_text``
    and the precomputed ``training_embeddings``.

    :param test_text: Text for which similar training examples are wanted.
    :param num_examples: Maximum number of examples to return.
    :return: The selected examples formatted as
        ``"Input: <context>\\nSummary: <summary>\\n"`` blocks joined by newlines,
        or "" when no example can be returned.
    """
    # torch.topk raises when k is larger than the number of candidates, so
    # never ask for more examples than there are training rows.
    k = min(num_examples, len(training_texts))
    if k <= 0:
        return ""

    test_embedding = embedder.encode(test_text, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(test_embedding, training_embeddings)[0]
    top_indices = torch.topk(cos_scores, k).indices.tolist()

    return "\n".join(
        f"Input: {training_texts[idx]}\nSummary: {training_summaries[idx]}\n"
        for idx in top_indices
    )


# Function to generate extractive summary with weak supervision
def generate_extractive_summary(text):
    """Summarize ``text`` with the module-level BART model using a few-shot prompt.

    The prompt is the retrieved examples followed by ``text``. When the
    tokenized prompt is longer than the 1024-token input limit, tokens are
    dropped from the *front* (the examples) rather than from the end, so the
    text that actually has to be summarized still reaches the model. If
    ``text`` alone exceeds the limit, its beginning is what gets cut.

    :param text: Text to summarize.
    :return: Decoded summary string, or "" when ``text`` is empty/falsy.
    """
    if not text:
        return ""

    # Get few-shot learning examples
    few_shot_examples = get_few_shot_examples(text)

    # Construct enhanced prompt
    prompt = (
        f"Here are examples of good summaries:\n{few_shot_examples}\n"
        f"Now summarize:\nInput: {text}\nSummary:"
    )

    # Length bounds come from the whitespace word count of the input only
    # (the examples do not influence them).
    input_length = len(text.split())
    max_length = min(250, max(50, input_length // 3))  # Cap at 250, min 50
    min_length = max(25, max_length // 2)

    # Tokenize without truncation, then trim by hand: plain right-side
    # truncation would cut off the end of the prompt, which is where the text
    # to summarize sits.
    max_input_tokens = 1024
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
    if input_ids.shape[1] > max_input_tokens:
        # Keep the first token (the tokenizer's leading special token) plus
        # the last max_input_tokens - 1 tokens.
        input_ids = torch.cat(
            [input_ids[:, :1], input_ids[:, -(max_input_tokens - 1):]], dim=1
        )
    input_ids = input_ids.to(model.device)

    # Generate summary
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_length,
        length_penalty=1.5,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Build one output record per classified entry that also exists in the test set.
output_data = []
for entry in classified_perspectives:
    uri = entry["uri"]
    spans = entry["spans"]

    # Classified entries whose URI is absent from the test set produce no record.
    if uri not in test_lookup:
        continue

    question = test_lookup[uri].get("question", "")
    context = test_lookup[uri].get("context", "")

    # One summary per perspective; that perspective's spans are joined with
    # single spaces before being summarized.
    summaries = {
        category: generate_extractive_summary(" ".join(texts))
        for category, texts in spans.items()
    }

    output_entry = {
        "uri": uri,
        "question": question,
        "context": context,
        "spans": spans,
        "summaries": summaries,
    }
    output_data.append(output_entry)

# Persist all records as indented JSON.
output_path = "gdrive/My Drive/final_extractive_summarized_weak_supervision.json"
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, indent=4)

print(f"Extractive summarization with weak supervision completed! Output saved to {output_path}")


In [None]:
# Alternative approach (experimental): DistilBART summarization driven by the CSV datasets
import json
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Pick the device first so the summarizer can be moved onto it right after loading.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DistilBART checkpoint used for summarization in this cell.
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

# Tab-separated train/test tables.
train_df = pd.read_csv("gdrive/My Drive/training_dataset.csv", delimiter="\t")
test_df = pd.read_csv("gdrive/My Drive/testing_dataset.csv", delimiter="\t")

# Group the training contexts as dataset[uri][perspective] -> list of context strings.
dataset = {}
for _, row in train_df.iterrows():
    uri = str(row["URI"]).strip()
    perspective = row["Perspective"].strip()
    context = row["Context"].strip()

    # setdefault creates the nested dict / list the first time a key is seen.
    dataset.setdefault(uri, {}).setdefault(perspective, []).append(context)

# Function to generate an extractive summary with dynamic length adjustment
def generate_extractive_summary(text):
    """Summarize ``text`` with the module-level model, scaling length bounds to the input.

    The upper bound is a third of the whitespace word count clamped to
    [50, 200]; the lower bound is half of the upper bound but at least 25.
    NOTE(review): the bounds are computed from words but handed to
    ``generate``, whose length limits count tokens.

    :param text: Text to summarize.
    :return: Decoded summary, or "" for None, empty or whitespace-only input.
    """
    if not text or not text.strip():
        return ""

    word_count = len(text.split())
    longest = min(200, max(50, word_count // 3))
    shortest = max(25, longest // 2)

    # The literal "summarize: " prefix is part of the model input; anything
    # beyond 1024 tokens is truncated.
    encoded = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)

    generated = model.generate(
        encoded,
        max_length=longest,
        min_length=shortest,
        length_penalty=1.5,
        num_beams=5,
        early_stopping=True,
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Produce one record per row of the test table.
output_data = []
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Generating Summaries"):
    uri = str(row["URI"]).strip()
    perspective = row["Perspective"].strip()

    # NOTE(review): the text to summarize is looked up in `dataset`, which was
    # built from train_df; a test row only gets a real summary when the same
    # URI/perspective pair also occurs in the training table — confirm this is intended.
    if uri not in dataset or perspective not in dataset[uri]:
        summary = "No relevant training data available."
    else:
        context_text = " ".join(dataset[uri][perspective])  # all contexts for the pair
        summary = generate_extractive_summary(context_text)

    output_entry = {
        "uri": uri,
        "perspective": perspective,
        "generated_summary": summary,
    }
    output_data.append(output_entry)

# Write every record to a single indented JSON file.
with open("gdrive/My Drive/final_extractive_summarized_output.json", "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, indent=4)

print("Extractive summarization completed successfully!")

In [None]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint to load for this cell; change the string to try another seq2seq
# summarizer. `tokenizer` and `model` are read as module-level names by
# generate_extractive_summary below.
MODEL_NAME = "allenai/led-large-16384"  # alternative: "google/pegasus-xsum"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Function to generate extractive summary
def generate_extractive_summary(text_list):
    """Summarize the space-joined strings of ``text_list`` with the module-level model.

    The literal prefix "summarize: " is prepended to the joined text and the
    input is truncated to 1024 tokens.

    :param text_list: Strings belonging to one perspective.
    :return: Decoded first generated sequence, or "" when ``text_list`` is empty/falsy.
    """
    if not text_list:
        return ""

    prompt = "summarize: " + " ".join(text_list)
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
    generated = model.generate(
        input_ids,
        max_length=200,
        min_length=50,
        length_penalty=2.0,
        num_beams=6,
        early_stopping=True,
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Read the unlabeled test set and the SVM-classified spans.
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as file:
    data_file = json.load(file)

with open("gdrive/My Drive/classified_perspectives_improvedSVM.json", "r", encoding="utf-8") as file:
    classified_perspectives = json.load(file)

# uri -> test-set entry
data_lookup = {entry["uri"]: entry for entry in data_file}

# Build one record per classified entry that is also present in the test set.
output_data = []
for entry in classified_perspectives:
    uri = entry["uri"]
    spans = entry["spans"]

    # Entries with an unknown URI are left out of the output.
    if uri not in data_lookup:
        continue

    question = data_lookup[uri]["question"]
    context = data_lookup[uri]["context"]

    # One summary per perspective category.
    summaries = {
        category: generate_extractive_summary(texts)
        for category, texts in spans.items()
    }

    output_entry = {
        "uri": uri,
        "question": question,
        "context": context,
        "spans": spans,
        "summaries": summaries,
    }
    output_data.append(output_entry)

# Write all records as indented JSON.
output_path = "gdrive/My Drive/final_extractive_summarized_outputnow.json"
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, indent=4)

print(f"Extractive summarization completed! Saved to {output_path}")


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Input ids are automatically padded from 150 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 93 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 20 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 221 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 34 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 60 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 81 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 334 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 30 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 102 to 1024 to be a multiple o

Extractive summarization completed! Saved to gdrive/My Drive/final_extractive_summarized_outputnow.json


In [None]:
# Newer variant: BART summarization with summary length bounds derived from the input length
import json
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the BART summarization checkpoint and its tokenizer. Both are read as
# module-level names by generate_extractive_summary below.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Function to determine dynamic summarization length
def get_dynamic_lengths(text):
    """Derive summary length bounds from the whitespace word count of ``text``.

    The upper bound is 50% of the word count, clamped to [30, 200]. The lower
    bound is 20% of the word count with a floor of 30, and is never allowed to
    exceed the upper bound.

    Without the clamping, inputs shorter than 60 words (and inputs longer than
    1000 words) produced ``min_length > max_length``, and very short inputs
    produced ``max_length == 0``. For 60 to 1000 words the result is unchanged.

    :param text: Text that is going to be summarized.
    :return: ``(max_length, min_length)`` with ``30 <= min_length <= max_length <= 200``.
    """
    words = len(text.split())
    floor = 30  # smallest bound handed to the generator

    max_length = max(floor, min(200, int(words * 0.5)))  # up to 50% of the original
    min_length = min(max_length, max(floor, int(words * 0.2)))  # at least 20%, never above max
    return max_length, min_length

# Function to generate extractive summary with dynamic length control
def generate_extractive_summary(texts):
    """Summarize the space-joined ``texts`` with length bounds from ``get_dynamic_lengths``.

    The input is a single sequence, so it is tokenized without padding (the
    previous ``padding="max_length"`` made every call encode 1024 positions),
    and the tokenizer's attention mask is passed to ``generate`` explicitly
    instead of leaving it to be inferred.

    :param texts: Strings belonging to one perspective.
    :return: Decoded summary, or "" when ``texts`` is empty/falsy.
    """
    if not texts:
        return ""

    full_text = " ".join(texts)
    max_length, min_length = get_dynamic_lengths(full_text)

    # The literal "summarize: " prefix is part of the model input; anything
    # beyond 1024 tokens is truncated.
    inputs = tokenizer(
        "summarize: " + full_text,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    )

    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=1.2,
        num_beams=6,
        repetition_penalty=1.1,  # Reduce redundancy
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Load the unlabeled test set and the SVM-classified spans.
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    data_file = json.load(f)

with open("gdrive/My Drive/classified_perspectives_improvedSVM.json", "r", encoding="utf-8") as f:
    classified_perspectives = json.load(f)

# uri -> test-set entry, so each classified entry is matched with one dict lookup.
data_lookup = {entry["uri"]: entry for entry in data_file}

# Build one output record per classified entry.
output_data = []
for entry in classified_perspectives:
    uri = entry["uri"]
    spans = entry.get("spans", {})

    # Unlike a membership check that skips unknown URIs, this falls back to an
    # empty dict: an entry whose uri is not in the test set is still written,
    # with empty question and context.
    context_data = data_lookup.get(uri, {})
    question = context_data.get("question", "")
    context = context_data.get("context", "")

    # Generate summaries for each perspective
    summaries = {category: generate_extractive_summary(texts) for category, texts in spans.items()}

    output_entry = {
        "uri": uri,
        "question": question,
        "context": context,
        "spans": spans,
        "summaries": summaries
    }
    output_data.append(output_entry)

# Save results as indented JSON.
output_path = "gdrive/My Drive/final_extractive_summarized_SVM.json"
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, indent=4)

print(f"✅ Extractive summarization completed! Output saved to {output_path}")


In [None]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the BART (facebook/bart-large-cnn) summarization model and its tokenizer.
# Both are read as module-level names by generate_extractive_summary below.
model_name = "facebook/bart-large-cnn"  # BART checkpoint, loaded as a seq2seq LM
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Function to generate extractive summary
def generate_extractive_summary(text_list):
    """Summarize the space-joined strings of ``text_list`` with the module-level model.

    The literal prefix "summarize: " is prepended to the joined text and the
    input is truncated to 1024 tokens.

    :param text_list: Strings belonging to one perspective.
    :return: Decoded first generated sequence, or "" when ``text_list`` is empty/falsy.
    """
    if not text_list:
        return ""

    prompt = "summarize: " + " ".join(text_list)
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
    generated = model.generate(
        input_ids,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Load dataset
# Use context managers so both input files are closed as soon as they are
# parsed (a bare json.load(open(...)) leaves the handle open until it is
# garbage-collected).
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    data_file = json.load(f)

with open("gdrive/My Drive/classified_perspectives_improvedSVM.json", "r", encoding="utf-8") as f:
    classified_perspectives = json.load(f)

# uri -> test-set entry
data_lookup = {entry["uri"]: entry for entry in data_file}

# Process entries
output_data = []
for entry in classified_perspectives:
    uri = entry["uri"]
    spans = entry["spans"]

    # Only entries whose URI exists in the test set produce an output record.
    if uri in data_lookup:
        question = data_lookup[uri]["question"]
        context = data_lookup[uri]["context"]

        # One summary per perspective category.
        summaries = {category: generate_extractive_summary(texts) for category, texts in spans.items()}

        output_entry = {
            "uri": uri,
            "question": question,
            "context": context,
            "spans": spans,
            "summaries": summaries
        }
        output_data.append(output_entry)

# Save results
with open("gdrive/My Drive/final_extractive_summarized_outputnow.json", "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, indent=4)

print("Extractive summarization completed!")

In [None]:
import json
import concurrent.futures
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the BART (facebook/bart-large-cnn) summarization model and its tokenizer.
# Both are read as module-level names by chunk_text and
# generate_extractive_summary below.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def chunk_text(text, max_tokens=1024):
    """Split ``text`` into pieces of at most ``max_tokens`` content tokens.

    The text is encoded without special tokens, so they neither use up part of
    the budget nor end up inside a slice; each slice is decoded back to a string.

    :param text: Text to split.
    :param max_tokens: Maximum number of tokens per piece.
    :return: List of decoded pieces in order; empty when ``text`` yields no tokens.
    """
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
    chunks = [tokens[i : i + max_tokens] for i in range(0, len(tokens), max_tokens)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

def generate_extractive_summary(text_list):
    """Summarize ``text_list`` chunk by chunk and join the chunk summaries.

    Each chunk is sized so that the "summarize: " prefix and the tokenizer's
    special tokens still fit into the 1024-token input. Previously the chunks
    were 1024 tokens long on their own, so adding the prefix pushed them over
    the limit and the truncation step silently dropped the end of each chunk.

    :param text_list: Strings belonging to one perspective.
    :return: Chunk summaries joined with single spaces, or "" when
        ``text_list`` is empty/falsy or contains no tokens.
    """
    if not text_list:
        return ""

    prefix = "summarize: "
    model_limit = 1024
    # Tokens taken by the prefix together with the special tokens that the
    # tokenizer adds around a sequence.
    overhead = len(tokenizer.encode(prefix))

    full_text = " ".join(text_list)
    text_chunks = chunk_text(full_text, max_tokens=max(1, model_limit - overhead))

    summaries = []
    for chunk in text_chunks:
        # truncation=True stays as a backstop: re-tokenizing a decoded chunk
        # may not give exactly the same token count.
        inputs = tokenizer.encode(prefix + chunk, return_tensors="pt", truncation=True, max_length=model_limit)
        summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return " ".join(summaries)

# Load dataset
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    data_file = json.load(f)

with open("gdrive/My Drive/classified_perspectives_improvedSVM.json", "r", encoding="utf-8") as f:
    classified_perspectives = json.load(f)

# Convert data into a dictionary for quick lookup (uri -> test-set entry)
data_lookup = {entry["uri"]: entry for entry in data_file}

output_data = []
for entry in classified_perspectives:
    uri = entry["uri"]
    spans = entry["spans"]

    if uri not in data_lookup:
        print(f"⚠️ Warning: URI {uri} not found in dataset. Skipping.")
        continue  # Skip missing entries

    question = data_lookup[uri]["question"]
    context = data_lookup[uri]["context"]

    # Summarize one perspective at a time. Running generate_extractive_summary
    # from a thread pool shares the single module-level tokenizer between
    # threads, and the recorded run of this cell failed with
    # "RuntimeError: Already borrowed" (the error a fast tokenizer raises when
    # one instance is used concurrently).
    summaries = {category: generate_extractive_summary(texts) for category, texts in spans.items()}

    output_entry = {
        "uri": uri,
        "question": question,
        "context": context,
        "spans": spans,
        "summaries": summaries
    }
    output_data.append(output_entry)

# Save results; ensure_ascii=False writes non-ASCII characters as-is.
with open("gdrive/My Drive/final_extractive_summarized_SVM2b.json", "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=4, ensure_ascii=False)

print("✅ Extractive summarization completed successfully!")


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

RuntimeError: Already borrowed

In [None]:
# Final code
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the BART (facebook/bart-large-cnn) summarization model and its tokenizer.
# Both are read as module-level names by generate_extractive_summary below.
model_name = "facebook/bart-large-cnn"  # BART checkpoint, loaded as a seq2seq LM
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Function to generate extractive summary
def generate_extractive_summary(text_list):
    """Summarize the space-joined strings of ``text_list`` with the module-level model.

    The literal prefix "summarize: " is prepended to the joined text and the
    input is truncated to 1024 tokens.

    :param text_list: Strings belonging to one perspective.
    :return: Decoded first generated sequence, or "" when ``text_list`` is empty/falsy.
    """
    if not text_list:
        return ""

    prompt = "summarize: " + " ".join(text_list)
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
    generated = model.generate(
        input_ids,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Load dataset
# Use context managers so both input files are closed as soon as they are
# parsed (a bare json.load(open(...)) leaves the handle open until it is
# garbage-collected).
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    data_file = json.load(f)

with open("gdrive/My Drive/classified_perspectives_improvedSVM2b.json", "r", encoding="utf-8") as f:
    classified_perspectives = json.load(f)

# uri -> test-set entry
data_lookup = {entry["uri"]: entry for entry in data_file}

# Process entries
output_data = []
for entry in classified_perspectives:
    uri = entry["uri"]
    spans = entry["spans"]

    # Only entries whose URI exists in the test set produce an output record.
    if uri in data_lookup:
        question = data_lookup[uri]["question"]
        context = data_lookup[uri]["context"]

        # One summary per perspective category.
        summaries = {category: generate_extractive_summary(texts) for category, texts in spans.items()}

        output_entry = {
            "uri": uri,
            "question": question,
            "context": context,
            "spans": spans,
            "summaries": summaries
        }
        output_data.append(output_entry)

# Save results
with open("gdrive/My Drive/final_extractive_summarized_SVM2b.json", "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, indent=4)

print("Extractive summarization completed!")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Extractive summarization completed!


In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the PEGASUS summarization checkpoint and its tokenizer.
model_name = "google/pegasus-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move model to GPU if available. `device` is also read by
# generate_extractive_summary below to place the tokenized inputs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate extractive summary safely
def generate_extractive_summary(text_list):
    """Summarize the space-joined ``text_list`` with the module-level model.

    The recorded run of this cell printed "index out of range in self" for
    every call. The inputs were padded to 1024 tokens regardless of the model,
    which overruns the position table of a model whose
    ``config.max_position_embeddings`` is smaller (presumably the case for
    google/pegasus-xsum — confirm). The input is now truncated to the smaller
    of 1024 and that limit, is not padded, and the attention mask is passed
    to ``generate`` explicitly.

    :param text_list: Strings belonging to one perspective.
    :return: Decoded summary; "" for empty input or when generation raises
        (the error is printed, not re-raised).
    """
    if not text_list or not any(text_list):  # Ensure non-empty input
        return ""

    full_text = " ".join(text_list).strip()
    if not full_text:
        return ""

    # Never feed more positions than the model has embeddings for.
    position_limit = getattr(model.config, "max_position_embeddings", None) or 1024
    max_input_tokens = min(1024, position_limit)

    # Tokenization with truncation; a single sequence needs no padding.
    inputs = tokenizer(
        full_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(device)

    try:
        with torch.no_grad():  # Disable gradients for inference
            summary_ids = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=150,
                min_length=50,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True
            )
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    except Exception as e:
        # Best effort: one failing perspective must not abort the whole run.
        print(f"❌ Error in summarization: {e}")
        return ""

# Load datasets
try:
    with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
        data_file = json.load(f)

    with open("gdrive/My Drive/classified_perspectives_improvedSVM.json", "r", encoding="utf-8") as f:
        classified_perspectives = json.load(f)

except Exception as e:
    print(f"❌ Error loading JSON files: {e}")
    # Re-raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down (discarding the loaded model and every variable), whereas
    # re-raising only stops this cell and keeps the traceback.
    raise

# Create a lookup dictionary (uri -> test-set entry)
data_lookup = {entry.get("uri"): entry for entry in data_file}

# Process entries
output_data = []
for entry in classified_perspectives:
    uri = entry.get("uri")
    spans = entry.get("spans", {})

    # Only entries whose URI exists in the test set produce an output record.
    if uri in data_lookup:
        question = data_lookup[uri].get("question", "")
        context = data_lookup[uri].get("context", "")

        # Generate summaries only for non-empty spans
        summaries = {
            category: generate_extractive_summary(texts) if texts else ""
            for category, texts in spans.items()
        }

        output_entry = {
            "uri": uri,
            "question": question,
            "context": context,
            "spans": spans,
            "summaries": summaries
        }
        output_data.append(output_entry)

# Save results; a failure to write is reported but does not raise.
output_path = "gdrive/My Drive/final_extractive_summarized_SVM2f.json"
try:
    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(output_data, output_file, indent=4)
    print(f"✅ Extractive summarization completed! Output saved to {output_path}")
except Exception as e:
    print(f"❌ Error saving output: {e}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range in self
❌ Error in summarization: index out of range i

In [None]:
import json

# Lead-in phrase prepended to the summary of each perspective category.
phrases = {
    "EXPERIENCE": "In user's experience, ",
    "SUGGESTION": "It is suggested, ",
    "INFORMATION": "For information purposes, ",
    "CAUSE": "Some of the causes, ",
    "QUESTION": "It is inquired, ",
}

# Read the summarized output produced earlier.
with open("gdrive/My Drive/final_hybrid_summarized_output.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Prefix every non-empty summary in place; entries without an
# "extractive_summaries" mapping are left untouched.
for entry in data:
    if "extractive_summaries" not in entry:
        continue

    entry_summaries = entry["extractive_summaries"]
    for key, prefix in phrases.items():
        if key not in entry_summaries:
            continue
        current_summary = entry_summaries[key]
        if current_summary:
            entry_summaries[key] = prefix + current_summary

# Write the result to a new file (the input file is not modified).
with open("gdrive/My Drive/final_hybrid_summarized_outputt.json", "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4, ensure_ascii=False)

print("Summaries updated successfully.")


Summaries updated successfully.


In [None]:
import json

# Load JSON file
def load_json(file_path):
    """Parse the UTF-8 encoded JSON document at ``file_path`` and return the result."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

final_summarized_output = load_json("gdrive/My Drive/final_hybrid_summarized_outputt.json")

# Process each entry to match the required format: keep only uri, spans and
# summaries, and collapse every run of whitespace (including newlines) to a
# single space.
output_data = []
for entry in final_summarized_output:
    uri = entry["uri"]
    spans = {category: [" ".join(value.split()) for value in values] for category, values in entry["spans"].items()}
    extractive_summaries = {category: " ".join(summary.split()) for category, summary in entry.get("extractive_summaries", {}).items()}

    formatted_entry = {
        "uri": uri,
        "spans": spans,
        "summaries": extractive_summaries
    }
    output_data.append(formatted_entry)

# Save the reformatted JSON output
output_path = "gdrive/My Drive/final_hybrid_summarized_outputtt.json"
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, indent=4)

# Report the path that was actually written (the old message named
# "formatted_output.json", which is not the file produced here).
print(f"Reformatting completed. Output saved as {output_path}")


Reformatting completed. Output saved as formatted_output.json


In [None]:
import re
import json
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from snorkel.labeling import LabelingFunction, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load labeled dataset
# `names=` assigns the column labels. NOTE(review): if the CSV file itself
# starts with a header line, that line is read as a data row here — confirm
# the file layout.
labeled_df = pd.read_csv("gdrive/My Drive/PerAnsSumm_Combined.csv", names=["uri", "text", "label"])

# Define perspective categories and map them to integers
# NOTE: the first code cell of the notebook defines PERSPECTIVES as a list;
# this cell rebinds it to a name -> integer dict.
PERSPECTIVES = {label: i for i, label in enumerate(["EXPERIENCE", "INFORMATION", "CAUSE", "SUGGESTION", "QUESTION"])}
PERSPECTIVE_NAMES = list(PERSPECTIVES.keys())  # ["EXPERIENCE", "INFORMATION", ...]

# Load sentence embedding model
# (rebinds `embedding_model`, which the first code cell loaded from a different checkpoint)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert dataset text to embeddings
X_embeddings = embedding_model.encode(labeled_df["text"].tolist(), convert_to_numpy=True)

# Convert string labels to integer labels
# Labels that are not keys of PERSPECTIVES map to NaN.
y_train = labeled_df["label"].map(PERSPECTIVES)  # Ensure labels are integer-based

# Train a linear-kernel SVM on the sentence embeddings.
# NOTE(review): the SVM labeling function defined below is later applied to
# this same labeled_df, i.e. to the data the classifier was trained on.
svm_classifier = SVC(kernel="linear", probability=True)
svm_classifier.fit(X_embeddings, y_train)

# Generate TF-IDF-based regex patterns
# The corpus-wide vectorizer and its keywords are kept as before.
vectorizer = TfidfVectorizer(max_features=10, stop_words="english")
X_tfidf = vectorizer.fit_transform(labeled_df["text"])
tfidf_keywords = vectorizer.get_feature_names_out()

# Build one pattern per perspective from the keywords of that perspective's
# own texts. Previously every label received the same pattern (built from the
# corpus-wide keywords), so all regex labeling functions fired on exactly the
# same texts while voting for different labels. The keywords were also joined
# with " | ", which made the surrounding spaces part of each alternative.
regex_patterns = {}
for label in PERSPECTIVES:
    label_texts = labeled_df.loc[labeled_df["label"] == label, "text"].dropna().astype(str)
    try:
        label_keywords = (
            TfidfVectorizer(max_features=10, stop_words="english")
            .fit(label_texts)
            .get_feature_names_out()
        )
    except ValueError:
        # Raised when the label has no texts or only stop words.
        label_keywords = []

    if len(label_keywords):
        # Whole-word match on any keyword of this label.
        regex_patterns[label] = r"\b(?:" + "|".join(re.escape(keyword) for keyword in label_keywords) + r")\b"
    else:
        # Empty negative lookahead: never matches, so the labeling function always abstains.
        regex_patterns[label] = r"(?!)"

# Snorkel Labeling Functions
def create_regex_lf(label, pattern):
    """Build a Snorkel labeling function that votes for `label` on a regex hit.

    Args:
        label: Key of `PERSPECTIVES`; its integer id is returned on a match.
        pattern: Regular expression searched case-insensitively in `x.text`.

    Returns:
        A `LabelingFunction` named ``lf_regex_<label>`` whose callable returns
        the label id when the pattern is found and -1 (abstain) otherwise.
    """
    def _regex_vote(x):
        # Anything that is not str/bytes is treated as empty text.
        if isinstance(x.text, (str, bytes)):
            candidate = str(x.text)
        else:
            candidate = ""
        if re.search(pattern, candidate, re.IGNORECASE):
            return PERSPECTIVES[label]
        return -1

    return LabelingFunction(name=f"lf_regex_{label}", f=_regex_vote)

def svm_lf(x):
    """Snorkel labeling function that votes with the SVM's prediction.

    Args:
        x: Row-like object exposing a `text` attribute.

    Returns:
        The integer perspective id predicted by `svm_classifier` for the
        sentence embedding of `x.text`, or -1 (abstain) when the text is not a
        non-blank string or the prediction is not a known perspective id.
    """
    text = x.text
    # Abstain on missing/blank text instead of handing NaN/None to the encoder,
    # mirroring the guard used by the regex labeling functions.
    if not isinstance(text, str) or not text.strip():
        return -1
    embedding = embedding_model.encode([text], convert_to_numpy=True)
    prediction = int(svm_classifier.predict(embedding)[0])  # plain int, not a numpy scalar
    return prediction if prediction in PERSPECTIVES.values() else -1

# Create Snorkel labeling functions: one regex LF per perspective plus the SVM LF
labeling_functions = [create_regex_lf(label, regex_patterns[label]) for label in PERSPECTIVES]
labeling_functions.append(LabelingFunction(name="lf_svm", f=svm_lf))

# Apply Snorkel weak supervision
applier = PandasLFApplier(lfs=labeling_functions)
L_train = applier.apply(df=labeled_df[["text"]])  # Ensure correct column format

# Train Snorkel Label Model.
# A fixed seed makes the fitted label model (and therefore the weak labels)
# reproducible across kernel restarts.
LABEL_MODEL_SEED = 42
label_model = LabelModel(cardinality=len(PERSPECTIVES), verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=100, seed=LABEL_MODEL_SEED)

# Load BART Zero-Shot Classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def zero_shot_classify(text):
    """Classify text with the BART zero-shot pipeline.

    Args:
        text: Sentence to classify against `PERSPECTIVE_NAMES`.

    Returns:
        The integer perspective id of the top-ranked label, or -1 when the
        text is not a non-blank string or the returned label is unknown.
    """
    # The zero-shot pipeline rejects empty sequences, so abstain up front.
    if not isinstance(text, str) or not text.strip():
        return -1
    result = classifier(text, PERSPECTIVE_NAMES)
    best_label = result["labels"][0]  # This is a string
    return PERSPECTIVES.get(best_label, -1)  # Convert to integer

# Load unlabeled dataset
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Process dataset and classify answers
results = []
valid_label_ids = set(PERSPECTIVES.values())

for entry in data:
    uri = entry["uri"]
    categorized_spans = {perspective: [] for perspective in PERSPECTIVE_NAMES}

    for ans in entry["answers"]:
        # Split into sentences and drop blank fragments so that empty answers
        # do not produce "" spans or get sent through the models.
        sentences = [s for s in re.split(r"(?<=[.!?])\s+", ans.strip()) if s.strip()]
        if not sentences:
            continue

        # Apply Snorkel Labeling (progress bar disabled: this runs once per answer)
        L_test = applier.apply(pd.DataFrame({"text": sentences}), progress_bar=False)
        weak_labels = label_model.predict(L=L_test)

        # Apply SVM for refined classification using sentence embeddings
        X_test_embeddings = embedding_model.encode(sentences, convert_to_numpy=True)
        svm_preds = svm_classifier.predict(X_test_embeddings)

        for i, sentence in enumerate(sentences):
            # Work with integer ids throughout and convert to a name only when
            # storing, so the zero-shot fallback (which returns an id) can
            # never be used as a key of the name-keyed dict.
            label_id = int(weak_labels[i])
            if label_id == -1:
                label_id = int(svm_preds[i])  # Snorkel abstained: use SVM

            # Use Zero-Shot only if neither Snorkel nor SVM gave a valid id
            if label_id not in valid_label_ids:
                label_id = zero_shot_classify(sentence)

            # Every classifier abstained: leave the sentence uncategorized.
            if label_id not in valid_label_ids:
                continue

            categorized_spans[PERSPECTIVE_NAMES[label_id]].append(sentence)

    results.append({"uri": uri, "spans": categorized_spans})

# Save results to JSON file
classified_output_path = "gdrive/My Drive/classified_perspectives_improvedSVMvc.json"
with open(classified_output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

# Print the path actually written; the old message named a different file.
print(f"✅ Classification completed! Results saved to {classified_output_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ValueError: Input y contains NaN.

In [None]:
!pip install transformers sentence-transformers torch pandas


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:

import pandas as pd

# Load CSV file correctly
labeled_df = pd.read_csv("gdrive/My Drive/PerAnsSummOutput_Combined.csv", names=["URI", "Perspective", "Context", "Summary"], header=0)

# The file contains a second copy of the header as its first data row
# (URI == "URI", Perspective == "Perspective", ...); drop it so it is not
# treated as a real example.
labeled_df = labeled_df[labeled_df["URI"].astype(str).str.strip() != "URI"].reset_index(drop=True)

# Display first few rows
labeled_df.head()

Unnamed: 0,URI,Perspective,Context,Summary
0,URI,Perspective,Context,Summary
1,4367393,INFORMATION,Parkinson's disease is one of the most common ...,Parkinson's disease is a prevalent neurologic ...
2,1504599,SUGGESTION,duck tape e tell your husband to record you on...,It is suggested to use duct tape to record you...
3,1504599,CAUSE,hink that you have a stress on your daily life...,Shouting during sleep may be due to bad sleep ...
4,1504599,INFORMATION,magnesium at night. It's a muscle and mind re...,Magnesium is a muscle and mind relaxer which c...


In [None]:
import json
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# Load BART model for summarization
model_name = "facebook/bart-large-cnn"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Load sentence embedding model for similarity search
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Load training dataset
train_df = pd.read_csv("gdrive/My Drive/PerAnsSummOutput_Combined.csv", names=["URI", "Perspective", "Context", "Summary"], header=0)

# Drop the duplicated header row (URI == "URI") and rows without a context or
# summary, so neither the literal strings "Context"/"Summary" nor NaN values
# can end up as few-shot examples.
train_df = train_df[train_df["URI"].astype(str).str.strip() != "URI"]
train_df = train_df.dropna(subset=["Context", "Summary"]).reset_index(drop=True)

# Convert training data into parallel lists of contexts and summaries
training_texts = train_df["Context"].astype(str).tolist()
training_summaries = train_df["Summary"].astype(str).tolist()
training_embeddings = embedder.encode(training_texts, convert_to_tensor=True)

# Load test dataset
with open("gdrive/My Drive/test_no_label.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Load perspectives classified by SVM
# NOTE(review): the classification cell in this notebook writes
# "classified_perspectives_improvedSVMvc.json" while this cell reads
# "classified_perspectives_improvedSVM.json" — confirm which file is intended.
with open("gdrive/My Drive/classified_perspectives_improvedSVM.json", "r", encoding="utf-8") as f:
    classified_perspectives = json.load(f)

# Create a lookup dictionary for test data
test_lookup = {entry["uri"]: entry for entry in test_data}


# Function to retrieve similar examples from training data
def get_few_shot_examples(test_text, num_examples=3):
    """Return the training examples most similar to `test_text` as a prompt string.

    Similarity is the cosine similarity between the embedding of `test_text`
    and the precomputed `training_embeddings`.

    Args:
        test_text: Text to find neighbours for.
        num_examples: Maximum number of examples to return. It is clamped to
            the number of available training examples.

    Returns:
        "Input: ...\\nSummary: ...\\n" blocks joined by newlines, or "" when no
        example is requested or available.
    """
    if num_examples <= 0:
        return ""
    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = min(num_examples, len(training_texts))
    if k == 0:
        return ""

    test_embedding = embedder.encode(test_text, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(test_embedding, training_embeddings)[0]
    top_indices = torch.topk(cos_scores, k).indices.tolist()

    return "\n".join(
        f"Input: {training_texts[idx]}\nSummary: {training_summaries[idx]}\n"
        for idx in top_indices
    )


# Function to generate extractive summary with weak supervision
def generate_extractive_summary(text):
    """Summarize `text` with the BART model, prompted with similar training examples.

    Despite the name, the summary is produced by `model.generate`, i.e. it is
    generated text rather than sentences copied from the input.

    Args:
        text: Text to summarize (here: the joined spans of one perspective).

    Returns:
        The decoded summary string, or "" when `text` is empty or blank.
    """
    if not text or not text.strip():
        return ""

    # Get few-shot learning examples
    few_shot_examples = get_few_shot_examples(text)

    # Construct enhanced prompt
    prompt = (
        f"Here are examples of good summaries:\n{few_shot_examples}\n"
        f"Now summarize:\nInput: {text}\nSummary:"
    )

    # Dynamically adjust max and min length based on input size.
    # The whitespace word count is only a rough proxy for the token count.
    input_length = len(text.split())
    max_length = min(250, max(50, input_length // 3))  # Cap at 250, min 50
    min_length = max(25, max_length // 2)

    # Tokenize input. The text to summarize sits at the END of the prompt, so
    # truncate from the left: an over-long prompt then loses few-shot examples
    # first instead of the input itself. The previous setting is restored so
    # other users of the shared tokenizer are unaffected.
    previous_side = tokenizer.truncation_side
    tokenizer.truncation_side = "left"
    try:
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
    finally:
        tokenizer.truncation_side = previous_side

    # Generate summary
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=1.5,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Summarize the classified spans of every entry that has a matching test record
output_data = []
for entry in classified_perspectives:
    uri = entry["uri"]
    spans = entry["spans"]

    # Entries whose uri is absent from the test set are skipped.
    if uri not in test_lookup:
        continue

    test_record = test_lookup[uri]
    question = test_record.get("question", "")
    context = test_record.get("context", "")

    # One summary per perspective, built from that perspective's joined spans
    summaries = {}
    for category, texts in spans.items():
        summaries[category] = generate_extractive_summary(" ".join(texts))

    output_entry = dict(
        uri=uri,
        question=question,
        context=context,
        spans=spans,
        summaries=summaries,
    )
    output_data.append(output_entry)

# Write all summarized entries to a single JSON file
output_path = "gdrive/My Drive/final_extractive_summarized_weak_supervision.json"
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(output_data, output_file, indent=4)

print(f"Extractive summarization with weak supervision completed! Output saved to {output_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Extractive summarization with weak supervision completed! Output saved to gdrive/My Drive/final_extractive_summarized_weak_supervision.json
