In [None]:
# Import the necessary modules
from google.colab import drive
from datasets import load_from_disk

# Make Google Drive reachable from this runtime under /content/drive
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# List the contents of the specific directory (optional, but good for debugging)
!ls /content/drive/MyDrive/my_dataset/

# Load the dataset from disk *after* mounting Google Drive
dataset = load_from_disk('file:///content/drive/MyDrive/my_dataset')

In [None]:
import numpy as np
import pandas as pd
# Convert the 'train' and 'test' splits of the loaded dataset into DataFrames
train_ds, test_ds = (pd.DataFrame(dataset[split]) for split in ('train', 'test'))

In [None]:
# Work on a small subset: the first 500 training rows and first 200 test rows
train_rows, test_rows = 500, 200
train_ds = train_ds.head(train_rows)
test_ds = test_ds.head(test_rows)

In [None]:
from transformers import pipeline

In [None]:
# Build the summarization pipeline (device=-1 runs it on the CPU)
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=-1,
)

# Join each row's tokens into one space-separated string
# (assumes every cell of 'document_lemmatized_n' holds a sequence of strings)
documents = [' '.join(tokens) for tokens in train_ds['document_lemmatized_n']]

# Truncate overly long documents
def truncate_text(text, max_words=900):
    """Limit ``text`` to at most ``max_words`` whitespace-separated words.

    Text that is already within the limit is returned unchanged, original
    whitespace included. Longer text is cut down to its first ``max_words``
    words, re-joined with single spaces.
    """
    tokens = text.split()
    if len(tokens) <= max_words:
        return text
    return ' '.join(tokens[:max_words])

# Generate summaries
# Summarize every document in order. `summaries` receives exactly one entry
# per document, whether the model call succeeds or fails.
summaries = []
for i, doc in enumerate(documents):
    # Cheap word-level cut first; this bounds the word count only, not the
    # number of model tokens.
    doc = truncate_text(doc)
    try:
        # truncation=True makes the tokenizer clip the input to the model's
        # maximum length. Without it, a 900-word document can still exceed
        # that limit; the call then raises and the document ends up with the
        # failure placeholder instead of a summary.
        summary = summarizer(
            doc,
            max_length=60,
            min_length=20,
            do_sample=False,
            truncation=True,
        )
        summaries.append(summary[0]['summary_text'])
    except Exception as e:
        # Best effort: report the failure and store a placeholder so the
        # positions in `summaries` stay aligned with `documents`.
        print(f"Error on document {i}: {e}")
        summaries.append("Summary generation failed.")

In [None]:
# Store summaries in the dataset: add them to train_ds as the 'doc_summary'
# column (pandas requires one entry per row of train_ds)
train_ds['doc_summary'] = summaries

In [None]:
# Display the training DataFrame to inspect its current contents
train_ds

In [None]:
from datasets import Dataset, DatasetDict
# Turn each pandas split back into a Hugging Face Dataset
train = Dataset.from_pandas(train_ds)
test = Dataset.from_pandas(test_ds)

# Bundle both splits, keyed by split name, into a single DatasetDict
splits = {'train': train, 'test': test}
model_ds = DatasetDict(splits)

# Show the resulting DatasetDict
model_ds

In [None]:
# Write the DatasetDict to /content/model_ds
local_save_dir = '/content/model_ds'
model_ds.save_to_disk(local_save_dir)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp -r /content/model_ds /content/drive/MyDrive/model_ds