In [None]:
!pip install ipywidgets


Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [None]:
from IPython.display import display
import ipywidgets as widgets


In [None]:

!pip install torch transformers accelerate sentencepiece
!pip install bitsandbytes

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store the access token in the notebook itself: take it from the
# HF_TOKEN environment variable, or ask for it with a hidden prompt.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import textwrap
import re
from google.colab import files

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Function to read a text file from disk
def load_text_file(filename):
    """Read a UTF-8 text file and return its full contents.

    Args:
        filename: Path of the file to read.

    Returns:
        The file contents as a string, or ``None`` when the file does not
        exist (a notice is printed instead of raising).
    """
    try:
        with open(filename, mode='r', encoding='utf-8') as source:
            contents = source.read()
    except FileNotFoundError:
        print(f"File '{filename}' not found in the working directory.")
        return None
    else:
        print(f"Loaded file: {filename}")
        return contents

# Function to chunk long text into smaller pieces
def chunk_text(text, tokenizer, max_tokens=3500, prompt_token_buffer=500):
    """Split ``text`` into chunks that each fit a token budget.

    Paragraphs (separated by blank lines) are packed greedily into chunks of
    at most ``max_tokens - prompt_token_buffer`` tokens. A paragraph that is
    larger than the budget on its own is split further, first on line breaks
    and then on spaces, so that it no longer produces an oversize chunk.

    Args:
        text: The full text to split.
        tokenizer: Callable used as ``tokenizer(piece, return_tensors="pt",
            truncation=False)``; the token count is read from
            ``.input_ids.shape[1]`` of its result.
        max_tokens: Total number of tokens available per model call.
        prompt_token_buffer: Tokens reserved for everything that is not the
            chunk itself (prompt text, generated tokens).

    Returns:
        List of non-empty, whitespace-stripped chunk strings. A chunk can
        still exceed the budget only if it is a single unbroken word that the
        tokenizer maps to more tokens than the budget.

    Raises:
        ValueError: If ``prompt_token_buffer`` leaves no room for text.
    """
    token_budget = max_tokens - prompt_token_buffer
    if token_budget <= 0:
        raise ValueError(
            "prompt_token_buffer must be smaller than max_tokens "
            f"(got max_tokens={max_tokens}, prompt_token_buffer={prompt_token_buffer})"
        )

    def count_tokens(piece):
        # Same tokenizer call as before so counts stay comparable.
        tokenized = tokenizer(piece, return_tensors="pt", truncation=False)
        return tokenized.input_ids.shape[1]

    def split_oversize(piece):
        # Break a piece that exceeds the budget into parts that fit, using
        # the coarsest separator that actually divides it.
        for separator in ("\n", " "):
            units = [unit for unit in piece.split(separator) if unit.strip()]
            if len(units) > 1:
                break
        else:
            # No separator divides the piece: return it unchanged.
            return [piece]

        parts = []
        current = ""
        for unit in units:
            candidate = current + separator + unit if current else unit
            if count_tokens(candidate) <= token_budget:
                current = candidate
                continue
            if current:
                parts.append(current)
            if count_tokens(unit) <= token_budget:
                current = unit
            else:
                # The unit alone is too large: split it at the next finer
                # level and keep packing onto its last part.
                *complete, current = split_oversize(unit)
                parts.extend(complete)
        if current:
            parts.append(current)
        return parts

    chunks = []
    current_chunk = ""

    for paragraph in text.split('\n\n'):
        test_chunk = current_chunk + paragraph + "\n\n"
        if count_tokens(test_chunk) <= token_budget:
            current_chunk = test_chunk
            continue

        # The paragraph does not fit: close the chunk built so far.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        if count_tokens(paragraph) <= token_budget:
            current_chunk = paragraph + "\n\n"
        else:
            # Oversize paragraph: emit the full parts and let the last one
            # continue to absorb the following paragraphs.
            *complete, remainder = split_oversize(paragraph)
            chunks.extend(part.strip() for part in complete)
            current_chunk = remainder + "\n\n"

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    print(f"Text divided into {len(chunks)} chunks (max {max_tokens - prompt_token_buffer} + buffer for prompt tokens)")
    return chunks



# Function to run inference with model on chunked text
def process_text_with_model(model_id, text, task_type="summarize", max_new_tokens=300):
    """Load a causal language model and run one task over ``text`` chunk by chunk.

    The text is split with ``chunk_text``, each chunk is wrapped in the
    Italian prompt for ``task_type``, and only the newly generated tokens are
    kept as that chunk's answer.

    Args:
        model_id: Model identifier passed to ``from_pretrained``.
        text: Full input text.
        task_type: One of ``"summarize"``, ``"pro_drop_analysis"`` or
            ``"coreference"``.
        max_new_tokens: Maximum number of tokens generated per chunk.

    Returns:
        The per-chunk answers joined by blank lines.

    Raises:
        ValueError: If ``task_type`` is not one of the supported tasks.
    """
    # Local import: only this function needs the quantization config class.
    from transformers import BitsAndBytesConfig

    # Flat templates: the former triple-quoted f-strings leaked the source
    # indentation and a trailing whitespace line into the model input.
    prompt_templates = {
        "summarize": "Riassumi il seguente testo:\n\n{chunk}\n\nRiassunto:",
        "pro_drop_analysis": (
            "Analizza il seguente testo ed esplicita tutti i soggetti quando sono omessi:"
            "\n\n{chunk}\n\nTesto con soggetti esplicitati:"
        ),
        "coreference": (
            "Analizza il seguente testo e identifica tutti i riferimenti anaforici:"
            "\n\n{chunk}\n\nAnalisi dei riferimenti:"
        ),
    }
    # Reject a bad task before the expensive model download/load, and even
    # when the text yields no chunks.
    if task_type not in prompt_templates:
        raise ValueError("Unknown task type")
    prompt_template = prompt_templates[task_type]

    print(f"\nLoading model: {model_id}")

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    try:
        # Passing load_in_8bit directly to from_pretrained is deprecated;
        # the same options go through a BitsAndBytesConfig object.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map="auto",
            quantization_config=BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True,
            ),
        )
        print("Model loaded with 8-bit + CPU offload.")
    except ValueError as e:
        print(f"\n Warning during model load: {e}")
        print("Falling back to CPU with full precision...")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map={"": "cpu"},
            torch_dtype=torch.float32
        )
        print("Model loaded on CPU with full precision.")

    # Reserve room for the prompt wording plus the tokens to be generated;
    # this stays at 500 for the default max_new_tokens of 300.
    prompt_token_buffer = max(500, max_new_tokens + 200)
    chunks = chunk_text(text, tokenizer, max_tokens=4096, prompt_token_buffer=prompt_token_buffer)

    # Pass the pad token explicitly so generate() does not warn on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    results = []

    for i, chunk in enumerate(chunks):
        print(f"\nProcessing chunk {i+1}/{len(chunks)}...")

        prompt = prompt_template.format(chunk=chunk)
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=pad_token_id,
            )

        # The output sequence starts with the prompt tokens. Slice them off
        # by length: comparing decoded text to the prompt string is
        # unreliable because decoding does not reproduce it exactly.
        prompt_length = inputs["input_ids"].shape[1]
        generated_ids = output[0][prompt_length:]
        response_only = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        results.append(response_only)

        print(f"Response preview: {response_only[:150]}...")

    return "\n\n".join(results)

# Function to save output file
def save_results(model_name, task_type, results):
    """Write ``results`` to a text file and trigger a Colab download of it.

    The file is named ``<model_name>_<task_type>_results.txt`` and is created
    in the current working directory with UTF-8 encoding.

    Args:
        model_name: Short model name used as the file-name prefix.
        task_type: Task identifier used in the file name.
        results: Text to write.
    """
    output_path = f"{model_name}_{task_type}_results.txt"

    with open(output_path, mode='w', encoding='utf-8') as sink:
        sink.write(results)

    print(f"\nResults saved to {output_path}")
    # Hand the file to the browser through the Colab helper.
    files.download(output_path)

# Main execution
def main():
    """Interactive entry point: pick a model and a task, run it, save the output.

    Loads the input text, asks on standard input for a model number and a
    task number, runs ``process_text_with_model`` and passes the result to
    ``save_results``. Returns early, after printing a message, when the text
    cannot be loaded or a selection is invalid.
    """
    input_text = load_text_file("una_questione_privata_copia.txt")

    if not input_text:
        print("Text could not be loaded.")
        return

    print(f"Loaded text length: {len(input_text)} characters")

    models = {
        #"1": "iaslab/Cerbero-7b",
        #"2": "togethercomputer/Vitruvian-Italian-7B"
        # NOTE(review): this id has no "<organisation>/" prefix, unlike the
        # other entries - confirm it resolves (local directory or Hub repo).
        "1": "cerbero-7b-openchat",
        "2": "sapienzanlp/Minerva-7B-instruct-v1.0"
    }

    tasks = {
        "1": "summarize",
        "2": "pro_drop_analysis",
        "3": "coreference"
    }

    print("\nAvailable models:")
    for key, model in models.items():
        print(f"{key}: {model}")

    # Strip the answer so that "2 " or " 2" is not rejected as invalid.
    model_choice = input("\nSelect model number: ").strip()
    selected_model = models.get(model_choice)

    if not selected_model:
        print("Invalid model selection.")
        return

    print("\nAvailable tasks:")
    print("1: Summarize text")
    print("2: Analyze pro-drop (identify omitted subjects)")
    print("3: Analyze coreference mechanisms")

    task_choice = input("\nSelect task number: ").strip()
    selected_task = tasks.get(task_choice)

    if not selected_task:
        print("Invalid task selection.")
        return

    # Keep only the part after the organisation prefix for the file name.
    model_name = selected_model.split("/")[-1]
    results = process_text_with_model(selected_model, input_text, selected_task)
    save_results(model_name, selected_task, results)

# Run the interactive pipeline when this cell executes
# (main() prompts on standard input for the model and task numbers)
main()

Using device: cuda
Loaded file: una_questione_privata_copia.txt
Loaded text length: 231783 characters

Available models:
1: iaslab/Cerbero-7b
2: sapienzanlp/Minerva-7B-instruct-v1.0

Select model number: 2

Available tasks:
1: Summarize text
2: Analyze pro-drop (identify omitted subjects)
3: Analyze coreference mechanisms

Select task number: 1

Loading model: sapienzanlp/Minerva-7B-instruct-v1.0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (7532 > 4096). Running this sequence through the model will result in indexing errors


✅ Model loaded with 8-bit + CPU offload.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Text divided into 17 chunks (max 3596 + buffer for prompt tokens)

Processing chunk 1/17...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Response preview:  Riassumi il seguente testo:

Una questione privata
Beppe Fenoglio


I

La bocca socchiusa, le braccia abbandonate lungo i
fianchi, Milton guardava la...

Processing chunk 2/17...


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Response preview:  Riassumi il seguente testo:

La custode spiò dall’angolo. – Un partigiano! Cosa
vuole? Chi cerca? Ma lei è...
– Sono proprio io, – disse Milton senza...

Processing chunk 3/17...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Response preview:  Riassumi il seguente testo:

III

Riassunto:
Il testo descrive l'impatto dell'immigrazione sulla società e sulle economie dei paesi ospitanti. L'auto...

Processing chunk 4/17...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Response preview:  Riassumi il seguente testo:

Rientrarono a Treiso verso le sei. La strada sfumava
sotto i loro piedi e gli ultimi chiarori sembravano con-centrarsi i...

Processing chunk 5/17...


In [None]:
import gc
import torch

# Post-run cleanup: run Python's garbage collector first, then ask PyTorch
# to release its cached CUDA memory (skipped when no GPU is available).
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
