<a href="https://colab.research.google.com/github/IreneDeNevi/HumanizeText/blob/main/HumanizeText.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q transformers accelerate torch python-docx sentencepiece

In [4]:
# Hugging Face model identifier used when loading the tokenizer and model.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
BATCH_PARAGRAPHS = 4                # how many paragraphs to send per request (lower this if you hit OOM)
# Length budget handed to the text-generation pipeline for each request.
MAX_TOKENS_OUT = 800
# Instruction text placed in front of the paragraphs in every prompt.
INSTRUCTION = """
Rewrite the text in a natural, human-like academic English style.
Avoid patterns typical of large language models such as excessive coherence, overly generic transitions, or uniform sentence lengths.
Introduce natural variation in syntax and pacing, while preserving meaning, nuance, and academic rigor.
"""

In [None]:
import math
import os
import warnings

import torch
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Ask PyTorch whether a GPU is usable. The CUDA_VISIBLE_DEVICES environment
# variable is usually unset even on GPU runtimes, so testing it sent the
# pipeline to the CPU.
USE_GPU = torch.cuda.is_available()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Half precision on GPU roughly halves the memory needed for the weights;
# on CPU we stay in float32.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if USE_GPU else torch.float32,
)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if USE_GPU else -1,
)

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

In [1]:
def chunk_paragraphs(paragraph_texts, n):
    """Lazily yield consecutive slices of ``paragraph_texts`` with at most ``n`` items each.

    The last slice is shorter when the length is not a multiple of ``n``.
    """
    offsets = range(0, len(paragraph_texts), n)
    yield from (paragraph_texts[start:start + n] for start in offsets)

def _split_rewritten_output(out_text, expected_count):
    """Split model output into ``expected_count`` pieces, or return None.

    The "###" marker is tried when present (empty pieces from a leading or
    trailing marker are dropped); otherwise blank lines are used. None means
    the output cannot be mapped back onto the inputs one-to-one.
    """
    separator = "###" if "###" in out_text else "\n\n"
    pieces = [piece.strip() for piece in out_text.split(separator) if piece.strip()]
    return pieces if len(pieces) == expected_count else None


def humanize_text_batch(texts):
    """Rewrite each string in ``texts`` with the model and return one result per input.

    Args:
        texts: sequence of paragraph strings.

    Returns:
        A list with the same length and order as ``texts``. Blank inputs are
        returned unchanged; an input whose rewrite comes back empty keeps its
        original text.

    Non-blank inputs are first sent together, separated by a "###" marker.
    If the output cannot be split into exactly one piece per input, each
    paragraph is regenerated on its own instead of copying the whole output
    into every slot.

    NOTE(review): instruct-tuned models usually expect their chat template;
    the prompt here is sent as plain text - confirm this is intended.
    """
    texts = list(texts)
    results = list(texts)
    # Only paragraphs with visible text are worth a model call.
    pending = [idx for idx, text in enumerate(texts) if text.strip()]
    if not pending:
        return results

    def generate(body):
        prompt = f"{INSTRUCTION}\n\n{body}"
        outputs = pipe(
            prompt,
            # max_length would also count the prompt tokens, leaving little or
            # no room for the rewrite when the batch is long.
            max_new_tokens=MAX_TOKENS_OUT,
            do_sample=False,
            # Without this the pipeline echoes the prompt in generated_text.
            return_full_text=False,
        )
        return outputs[0]['generated_text'].strip()

    rewritten = None
    if len(pending) > 1:
        joined = "\n\n###\n\n".join(texts[idx] for idx in pending)
        rewritten = _split_rewritten_output(generate(joined), len(pending))
    if rewritten is None:
        # Single input, or the batch output could not be aligned with the inputs.
        rewritten = [generate(texts[idx]) for idx in pending]

    for idx, new_text in zip(pending, rewritten):
        results[idx] = new_text or texts[idx]
    return results

def _set_paragraph_text(paragraph, new_text):
    """Replace a paragraph's text while leaving its style and non-text runs alone.

    The new text goes into the first run that already carries text, so that
    run's character formatting is kept; the remaining text runs are emptied.
    Runs without text (for example picture-only runs) are not modified,
    because assigning ``run.text`` replaces the run's content.
    NOTE(review): a run holding both text and a picture would still lose the
    picture - confirm whether such runs occur in the target documents.
    """
    text_runs = [run for run in paragraph.runs if run.text]
    if not text_runs:
        paragraph.add_run(new_text)
        return
    text_runs[0].text = new_text
    for leftover in text_runs[1:]:
        leftover.text = ''


def _humanize_paragraphs(paragraphs, batch_size):
    """Rewrite, in place, every paragraph in ``paragraphs`` that has visible text.

    Blank paragraphs are skipped entirely: they are neither sent to the model
    nor modified. Texts are sent in groups of ``batch_size``.
    """
    targets = [par for par in paragraphs if par.text.strip()]
    source_texts = [par.text for par in targets]
    rewritten = []
    for chunk in chunk_paragraphs(source_texts, batch_size):
        rewritten.extend(humanize_text_batch(chunk))
    for par, new_text in zip(targets, rewritten):
        _set_paragraph_text(par, new_text)


def rewrite_docx_preserve_structure(input_path, output_path):
    """Rewrite the text of a .docx file and save the result to ``output_path``.

    Processes, in order: the body paragraphs, the paragraphs inside the cells
    of the document's top-level tables, and the paragraphs of each section's
    header and footer. Paragraph styles, tables and non-text runs are left in
    place; only text-bearing runs are changed.

    Args:
        input_path: path of the document to read.
        output_path: path the rewritten document is saved to.

    Returns:
        ``output_path``.

    Limitations: tables nested inside cells and tables inside headers/footers
    are not visited.
    """
    doc = Document(input_path)

    # 1) Body paragraphs, batched for efficiency.
    _humanize_paragraphs(list(doc.paragraphs), BATCH_PARAGRAPHS)

    # 2) Table cells, one paragraph per request to keep the calls small.
    #    A merged cell is reported once per grid position by row.cells, so
    #    remember the underlying element to avoid rewriting it repeatedly.
    seen_cells = set()
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                cell_element = cell._tc
                if cell_element in seen_cells:
                    continue
                seen_cells.add(cell_element)
                _humanize_paragraphs(cell.paragraphs, 1)

    # 3) Headers and footers (best-effort: a failure here is reported but
    #    does not stop the document from being saved).
    for section in doc.sections:
        try:
            for part in (section.header, section.footer):
                # A linked header/footer shows the previous section's content,
                # which has already been handled (or does not exist).
                if part.is_linked_to_previous:
                    continue
                _humanize_paragraphs(part.paragraphs, 1)
        except Exception as exc:
            warnings.warn(f"Skipped a header/footer that could not be rewritten: {exc!r}")

    # Save the output.
    doc.save(output_path)
    return output_path







[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m245.8/253.0 kB[0m [31m19.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
from google.colab import files

uploaded = files.upload()

# Nothing to process if no file came back (e.g. the upload dialog was
# cancelled); fail with a clear message instead of an IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a .docx document.")

# Only the first uploaded file is processed.
input_file = next(iter(uploaded))
output_file = 'output_humanized.docx'

# Rewrite the document.
output = rewrite_docx_preserve_structure(input_file, output_file)

# Download the resulting file.
files.download(output)
