In [4]:
import pandas as pd
import numpy as np
import joblib
import re
import spacy
from collections import Counter
from textstat import flesch_reading_ease

# 1. CARICAMENTO ASSET E MODELLI
print("üîÑ Caricamento in corso...")
nlp = spacy.load("en_core_web_sm")
model = joblib.load('best_model_Random_Forest.pkl')
scaler = joblib.load('scaler.pkl')

def extract_live_features(text):
    """
    Estrae le feature esattamente come nel tuo signature.ipynb,
    includendo le nuove firme 'Hapax' e 'Template'.
    """
    doc = nlp(text)
    words = [t.text.lower() for t in doc if not t.is_punct and not t.is_space]
    sentences = list(doc.sents)
    
    # --- Feature Standard (dal tuo modeling.ipynb) ---
    avg_word_len = np.mean([len(w) for w in words]) if words else 0
    lexical_diversity = len(set(words)) / len(words) if words else 0
    sent_lengths = [len([t for t in s if not t.is_punct]) for s in sentences]
    sent_len_std = np.std(sent_lengths) if len(sent_lengths) > 1 else 0
    
    # --- Nuove Firme Avanzate ---
    # Hapax Density
    word_counts = Counter(words)
    hapax_count = sum(1 for w in word_counts if word_counts[w] == 1)
    hapax_density = hapax_count / len(words) if words else 0
    
    # Template Bias Score
    t_score = 0
    if any(re.search(p, text, re.MULTILINE) for p in [r'^\s*[\-\*‚Ä¢]\s+', r'^\s*\d+[\.\)]\s+']):
        t_score += 1.5
    if any(p in text.lower() for p in ['in conclusion', 'overall', 'to summarize']):
        t_score += 1.2
        
    # --- Altre feature presenti nel tuo dataset ---
    flesch = flesch_reading_ease(text)
    first_person = len(re.findall(r'\b(i|my|me|mine|we|our)\b', text.lower()))
    
    # CREAZIONE DATAFRAME (L'ordine deve essere identico a quello del training!)
    # Nota: Assicurati che i nomi qui sotto siano identici a quelli in 'feature_list.txt'
    feat_dict = {
        'avg_word_length': avg_word_len,
        'lexical_diversity': lexical_diversity,
        'sentence_length_variability': sent_len_std,
        'hapax_density': hapax_density,
        'template_bias_score': t_score,
        'readability_score': flesch,
        'first_person_count': first_person
    }
    
    return pd.DataFrame([feat_dict])

def predict_text(text):
    # Estrazione
    features_df = extract_live_features(text)
    
    # Scaling
    features_scaled = scaler.transform(features_df)
    
    # Predizione probabilit√†
    prob_ai = model.predict_proba(features_scaled)[0][1]
    
    print("\n" + "="*50)
    print("üî¨ DIAGNOSI STILOMETRICA")
    print("="*50)
    print(f"Probabilit√† IA: {prob_ai*100:.2f}%")
    
    if prob_ai > 0.5:
        print("\nRISULTATO: ü§ñ Testo generato da IA")
        print("Sospetti principali: Ritmo troppo regolare o vocabolario poco vario.")
    else:
        print("\nRISULTATO: ‚úçÔ∏è Testo scritto da un Umano")
        print("Segni distintivi: Imperfezioni naturali e alta ricchezza lessicale.")
    print("="*50)

# --- PROVA QUI ---
testo = 'ciao a tutti'
predict_text(testo)

üîÑ Caricamento in corso...


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- hapax_density
- template_bias_score
Feature names seen at fit time, yet now missing:
- avg_sentence_length
- burstiness_index
- clause_density
- contraction_density
- dependency_depth_mean
- ...
