In [21]:
import joblib

In [24]:
import torch

In [29]:

import joblib
import numpy as np
import textstat
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Locations of the saved artifacts (make sure these paths are correct).
SCALER_PATH = 'pkl/feature_scaler.pkl'
MODEL_PATH = 'bert_ai_detector/'  # folder saved by the hybrid notebook
RF_MODEL_PATH = 'pkl/rf_hybrid_model.pkl'  # only relevant if a Random Forest was used as the final head

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load scaler files that come from a trusted source.
scaler = joblib.load(SCALER_PATH)

# The tokenizer and the classifier are both read from the same saved folder.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
bert_model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).eval()

# Leave the model as the cell's last expression so its architecture is displayed.
bert_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [38]:
import re
import collections # for hapax_density calculation
import nltk
# Download the NLTK resources requested for sentence tokenization, without log output.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

def extract_test_signatures(text, feature_names=None):
    """Compute the scaled stylometric feature vector for a single text.

    Only four features are actually derived from ``text``:
    ``lexical_compression_ratio``, ``hapax_density``, ``sentence_length_cv``
    and ``readability_oscillation`` (the latter is the plain Flesch-Kincaid
    grade, not an oscillation measure). Every other feature is a fixed
    placeholder constant whose only purpose is to give the vector the width
    the fitted scaler expects, so the result is an approximation of the
    training-time features, not a faithful reproduction.

    Args:
        text: Raw input text.
        feature_names: Optional ordered iterable of feature names. When
            omitted, the notebook-level ``stylometric_features`` is used; if
            that name is not defined, the names recorded on ``scaler``
            (``feature_names_in_``, when present) are used instead.

    Returns:
        The result of ``scaler.transform`` applied to a ``(1, n_features)``
        array whose columns follow the order of ``feature_names``.

    Raises:
        NameError: If no feature order can be determined from any source.
    """
    # Resolve the feature order without depending on hidden kernel state:
    # explicit argument -> notebook global -> names stored on the scaler.
    if feature_names is None:
        feature_names = globals().get('stylometric_features')
    if feature_names is None:
        # scikit-learn estimators fitted on a DataFrame expose this attribute;
        # NOTE(review): confirm the saved scaler was fitted that way.
        feature_names = getattr(scaler, 'feature_names_in_', None)
    if feature_names is None:
        raise NameError(
            "stylometric_features is not defined: define the ordered list of "
            "training feature names or pass feature_names explicitly"
        )
    feature_names = list(feature_names)

    # Basic text processing: lower-cased word tokens and NLTK sentences.
    words = re.findall(r'\b\w+\b', text.lower())
    sentences = nltk.sent_tokenize(text)
    num_words = len(words)

    # Every requested feature starts at 0.0 so missing ones stay well-defined.
    stats = {name: 0.0 for name in feature_names}

    # Placeholder constants for features whose real extraction logic is not
    # available in this notebook. For accurate predictions these must be
    # replaced by the original feature implementations.
    stats.update({
        'burstiness_index': 0.1,
        'pos_bigram_entropy': 0.5,
        'dependency_depth_mean': 1.0,
        'function_word_ratio': 0.3,
        'sentence_similarity_drift': 0.2,
        'structural_redundancy': 0.0,
        'sentiment_variance': 0.0,
        'clause_density': 1.0,
        'template_bias_score': 0.0,
    })

    if num_words > 0:
        word_counts = collections.Counter(words)
        # Type/token ratio, used as a proxy for the lexical compression ratio.
        stats['lexical_compression_ratio'] = len(word_counts) / num_words
        # Share of tokens whose word occurs exactly once in the text.
        hapax_count = sum(1 for count in word_counts.values() if count == 1)
        stats['hapax_density'] = hapax_count / num_words

    if sentences:
        sentence_word_counts = [len(s.split()) for s in sentences]
        mean_length = np.mean(sentence_word_counts)
        # Coefficient of variation; skipped when the mean is 0 to avoid 0/0.
        if mean_length > 0:
            stats['sentence_length_cv'] = np.std(sentence_word_counts) / mean_length

    # Flesch-Kincaid grade stands in for the readability oscillation feature.
    stats['readability_oscillation'] = textstat.flesch_kincaid_grade(text) if text else 0.0

    # Assemble the row in the resolved feature order, then scale it.
    features = np.array([stats[name] for name in feature_names]).reshape(1, -1)
    return scaler.transform(features)

In [None]:
def predict_ai_vs_human(text):
    """Label ``text`` as ``"AI"`` or ``"HUMAN"`` using the BERT classifier.

    The text is tokenized (truncated to 512 tokens), passed through
    ``bert_model`` without gradient tracking, and the softmax probability at
    class index 1 is read as the AI probability. That probability is printed
    and then compared against 0.5 to pick the label.

    The stylometric features are computed as well, but they do not influence
    the returned label: the final decision currently relies on BERT alone.

    Args:
        text: Text to classify.

    Returns:
        ``"AI"`` when the class-1 probability is above 0.5, else ``"HUMAN"``.
    """
    # Step 1 - BERT probability for the AI class.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = bert_model(**encoded).logits
        class_probs = torch.nn.functional.softmax(logits, dim=-1)
        bert_score = class_probs[0, 1].item()  # first sample, class index 1

    # Step 2 - stylometric signatures (computed, not yet used in the decision).
    sig_features = extract_test_signatures(text)

    # Step 3 - final decision (hybrid logic).
    # With a final Random Forest head, the scaled signatures and the BERT score
    # would be stacked horizontally and passed to that model's predict();
    # that path is not enabled here.

    # Simple output: report the BERT confidence and threshold it.
    print(f"BERT Confidence (AI): {bert_score:.2%}")
    if bert_score > 0.5:
        return "AI"
    return "HUMAN"

# PRACTICAL TEST: classify one short sample sentence and print the label
# returned by predict_ai_vs_human (which also prints the BERT confidence).
testo_da_testare = "Today I'll go to work and then to school because I like learning"
risultato = predict_ai_vs_human(testo_da_testare)
print(f"Risultato: {risultato}")

BERT Confidence (AI): 5.55%
Risultato: HUMAN
