## SetFit model training using the generated JSON file
### `breaking_bad_analysisV2.json` (produced by `M2_LLM_Data_Fetch_and_Processing.ipynb`)

In [None]:
# Install the dependencies listed in requirements.txt.
# The leading "!" is an IPython/Jupyter shell escape, so this line only works
# when executed as a notebook cell; "-q" asks pip for quiet output.
!pip install -r requirements.txt -q

# import libs
import json
import pandas as pd
from datasets import Dataset
from setfit import SetFitModel, SetFitTrainer
from sklearn.metrics import classification_report

def load_and_prepare_data():
    """Load the relationship JSON and build shuffled train/test datasets.

    Reads ``breaking_bad_analysisV2.json`` from the current working directory.
    Each entry in an episode's ``relationships`` list becomes one sample whose
    text is ``"<source> - <target>"`` and whose label is the entry's
    ``relation`` value. Episodes without a ``relationships`` key are skipped.

    The samples are shuffled with a fixed seed before the 80/20 split. The
    rows are collected episode by episode, so splitting them in file order
    would put only the last episodes in the test set and could leave it with
    labels the model never saw during training.

    Returns:
        tuple: ``(train_data, test_data)`` as ``datasets.Dataset`` objects
        with ``text`` and ``label`` columns.

    Raises:
        ValueError: If the file contains no relationship entries at all.
    """
    with open("breaking_bad_analysisV2.json", 'r', encoding='utf-8') as file:
        data = json.load(file)

    relationships = []
    labels = []

    for episode in data.values():
        for rel in episode.get('relationships', []):
            relationships.append(f"{rel['source']} - {rel['target']}")
            labels.append(rel['relation'])

    if not relationships:
        # Fail here with a clear message instead of letting training fail
        # later on an empty dataset.
        raise ValueError(
            "No relationships found in breaking_bad_analysisV2.json"
        )

    df = pd.DataFrame({'text': relationships, 'label': labels})

    # Shuffle reproducibly so both splits draw from every episode.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Split into train/test (80/20)
    train_size = int(len(df) * 0.8)

    # preserve_index=False keeps the pandas index out of the datasets, so the
    # only columns are 'text' and 'label'.
    train_data = Dataset.from_pandas(df.iloc[:train_size], preserve_index=False)
    test_data = Dataset.from_pandas(df.iloc[train_size:], preserve_index=False)

    return train_data, test_data

def train_and_evaluate():
    """Train a SetFit classifier, print test metrics, and save the model.

    Steps, in order:
      1. Fetch the train/test datasets from ``load_and_prepare_data`` and
         print their sizes.
      2. Load the ``paraphrase-mpnet-base-v2`` sentence-transformer as a
         ``SetFitModel`` and train it with ``SetFitTrainer``.
      3. Predict labels for the test texts and print a scikit-learn
         classification report plus the first three predictions.
      4. Save the model to the ``saved_model`` directory.

    Returns:
        The trained ``SetFitModel`` instance.
    """
    train_split, test_split = load_and_prepare_data()

    print(f"Training samples: {len(train_split)}")
    print(f"Testing samples: {len(test_split)}")

    # Pretrained sentence-transformer body, trained via SetFitTrainer.
    base_checkpoint = "sentence-transformers/paraphrase-mpnet-base-v2"
    classifier = SetFitModel.from_pretrained(base_checkpoint)

    fitter = SetFitTrainer(
        model=classifier,
        train_dataset=train_split,
        batch_size=16,
        num_iterations=20,
        num_epochs=1,
    )
    fitter.train()

    # Pull the evaluation columns once and reuse them below.
    test_texts = test_split['text']
    test_labels = test_split['label']
    predicted = classifier.predict(test_texts)

    print("\nClassification Report:")
    print(classification_report(test_labels, predicted))

    # Show the first three test samples next to their predictions.
    print("\nExample Predictions:")
    preview = zip(test_texts[:3], test_labels[:3], predicted[:3])
    for sample_text, expected, guessed in preview:
        print(f"\nText: {sample_text}")
        print(f"True: {expected}")
        print(f"Predicted: {guessed}")

    # Persist the trained model to disk.
    classifier.save_pretrained("saved_model")

    return classifier

# Run the whole pipeline (load data, train, evaluate, save) and keep the
# returned model in a notebook-level variable.
model = train_and_evaluate()