# Featuristic Example: Text Classification

This notebook demonstrates how to use Featuristic to classify text documents based on their content.

It uses a dataset of news articles, each of which falls into one of the following categories:
- Related to the war between Russia and Ukraine
- Related to the diplomatic relations between the US and UK

In [None]:
import sys
import os

# add featuristic library to PYTHONPATH
sys.path.append(os.path.abspath("./../"))

from featuristic import FeaturisticClassifier
from featuristic import PromptFeatureDefinition, PromptFeatureConfiguration, extract_features
from featuristic import Distribution

from sklearn.model_selection import train_test_split
import json
import numpy as np
import os

## Setup API variables

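Configure your API key, base URL, API version, and model. The values are read from the `API_KEY`, `API_BASE`, `API_VERSION`, and `MODEL` environment variables.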

In [None]:
# Connection settings come from the environment: set API_KEY, API_BASE,
# API_VERSION and MODEL before starting the kernel. A variable that is not
# set is read back as None.
API_KEY, API_BASE, API_VERSION, MODEL = map(
    os.getenv, ("API_KEY", "API_BASE", "API_VERSION", "MODEL")
)

## Data Loading and Preparation

Load data from JSON files and prepare training and testing datasets.

In [None]:
def load_data(path):
    """Load the ``text`` field of every record in a JSON Lines file.

    Each non-blank line is parsed as one JSON document. Records without a
    ``"text"`` key are skipped. The text is stripped of surrounding
    whitespace and every double newline is collapsed to a single newline.

    Args:
        path: Path to a file containing one JSON document per line.

    Returns:
        A list of cleaned text strings, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    texts = []
    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not records.
            if not line.strip():
                continue
            # Parse once and reuse the result for both the test and the lookup.
            record = json.loads(line)
            if "text" in record:
                texts.append(record["text"].strip().replace("\n\n", "\n"))
    return texts

In [None]:
# Settings a reader may want to tune, kept in one place instead of being
# repeated as literals below.
TEST_SIZE = 0.2      # fraction of the documents held out for testing
RANDOM_STATE = 42    # seed passed to train_test_split
MAX_SAMPLES = 10     # for demonstration purposes, use small samples

# Load the datasets: Russia-Ukraine articles are labelled 1,
# UK-US relationship articles are labelled 0.
russia_ukraine = load_data("data/russia_ukraine_2025.jsonl")
ones = np.ones(len(russia_ukraine))

uk_us_relationship = load_data("data/uk_us_relationship.jsonl")
zeros = np.zeros(len(uk_us_relationship))

# Documents and labels are concatenated in the same order so they stay aligned.
X = russia_ukraine + uk_us_relationship
y = np.concatenate([ones, zeros])

# Split into training and testing datasets
data_train, data_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# Keep only the first MAX_SAMPLES items of each split.
data_train = data_train[:MAX_SAMPLES]
y_train = y_train[:MAX_SAMPLES]

data_test = data_test[:MAX_SAMPLES]
y_test = y_test[:MAX_SAMPLES]

print(f"Training samples: {len(data_train)}")
print(f"Testing samples: {len(data_test)}")

## Data Examples

In [None]:
import random

# A dedicated, seeded generator makes the displayed examples identical on
# every Restart & Run All, without touching the global `random` state.
example_rng = random.Random(42)

# Get a random example from each class
random_russia_ukraine = example_rng.choice(russia_ukraine)
random_uk_us_relationship = example_rng.choice(uk_us_relationship)

print("==== Example from Russia-Ukraine class ====")
print(random_russia_ukraine)
print("\n\n==== Example from UK-US relationship class ===")
print(random_uk_us_relationship)

## Feature Definition

Define LLM-based features we'll use for classification.

In [None]:
# Shared configuration object passed to every prompt-based feature below.
# The connection values are the API_* / MODEL settings defined earlier.
# NOTE(review): use_cache presumably enables caching of LLM responses;
# confirm against the featuristic documentation.
config = PromptFeatureConfiguration(
    model=MODEL,
    api_key=API_KEY,
    api_base=API_BASE,
    api_version=API_VERSION,
    use_cache=True,
)

# Helper function for text proportion features
def as_propotion_of_text(x, text):
    """Express a count as a fraction of the number of words in ``text``.

    Words are the whitespace-separated tokens of ``text``.

    Args:
        x: The raw count to normalise.
        text: The document the count was taken from.

    Returns:
        ``x`` divided by the word count of ``text``, or ``0.0`` when the
        text contains no words (empty or whitespace only).
    """
    word_count = len(text.split())
    # A text with no words cannot contain any mentions; avoid dividing by zero.
    if word_count == 0:
        return 0.0
    return x / word_count

In [None]:
# Define features for classification.
#
# Every feature is one of two shapes, so each shape gets a small factory
# instead of repeating the same keyword arguments eight times.

def _flag_feature(name, prompt):
    """Build a boolean feature with a Bernoulli distribution."""
    return PromptFeatureDefinition(
        name=name,
        prompt=prompt,
        llm_return_type=bool,
        distribution=Distribution.BERNOULLI,
        config=config,
    )


def _mention_rate_feature(name, prompt):
    """Build an integer count feature with a Gaussian distribution.

    The count is post-processed by ``as_propotion_of_text``.
    """
    return PromptFeatureDefinition(
        name=name,
        prompt=prompt,
        llm_return_type=int,
        feature_post_callback=as_propotion_of_text,
        distribution=Distribution.GAUSSIAN,
        config=config,
    )


mention_of_war = _flag_feature(
    "mention_of_war",
    "Whether or not the notion of war is mentioned",
)

# Prompt spelling corrected ("casualities" -> "casualties").
mention_of_casualties = _flag_feature(
    "mention_of_casualties",
    "Whether or not the notion of casualties are mentioned",
)

mentions_weapons = _flag_feature(
    "mentions_weapons",
    "Whether or not the notion of weapons are mentioned",
)

mentions_US = _mention_rate_feature(
    "mentions_US",
    "A count of references to the United States",
)

mentions_Russia = _mention_rate_feature(
    "mentions_Russia",
    "A count of references to Russians, Russia, or a place in Russia",
)

mentions_Ukraine = _mention_rate_feature(
    "mentions_Ukraine",
    "A count of references to Ukrainians, Ukraine, or any place in Ukraine",
)

# Prompt spelling corrected ("Vladamir" -> "Vladimir").
mentions_Putin = _mention_rate_feature(
    "mentions_Putin",
    "A count of the references to Vladimir Putin",
)

russian_ukraine_theme = _flag_feature(
    "russian_ukraine_theme",
    "Whether or not the theme of the article is about the war between Russia and Ukraine",
)

# Collect all feature definitions. The order matters: the classifier cell
# builds its list of distributions by iterating over this list.
feature_definitions = [
    mention_of_war,
    mention_of_casualties,
    mentions_weapons,
    mentions_US,
    mentions_Russia,
    mentions_Ukraine,
    mentions_Putin,
    russian_ukraine_theme,
]

## Classification Model

Create a Featuristic classifier using our feature definitions.

In [None]:
# Build the classifier from the distribution declared on each feature,
# in the same order as `feature_definitions`.
featuristic_classifier = FeaturisticClassifier(
    distributions=[feature.distribution for feature in feature_definitions],
)

In [None]:
# Main training and testing routine
async def train_and_test():
    """Fit the classifier on the training split and predict the test split.

    Uses the notebook-level ``data_train``, ``y_train``, ``data_test``,
    ``feature_definitions`` and ``featuristic_classifier``. Features are
    extracted with ``extract_features`` for each split, the classifier is
    fitted on the training features, and predictions are made for the test
    features. Progress messages are printed along the way.

    Returns:
        The classifier's predictions for ``data_test`` as a NumPy array.
    """
    print("Extracting features from training data...")
    train_features = await extract_features(
        data=data_train, feature_definitions=feature_definitions
    )

    print("\nTraining classifier...")
    featuristic_classifier.fit(features=train_features, Y=y_train)

    print("\nExtracting features from test data...")
    test_features = await extract_features(
        data=data_test, feature_definitions=feature_definitions
    )

    print("\nMaking predictions...")
    # Convert to an array so the caller can compare element-wise with y_test.
    return np.array(featuristic_classifier.predict(test_features))

## Train and Evaluate the Model

In [None]:
# Run the training and testing process
predictions = await train_and_test()

# Calculate and display results
correct = np.sum(predictions == y_test)
total = len(y_test)
accuracy = correct/total

print(f"\nResults:")
print(f"Correct predictions: {correct}/{total}")
print(f"Accuracy: {accuracy:.2f}")