In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt
import joblib

# 1. Load the raw Elliptic CSVs (transaction features + class labels)
# NOTE: both files are read directly from ../data/raw, not from a processed artifact.
print("Loading Training Data...")
# The features file is read without a header row; column names are assigned below.
df = pd.read_csv('../data/raw/elliptic_txs_features.csv', header=None) # Simplified load for demo
# Read labels as strings so the string-keyed mapping below always matches,
# regardless of how pandas would otherwise infer the column dtype.
classes = pd.read_csv('../data/raw/elliptic_txs_classes.csv', dtype={'class': str})

# Quick Merge & Clean for the notebook demo
# 167 names = txId + time_step + 165 features; pandas raises if the file's
# column count differs, which doubles as a sanity check on the input.
df.columns = ['txId', 'time_step'] + [f'feat_{i}' for i in range(165)]
data = pd.merge(df, classes, on='txId')
# Encode labels: '1' -> 1, '2' -> 0, 'unknown' -> -1 (removed just below).
data['class'] = data['class'].map({'1': 1, '2': 0, 'unknown': -1})
# .map() turns any label outside the mapping into NaN, and NaN != -1 is True,
# so such rows would survive the filter below. Fail early with a clear message.
if data['class'].isna().any():
    raise ValueError(
        "Unexpected value in 'class' column; expected '1', '2' or 'unknown'"
    )
# Keep only the labelled rows.
data = data[data['class'] != -1]

# 2. Train/Test Split
# Features are every column except the identifier, the label and the time step.
X = data.drop(columns=['txId', 'class', 'time_step'])
y = data['class']
# Hold out 30% for testing with a fixed seed. Stratify on the label so the
# train and test sets keep the same class proportions; a plain random split
# can under-represent the rarer class in the held-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 3. Train Model
print("Training Prototype Model...")
# 50-tree random forest with balanced class weights, using all CPU cores.
# random_state is fixed (same seed as the train/test split) so the fitted
# model and the metrics reported below are reproducible across runs.
model = RandomForestClassifier(
    n_estimators=50,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
model.fit(X_train, y_train)

# 4. Visualize Results (Confusion Matrix)
print("Evaluating...")
# Predict once and reuse the result for both the matrix and the report.
y_pred = model.predict(X_test)
# Pin the label order to [0, 1] so the matrix is always 2x2 and its rows and
# columns line up with display_labels (0 -> 'Licit', 1 -> 'Illicit'), even if
# one class is absent from the test labels and predictions.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Licit', 'Illicit'])
disp.plot(cmap='Blues')
plt.title("Model Confusion Matrix")
plt.show()

# 5. Print Score
# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))