In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pickle
import os

# ========== CONFIG ==========
data_path = "/home/liorkob/M.Sc/thesis/pre-train/punishment_prediction/merged_output.csv"
model_save_path = "./baseline_punishment_model"
test_size = 0.2
random_state = 42

# TF-IDF parameters
max_features = 10000
min_df = 2
max_df = 0.95
ngram_range = (1, 2)  # unigrams and bigrams

print("🔧 Loading and preprocessing data...")

# ========== LOAD DATA ==========
df = pd.read_csv(data_path).dropna(subset=["extracted_gpt_facts", "low", "high"])

# Convert targets to numeric
df["low"] = pd.to_numeric(df["low"], errors='coerce')
df["high"] = pd.to_numeric(df["high"], errors='coerce')

# Remove rows where conversion failed
df = df.dropna(subset=["low", "high"])

print(f"📊 Dataset shape: {df.shape}")
print(f"📈 Low range: {df['low'].min():.1f} - {df['low'].max():.1f}")
print(f"📈 High range: {df['high'].min():.1f} - {df['high'].max():.1f}")

# ========== PREPARE FEATURES AND TARGETS ==========
X_text = df["extracted_gpt_facts"].astype(str)
y = df[["low", "high"]].values

# Split the data
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=test_size, random_state=random_state
)

print(f"🔄 Training set size: {len(X_train_text)}")
print(f"🔄 Test set size: {len(X_test_text)}")

# ========== TF-IDF VECTORIZATION ==========
print("🔤 Creating TF-IDF features...")

tfidf = TfidfVectorizer(
    max_features=max_features,
    min_df=min_df,
    max_df=max_df,
    ngram_range=ngram_range,
    stop_words='english',
    lowercase=True,
    strip_accents='unicode'
)

X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

print(f"📐 TF-IDF feature shape: {X_train_tfidf.shape}")

# ========== MODEL TRAINING ==========
print("🤖 Training baseline models...")

# Create save directory
os.makedirs(model_save_path, exist_ok=True)

# Approach 1: Multi-output Logistic Regression (treating as regression)
print("  📊 Training Multi-output Logistic Regression...")
logistic_model = MultiOutputRegressor(
    LogisticRegression(max_iter=1000, random_state=random_state)
)

# Scale targets for better logistic regression performance
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

try:
    logistic_model.fit(X_train_tfidf, y_train_scaled)
    logistic_pred_scaled = logistic_model.predict(X_test_tfidf)
    logistic_pred = scaler_y.inverse_transform(logistic_pred_scaled)
    logistic_success = True
except Exception as e:
    print(f"  ⚠️ Logistic regression failed: {e}")
    logistic_success = False

# Approach 2: Random Forest (usually more robust for this type of problem)
print("  🌲 Training Random Forest Regressor...")
rf_model = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=100, random_state=random_state, n_jobs=-1)
)

rf_model.fit(X_train_tfidf, y_train)
rf_pred = rf_model.predict(X_test_tfidf)

# ========== EVALUATION ==========
def evaluate_predictions(y_true, y_pred, model_name):
    print(f"\n📊 {model_name} Results:")
    
    # Overall metrics
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    
    print(f"  📈 MAE: {mae:.3f}")
    print(f"  📈 RMSE: {rmse:.3f}")
    print(f"  📈 R²: {r2:.3f}")
    
    # Per-target metrics
    for i, target in enumerate(['low', 'high']):
        target_mae = mean_absolute_error(y_true[:, i], y_pred[:, i])
        target_r2 = r2_score(y_true[:, i], y_pred[:, i])
        print(f"  📊 {target.upper()} - MAE: {target_mae:.3f}, R²: {target_r2:.3f}")
    
    return mae, rmse, r2

# Evaluate models
print("🎯 Evaluating models...")

# Random baseline (for comparison)
random_pred = np.random.uniform(
    low=[y_train[:, 0].min(), y_train[:, 1].min()],
    high=[y_train[:, 0].max(), y_train[:, 1].max()],
    size=y_test.shape
)
random_mae, random_rmse, random_r2 = evaluate_predictions(y_test, random_pred, "Random Baseline")

# Random Forest
rf_mae, rf_rmse, rf_r2 = evaluate_predictions(y_test, rf_pred, "Random Forest")

# Logistic Regression (if successful)
if logistic_success:
    log_mae, log_rmse, log_r2 = evaluate_predictions(y_test, logistic_pred, "Logistic Regression")

# ========== SAMPLE PREDICTIONS ==========
print(f"\n🔍 Sample Predictions (first 10 test cases):")
print(f"{'True Low':<10} {'True High':<10} {'RF Low':<10} {'RF High':<10}", end="")
if logistic_success:
    print(f" {'LR Low':<10} {'LR High':<10}")
else:
    print()

for i in range(min(10, len(y_test))):
    print(f"{y_test[i,0]:<10.1f} {y_test[i,1]:<10.1f} {rf_pred[i,0]:<10.1f} {rf_pred[i,1]:<10.1f}", end="")
    if logistic_success:
        print(f" {logistic_pred[i,0]:<10.1f} {logistic_pred[i,1]:<10.1f}")
    else:
        print()

# ========== SAVE MODELS ==========
print(f"\n💾 Saving models to {model_save_path}...")

# Save the best performing model and components
with open(f"{model_save_path}/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

with open(f"{model_save_path}/random_forest_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)

if logistic_success:
    with open(f"{model_save_path}/logistic_model.pkl", "wb") as f:
        pickle.dump(logistic_model, f)
    with open(f"{model_save_path}/target_scaler.pkl", "wb") as f:
        pickle.dump(scaler_y, f)

# Save results summary
results = {
    'random_baseline': {'mae': random_mae, 'rmse': random_rmse, 'r2': random_r2},
    'random_forest': {'mae': rf_mae, 'rmse': rf_rmse, 'r2': rf_r2}
}

if logistic_success:
    results['logistic_regression'] = {'mae': log_mae, 'rmse': log_rmse, 'r2': log_r2}

with open(f"{model_save_path}/results.pkl", "wb") as f:
    pickle.dump(results, f)

print("✅ Models and results saved successfully!")

# ========== FEATURE IMPORTANCE ==========
print(f"\n🔍 Top 20 Important Features (Random Forest):")
feature_names = tfidf.get_feature_names_out()

# Get feature importance for both outputs
importance_low = rf_model.estimators_[0].feature_importances_
importance_high = rf_model.estimators_[1].feature_importances_

# Combined importance
combined_importance = (importance_low + importance_high) / 2
top_indices = np.argsort(combined_importance)[-20:][::-1]

for i, idx in enumerate(top_indices):
    print(f"  {i+1:2d}. {feature_names[idx]:<20} (importance: {combined_importance[idx]:.4f})")

# ========== SUMMARY ==========
print(f"\n📋 SUMMARY:")
print(f"🎯 Best model: {'Random Forest' if rf_r2 > (log_r2 if logistic_success else -1) else 'Logistic Regression'}")
print(f"📊 Improvement over random: {((rf_mae - random_mae) / random_mae * 100):.1f}% MAE reduction")
print(f"🔤 TF-IDF features: {X_train_tfidf.shape[1]:,}")
print(f"📈 Best R²: {max(rf_r2, log_r2 if logistic_success else rf_r2):.3f}")

if rf_r2 > 0.1:  # Arbitrary threshold for "decent" performance
    print("✅ Model shows promise - significantly better than random!")
else:
    print("⚠️ Model performance is limited - may need feature engineering or different approach")

🔧 Loading and preprocessing data...
📊 Dataset shape: (2502, 7)
📈 Low range: 0.0 - 144.0
📈 High range: 2.0 - 192.0
🔄 Training set size: 2001
🔄 Test set size: 501
🔤 Creating TF-IDF features...
📐 TF-IDF feature shape: (2001, 10000)
🤖 Training baseline models...
  📊 Training Multi-output Logistic Regression...
  ⚠️ Logistic regression failed: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.
  🌲 Training Random Forest Regressor...
🎯 Evaluating models...

📊 Random Baseline Results:
  📈 MAE: 68.516
  📈 RMSE: 83.149
  📈 R²: -24.729
  📊 LOW - MAE: 62.024, R²: -31.220
  📊 HIGH - MAE: 75.009, R²: -18.237

📊 Random Forest Results:
  📈 MAE: 9.135
  📈 RMSE: 13.298
  📈 R²: 0.411
  📊 LOW - MAE: 7.052, R²: 0.410
  📊 HIGH - MAE: 11.219, R²: 0.412

🔍 Sample Predictions (first 10 test cases):
True Low   True High  RF Low     RF High   
6.0        18.0       7.9        25.5      
24.0       48.0       12.