# Model Training - Random Forest Keyword Classifier

This notebook trains and evaluates a Random Forest model for keyword relevance classification.

In [None]:
# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling, model selection and evaluation utilities
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
# Model persistence (used when the trained model is saved)
import joblib
import sys
# Make the directory two levels up importable -- presumably the project root
# that holds the `ml` package; confirm against the repository layout.
# NOTE(review): this is a relative entry, so it is resolved against whatever
# the current working directory is when an import actually runs.
sys.path.append('../..')

# Use seaborn's white-grid theme for every figure in this notebook
sns.set_style('whitegrid')

## Load Training Data

In [None]:
import os

# Work from the directory two levels above the notebook (presumably the project
# root -- confirm) so that `ml` imports and relative paths resolve from there.
# The resolved directory is remembered in PROJECT_ROOT: a plain
# `os.chdir('../..')` would climb two more levels every time this cell is
# re-run, which breaks the import below and every relative path afterwards.
if 'PROJECT_ROOT' not in globals():
    os.chdir('../..')
    PROJECT_ROOT = os.getcwd()
else:
    os.chdir(PROJECT_ROOT)

# Load feedback data with deduplication
from ml.train import load_feedback_data

# X: feature matrix, y: labels, stats: preprocessing counters printed below
X, y, stats = load_feedback_data()

print("Data preprocessing:")
print(f"  Original samples: {stats['original_samples']}")
print(f"  Duplicates removed: {stats['duplicates_removed']}")
print(f"  Conflicts dropped: {stats['conflicts_dropped']}")
print(f"  Clean samples: {stats['final_samples']}")

# y.mean() is reported as the approval rate -- assumes labels are 0/1 with
# 1 meaning "approved"; verify against load_feedback_data.
print(f"\nApproval rate: {y.mean():.2%}")
print(f"Feature shape: {X.shape}")
# np.bincount requires non-negative integer labels
print(f"Label distribution: {np.bincount(y)}")

## Train Baseline Model

In [None]:
# Baseline: a Random Forest built from the hyperparameters kept in ml/config.py
from ml.config import MODEL_PARAMS

model = RandomForestClassifier(**MODEL_PARAMS).fit(X, y)

# Estimate accuracy with k-fold cross-validation, using at most 5 folds.
# cross_val_score fits fresh clones of `model` on each training fold, so the
# fit above does not leak into these scores.
n_folds = min(5, len(X))
cv_scores = cross_val_score(model, X, y, scoring='accuracy', cv=n_folds)

mean_acc, std_acc = cv_scores.mean(), cv_scores.std()
print(f"Cross-validation accuracy: {mean_acc:.3f} (+/- {std_acc:.3f})")

## Feature Importance

In [None]:
# Plot feature importance
#
# Feature names: `feature_cols` was never defined anywhere in this notebook, so
# the cell raised NameError on a fresh kernel. Take the names from X when it is
# a DataFrame, otherwise fall back to positional names.
# NOTE(review): if the project exposes canonical feature names, import those
# here instead of the positional fallback.
if hasattr(X, 'columns'):
    feature_cols = list(X.columns)
else:
    feature_cols = [f'feature_{i}' for i in range(X.shape[1])]

# One row per feature, most important first. `model` is whichever estimator is
# currently bound to that name (the baseline model when cells run in order).
importance_df = (
    pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance Score')
fig.tight_layout()
plt.show()

print(importance_df)

## Hyperparameter Tuning

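Run an exhaustive grid search over Random Forest hyperparameters (`n_estimators`, `max_depth`, `min_samples_split`, `min_samples_leaf`), scored by cross-validated accuracy, to find the best configuration.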

In [None]:
# Exhaustive grid search over Random Forest hyperparameters
print("Running hyperparameter tuning (this may take a few minutes)...")

# 4 * 5 * 4 * 4 = 320 candidate configurations, each cross-validated
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[5, 10, 15, 20, None],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 8],
)

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=min(5, len(X)),  # same fold count as the baseline cross-validation
    n_jobs=-1,          # use all available cores
    verbose=1,
)
grid_search.fit(X, y)

# Baseline score comes from the earlier cross-validation of the config model
baseline_acc = cv_scores.mean()
tuned_acc = grid_search.best_score_

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {tuned_acc:.2%}")
print("\nComparison with default params:")
print(f"  Default (from config): {baseline_acc:.2%}")
print(f"  Tuned: {tuned_acc:.2%}")
print(f"  Improvement: {(tuned_acc - baseline_acc):.2%}")

# From here on `model` refers to the best estimator found by the search
model = grid_search.best_estimator_

## Hyperparameter Tuning Results

Visualizing how different parameters affect model performance.

In [None]:
# Visualize grid search results
results_df = pd.DataFrame(grid_search.cv_results_)


def _plot_param_effect(ax, scores, param_name):
    """Bar chart of mean CV accuracy (std as error bars) per value of one hyperparameter.

    ax: matplotlib Axes to draw on.
    scores: DataFrame indexed by parameter value with 'mean' and 'std' columns.
    param_name: hyperparameter name used for the x-label and the title.
    """
    positions = range(len(scores))
    ax.bar(positions, scores['mean'], yerr=scores['std'])
    ax.set_xticks(positions)
    ax.set_xticklabels([str(value) for value in scores.index])
    ax.set_xlabel(param_name)
    ax.set_ylabel('CV Accuracy')
    ax.set_title(f'Effect of {param_name}')
    ax.grid(alpha=0.3, axis='y')


# pandas treats max_depth=None as a missing value: an `== None` comparison
# never matches and groupby() drops missing keys by default, so the unlimited-
# depth configurations would silently vanish from the plots. Give every depth
# an explicit string label and select/group on that instead.
depth_labels = results_df['param_max_depth'].map(
    lambda d: 'None' if pd.isna(d) else str(int(d))
)
depth_order = depth_labels.unique().tolist()  # order of first appearance

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. n_estimators effect: one line per max_depth, averaged over the other params
ax = axes[0, 0]
for label in depth_order:
    subset = (
        results_df[depth_labels == label]
        .groupby('param_n_estimators')['mean_test_score']
        .mean()
    )
    ax.plot(subset.index, subset.values, marker='o', label=f'max_depth={label}')
ax.set_xlabel('n_estimators')
ax.set_ylabel('CV Accuracy')
ax.set_title('Effect of n_estimators')
ax.legend()
ax.grid(alpha=0.3)

# 2. max_depth effect (sort=False keeps the grid order, with 'None' included)
depth_scores = (
    results_df.groupby(depth_labels, sort=False)['mean_test_score']
    .agg(['mean', 'std'])
)
_plot_param_effect(axes[0, 1], depth_scores, 'max_depth')

# 3. min_samples_split effect
split_scores = results_df.groupby('param_min_samples_split')['mean_test_score'].agg(['mean', 'std'])
_plot_param_effect(axes[1, 0], split_scores, 'min_samples_split')

# 4. min_samples_leaf effect
leaf_scores = results_df.groupby('param_min_samples_leaf')['mean_test_score'].agg(['mean', 'std'])
_plot_param_effect(axes[1, 1], leaf_scores, 'min_samples_leaf')

plt.tight_layout()
plt.show()

# Top 10 configurations (rank 1 is the best mean test score)
print("\nTop 10 Parameter Combinations:")
print("="*80)
top_10 = results_df.nsmallest(10, 'rank_test_score')[
    ['param_n_estimators', 'param_max_depth', 'param_min_samples_split',
     'param_min_samples_leaf', 'mean_test_score', 'std_test_score']
]
for _, row in top_10.iterrows():
    print(f"\n{row['mean_test_score']:.2%} (±{row['std_test_score']:.2%})")
    print(f"  n_estimators={row['param_n_estimators']}, max_depth={row['param_max_depth']}, "
          f"min_samples_split={row['param_min_samples_split']}, min_samples_leaf={row['param_min_samples_leaf']}")

## Save Model

In [None]:
# Save trained model
from datetime import datetime
import os

# Timestamped file name so successive runs never overwrite each other
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# NOTE(review): this path is relative to the current working directory, which
# the data-loading cell changed with os.chdir('../..'); confirm that a `models`
# directory one level above that location is really the intended target.
model_path = f'../models/rf_model_{timestamp}.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

print(f"Model saved to: {model_path}")
print(f"Number of samples: {len(X)}")
# `model` is the tuned best_estimator_ at this point, so report the tuned CV
# score; `cv_scores` belongs to the default-parameter baseline model.
print(f"CV Accuracy: {grid_search.best_score_:.3f}")

## Model Summary

In [None]:
# Gather the headline numbers of this training run in one place
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# NOTE(review): `top_features` is read from importance_df, which was computed
# in the feature-importance cell from the model bound at that time (the
# baseline when cells run in order), not from the tuned model -- confirm.
summary = dict(
    timestamp=timestamp,
    num_samples=len(X),
    num_features=X.shape[1],
    cv_accuracy=best_score,
    cv_std=results_df.loc[grid_search.best_index_, 'std_test_score'],
    best_params=best_params,
    improvement_over_default=best_score - cv_scores.mean(),
    top_features=importance_df['feature'].head(3).tolist(),
)

print("\n=== Training Summary ===")
for field, field_value in summary.items():
    print(f"{field}: {field_value}")

banner = "=" * 80
print("\n" + banner)
print("RECOMMENDED CONFIG UPDATE:")
print(banner)
print("\nUpdate ml/config.py MODEL_PARAMS with:")

# Render a ready-to-paste MODEL_PARAMS literal from the tuned values
tuned_keys = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
config_lines = ["", "MODEL_PARAMS = {"]
config_lines += [f"    '{name}': {best_params[name]}," for name in tuned_keys]
config_lines += ["    'random_state': 42", "}", ""]
print("\n".join(config_lines))