# Machine Learning Analysis

## Cancer Incidence Data Analysis

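This notebook prepares the cancer incidence data, trains and compares several machine-learning models for predicting incidence rates, and then inspects feature importance for the best-performing model.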

In [None]:
# Import libraries and set notebook-wide configuration
import sys
# Extend the module search path so the local imports below can be resolved.
# The path is relative, so it is resolved against the kernel's working
# directory -- NOTE(review): presumably the notebook's own folder; confirm.
# This must run before the `data_loader` / `ml_models` imports.
sys.path.append('../../scripts/python')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Project-local helpers (presumably located in the directory appended above).
from data_loader import load_data, clean_data
from ml_models import (prepare_ml_data, train_and_evaluate_models, 
                      plot_model_comparison, feature_importance_analysis)

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides warnings emitted by the modelling code
# (e.g. convergence or undefined-metric warnings); consider narrowing the
# filter to specific categories or modules.
warnings.filterwarnings('ignore')

# Plot defaults applied to every figure in the notebook:
# seaborn "whitegrid" style and a (12, 6) default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

In [None]:
# Prepare data: load the incidence CSV, clean it, and build the ML split.
csv_path = '../../data/incd.csv'
df = load_data(csv_path)
df_clean = clean_data(df)

# prepare_ml_data returns six values; `scaler` and `feature_cols` are kept
# alongside the train/test arrays because later cells use `feature_cols`.
(X_train, X_test,
 y_train, y_test,
 scaler, feature_cols) = prepare_ml_data(df_clean)

# Quick sanity check on the size of each partition.
print(f"Training: {X_train.shape}, Test: {X_test.shape}")

In [None]:
# Train models
# Fit and evaluate the candidate models on the train/test split prepared above.
# `results` is later indexed by model name, and each entry is read for its
# 'model' and 'test_r2' keys when the best model is selected.
results = train_and_evaluate_models(X_train, X_test, y_train, y_test)

In [None]:
# Compare models
# Visualise the results, then pick the model with the highest 'test_r2'
# (presumably the test-set R^2 -- confirm in ml_models) and analyse its
# feature importances.
plot_model_comparison(results)

# Exclude entries whose score is missing/NaN before ranking: every comparison
# against NaN is False, so max() would otherwise keep a NaN-scored model as
# "best" whenever it happened to be iterated first. With all-finite scores the
# selection is identical to ranking `results` directly.
scored_results = {
    name: entry
    for name, entry in results.items()
    if pd.notna(entry['test_r2'])
}
if not scored_results:
    # Same exception type max() raises on an empty sequence, clearer message.
    raise ValueError("No model in `results` has a valid 'test_r2' score to compare.")

best_model_name = max(scored_results, key=lambda name: scored_results[name]['test_r2'])
best_model = scored_results[best_model_name]['model']
print(f"Best Model: {best_model_name}")
feature_importance_analysis(best_model, feature_cols)