# Machine Learning Analysis

## Cancer Incidence Data Analysis (R)

This notebook covers ML models for predicting incidence rates.

In [None]:
# Load libraries
library(caret)
library(randomForest)
library(xgboost)
library(glmnet)
library(Metrics)

source('../../scripts/r/data_loader.R')
source('../../scripts/r/ml_models.R')

In [None]:
# Prepare data
# Each step feeds the next: load_data() -> clean_data() -> prepare_ml_data().
# These helpers are defined outside this notebook (presumably in the scripts
# sourced above -- confirm there for their exact behavior).
df <- load_data('../../data/incd.csv')
df_clean <- clean_data(df)
ml_data <- prepare_ml_data(df_clean)

# Print the row counts of the training and test feature sets.
cat(
  'Training:', nrow(ml_data$X_train),
  'Test:', nrow(ml_data$X_test),
  '\n'
)

In [None]:
# Train and evaluate models
# Hands the four train/test pieces of ml_data to train_and_evaluate_models()
# (defined outside this notebook). Arguments are passed positionally, in the
# order X_train, X_test, y_train, y_test. The returned list is read later in
# this notebook through its `results` and `models` elements.
results_list <- train_and_evaluate_models(
  ml_data$X_train,
  ml_data$X_test,
  ml_data$y_train,
  ml_data$y_test
)

In [None]:
# Compare models and analyze importance
plot_model_comparison(results_list$results)

# Select the row with the highest Test_R2. which.max() discards NA/NaN values
# and returns integer(0) when nothing is left to pick from, so fail early with
# a clear message instead of an opaque subscript error further down.
best_idx <- which.max(results_list$results$Test_R2)
if (length(best_idx) == 0) {
  stop('No model has a non-missing Test_R2; cannot select a best model.',
       call. = FALSE)
}

# as.character() guards against Model being a factor: indexing a list with a
# factor uses the factor's integer codes, not its labels, and cat() would
# likewise print the code rather than the model name.
best_model_name <- as.character(results_list$results$Model[best_idx])
best_model <- results_list$models[[best_model_name]]
cat('Best Model:', best_model_name, '\n')

# Pass the selected model and the feature column names to the importance
# helper (defined outside this notebook).
feature_importance_analysis(best_model, ml_data$feature_cols)