In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import uuid
print("Numerical columns dtypes:")
print(data[numerical_cols].dtypes)


# Load the dataset
data = pd.read_excel('D:\Desktop\Iteration_1\dataset\england_dataset.xlsx')

# Define features and target
target = 'Life expectancy [Pe3]'
# 修改列定义部分
categorical_cols = ['Area Name', 'Area Type [Note 3]', 'Area Code']  # 添加实际存在的分类列
numerical_cols = [col for col in data.columns 
                 if col not in categorical_cols + [target]
                 and np.issubdtype(data[col].dtype, np.number)]  # 确保只选择数值型列


# Preprocessing pipeline
numerical_transformer = SimpleImputer(strategy='median')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define models
models = {
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42, verbose=-1)
}

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Store results
results = {name: {'train_rmse': [], 'test_rmse': [], 'train_r2': [], 'test_r2': []} for name in models}

# Perform 5-fold cross-validation
X = data.drop(columns=[target])
y = data[target]

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    for model_name, model in models.items():
        # Create pipeline
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        
        # Fit model
        pipeline.fit(X_train, y_train)
        
        # Predictions
        y_train_pred = pipeline.predict(X_train)
        y_test_pred = pipeline.predict(X_test)
        
        # Calculate metrics
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        # Store results
        results[model_name]['train_rmse'].append(train_rmse)
        results[model_name]['test_rmse'].append(test_rmse)
        results[model_name]['train_r2'].append(train_r2)
        results[model_name]['test_r2'].append(test_r2)

# Aggregate results
summary = {}
for model_name in models:
    summary[model_name] = {
        'Mean Train RMSE': np.mean(results[model_name]['train_rmse']),
        'Mean Test RMSE': np.mean(results[model_name]['test_rmse']),
        'Mean Train R²': np.mean(results[model_name]['train_r2']),
        'Mean Test R²': np.mean(results[model_name]['test_r2'])
    }

# Print summary
print("Model Performance Summary:")
for model_name, metrics in summary.items():
    print(f"\n{model_name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

# Plotting
fig, axes = plt.subplots(2, 2, figsize=(15, 10), sharex=True)
fig.suptitle('Model Performance Comparison', fontsize=16)

# RMSE Plot
for i, model_name in enumerate(models):
    ax = axes[0, i//2]
    folds = range(1, 6)
    ax.bar([x - 0.2 for x in folds], results[model_name]['train_rmse'], width=0.4, label='Train RMSE')
    ax.bar([x + 0.2 for x in folds], results[model_name]['test_rmse'], width=0.4, label='Test RMSE')
    ax.set_title(f'{model_name} RMSE')
    ax.set_xlabel('Fold')
    ax.set_ylabel('RMSE')
    ax.legend()

# R² Plot
for i, model_name in enumerate(models):
    ax = axes[1, i//2]
    folds = range(1, 6)
    ax.bar([x - 0.2 for x in folds], results[model_name]['train_r2'], width=0.4, label='Train R²')
    ax.bar([x + 0.2 for x in folds], results[model_name]['test_r2'], width=0.4, label='Test R²')
    ax.set_title(f'{model_name} R²')
    ax.set_xlabel('Fold')
    ax.set_ylabel('R²')
    ax.legend()

plt.tight_layout()
plt.savefig('model_comparison.png')
plt.close()

# Summary Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
model_names = list(models.keys())
mean_test_rmse = [summary[name]['Mean Test RMSE'] for name in model_names]
mean_test_r2 = [summary[name]['Mean Test R²'] for name in model_names]

ax1.bar(model_names, mean_test_rmse, color='skyblue')
ax1.set_title('Mean Test RMSE Comparison')
ax1.set_ylabel('Mean Test RMSE')
ax1.tick_params(axis='x', rotation=45)

ax2.bar(model_names, mean_test_r2, color='lightgreen')
ax2.set_title('Mean Test R² Comparison')
ax2.set_ylabel('Mean Test R²')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('summary_comparison.png')
plt.close()

Numerical columns dtypes:
Area Code                           object
Healthy People Domain              float64
Difficulties in daily life [Pe]    float64
Disability [Pe1]                   float64
Frailty [Pe1]                      float64
                                    ...   
Household overcrowding [Pl5]       float64
Noise complaints [Pl5]             float64
Road safety [Pl5]                  float64
Rough sleeping [Pl5]               float64
index health                       float64
Length: 74, dtype: object
Model Performance Summary:

Gradient Boosting:
Mean Train RMSE: 0.8412
Mean Test RMSE: 1.1590
Mean Train R²: 0.9931
Mean Test R²: 0.9867

Random Forest:
Mean Train RMSE: 0.4581
Mean Test RMSE: 1.2161
Mean Train R²: 0.9979
Mean Test R²: 0.9854

XGBoost:
Mean Train RMSE: 0.0931
Mean Test RMSE: 1.1375
Mean Train R²: 0.9999
Mean Test R²: 0.9872

LightGBM:
Mean Train RMSE: 0.3471
Mean Test RMSE: 1.0688
Mean Train R²: 0.9988
Mean Test R²: 0.9888


In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import re

# Load dataset
data = pd.read_excel('D:\\Desktop\\Iteration_1\\dataset\\england_dataset.xlsx')

# Data preprocessing
# Drop non-numerical or identifier columns
X = data.drop(columns=['Area Code', 'Area Name', 'Area Type [Note 3]', 'index health'])
y = data['index health']

# Clean feature column names: keep only letters, digits, and underscores
X.columns = [re.sub(r'[^a-zA-Z0-9]', '_', col) for col in X.columns]
# Ensure column names start with a letter (required by LightGBM)
X.columns = ['f_' + col if col[0].isdigit() else col for col in X.columns]

# Print cleaned column names for debugging
print("Cleaned feature column names:")
print(X.columns.tolist())

# Handle missing values (fill with 0 for simplicity)
X = X.fillna(0)

# Validate sample count
print(f"\nTotal number of samples in dataset: {len(data)}")
if len(data) != 2387:
    raise ValueError("The dataset should contain 2387 samples. Please check your file!")

# Define manual 5-fold split (477, 477, 477, 477, 479)
fold_sizes = [477, 477, 477, 477, 479]
fold_indices = []
start_idx = 0
for fold_size in fold_sizes:
    fold_indices.append(range(start_idx, start_idx + fold_size))
    start_idx += fold_size

# Define models
models = {
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42, verbose=-1)
}

# Initialize results storage
results = {model_name: {'train_rmse': [], 'train_r2': [], 'test_rmse': [], 'test_r2': []} for model_name in models}

# Perform manual 5-fold cross-validation
for fold_idx, test_indices in enumerate(fold_indices, 1):
    print(f"\nFold {fold_idx}")
    
    test_idx = list(test_indices)
    train_idx = [i for i in range(len(data)) if i not in test_idx]
    
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    print(f"  Training samples: {len(X_train)}, Test samples: {len(X_test)}")
    
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        
        y_train_pred = model.predict(X_train)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        train_r2 = r2_score(y_train, y_train_pred)
        
        y_test_pred = model.predict(X_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        test_r2 = r2_score(y_test, y_test_pred)
        
        results[model_name]['train_rmse'].append(train_rmse)
        results[model_name]['train_r2'].append(train_r2)
        results[model_name]['test_rmse'].append(test_rmse)
        results[model_name]['test_r2'].append(test_r2)
        
        print(f"{model_name} - Fold {fold_idx}:")
        print(f"  Train RMSE: {train_rmse:.4f}, Train R²: {train_r2:.4f}")
        print(f"  Test RMSE: {test_rmse:.4f}, Test R²: {test_r2:.4f}")

# Compute average results
avg_results = {model_name: {} for model_name in models}
for model_name in models:
    avg_results[model_name]['avg_train_rmse'] = np.mean(results[model_name]['train_rmse'])
    avg_results[model_name]['avg_train_r2'] = np.mean(results[model_name]['train_r2'])
    avg_results[model_name]['avg_test_rmse'] = np.mean(results[model_name]['test_rmse'])
    avg_results[model_name]['avg_test_r2'] = np.mean(results[model_name]['test_r2'])

# Print average results
print("\nAverage Results for Each Model:")
for model_name in models:
    print(f"\n{model_name}:")
    print(f"  Avg Train RMSE: {avg_results[model_name]['avg_train_rmse']:.4f}")
    print(f"  Avg Train R²: {avg_results[model_name]['avg_train_r2']:.4f}")
    print(f"  Avg Test RMSE: {avg_results[model_name]['avg_test_rmse']:.4f}")
    print(f"  Avg Test R²: {avg_results[model_name]['avg_test_r2']:.4f}")

# Plot RMSE comparison
plt.figure(figsize=(12, 6))
x = np.arange(len(models))
width = 0.2

train_rmse_means = [avg_results[model]['avg_train_rmse'] for model in models]
plt.bar(x - width, train_rmse_means, width, label='Avg Train RMSE', color='skyblue')

test_rmse_means = [avg_results[model]['avg_test_rmse'] for model in models]
plt.bar(x, test_rmse_means, width, label='Avg Test RMSE', color='salmon')

plt.xlabel('Models')
plt.ylabel('RMSE')
plt.title('Average RMSE Comparison Across Models')
plt.xticks(x, models.keys())
plt.legend()
plt.tight_layout()
plt.savefig('rmse_comparison.png')
plt.close()

# Plot R² comparison
plt.figure(figsize=(12, 6))

train_r2_means = [avg_results[model]['avg_train_r2'] for model in models]
plt.bar(x - width, train_r2_means, width, label='Avg Train R²', color='lightgreen')

test_r2_means = [avg_results[model]['avg_test_r2'] for model in models]
plt.bar(x, test_r2_means, width, label='Avg Test R²', color='orange')

plt.xlabel('Models')
plt.ylabel('R²')
plt.title('Average R² Comparison Across Models')
plt.xticks(x, models.keys())
plt.legend()
plt.tight_layout()
plt.savefig('r2_comparison.png')
plt.close()

# Plot RMSE per fold for each model
for model_name in models:
    plt.figure(figsize=(10, 5))
    folds = range(1, 6)
    plt.plot(folds, results[model_name]['train_rmse'], marker='o', label='Train RMSE', color='blue')
    plt.plot(folds, results[model_name]['test_rmse'], marker='o', label='Test RMSE', color='red')
    plt.xlabel('Fold')
    plt.ylabel('RMSE')
    plt.title(f'{model_name} RMSE per Fold')
    plt.legend()
    plt.grid(True)
    plt.savefig(f'{model_name.lower().replace(" ", "_")}_rmse_folds.png')
    plt.close()

# Plot R² per fold for each model
for model_name in models:
    plt.figure(figsize=(10, 5))
    folds = range(1, 6)
    plt.plot(folds, results[model_name]['train_r2'], marker='o', label='Train R²', color='green')
    plt.plot(folds, results[model_name]['test_r2'], marker='o', label='Test R²', color='orange')
    plt.xlabel('Fold')
    plt.ylabel('R²')
    plt.title(f'{model_name} R² per Fold')
    plt.legend()
    plt.grid(True)
    plt.savefig(f'{model_name.lower().replace(" ", "_")}_r2_folds.png')
    plt.close()

print("\nGenerated chart files:")
print("- rmse_comparison.png")
print("- r2_comparison.png")
for model_name in models:
    print(f"- {model_name.lower().replace(' ', '_')}_rmse_folds.png")
    print(f"- {model_name.lower().replace(' ', '_')}_r2_folds.png")


Cleaned feature column names:
['Healthy_People_Domain', 'Difficulties_in_daily_life__Pe_', 'Disability__Pe1_', 'Frailty__Pe1_', 'Mental_health__Pe_', 'Children_s_social__emotional_and_mental_health__Pe2_', 'Mental_health_conditions__Pe2_', 'Self_harm__Pe2_', 'Suicides__Pe2_', 'Mortality__Pe_', 'Avoidable_mortality__Pe3_', 'Infant_mortality__Pe3_', 'Life_expectancy__Pe3_', 'Mortality_from_all_causes__Pe3_', 'Personal_well_being__Pe_', 'Activities_in_life_are_worthwhile__Pe4_', 'Feelings_of_anxiety__Pe4_', 'Happiness__Pe4_', 'Life_satisfaction__Pe4_', 'Physical_health_conditions__Pe_', 'Cancer__Pe5_', 'Cardiovascular_conditions__Pe5_', 'Dementia__Pe5_', 'Diabetes__Pe5_', 'Kidney_and_liver_disease__Pe5_', 'Musculoskeletal_conditions__Pe5_', 'Respiratory_conditions__Pe5_', 'Healthy_Lives_Domain', 'Behavioural_risk_factors__L_', 'Alcohol_misuse__L1_', 'Drug_misuse__L1_', 'Healthy_eating__L1_', 'Physical_activity__L1_', 'Sedentary_behaviour__L1_', 'Sexually_transmitted_infections__L1_', 'Smo