In [1]:
import pandas as pd
from scipy import stats

In [2]:
# Column whose distribution is summarised below.
target_col = 'alanine_aminotransferase'

# Load the dataset from a path relative to the notebook's working directory.
data = pd.read_csv("../treated_data.csv")

# Summary statistics (count, mean, std, quartiles, min/max) of the target column.
target_summary = data[target_col].describe()
print(target_summary)

count    10000.000000
mean        26.985214
std          4.900837
min          9.608100
25%         23.630650
50%         26.946000
75%         30.297500
max         43.548600
Name: alanine_aminotransferase, dtype: float64


In [3]:
# Column that every feature is tested against.
target_col = 'alanine_aminotransferase'

# Columns that must never be treated as candidate features (the target itself
# plus one additional column that is excluded from the analysis).
exclude_from_analysis = ['chronic_obstructive_pulmonary_disease', target_col]

# Candidate features grouped by measurement type; the group name decides which
# statistical test is applied to each feature.
feature_map = dict(
    Continuous=[
        'age',
        'bmi',
        'medication_count',
        'days_hospitalized',
        'last_lab_glucose',
        'albumin_globulin_ratio',
    ],
    Binary=['sex', 'smoker', 'readmitted', 'urban'],
    Ordinal=['exercise_frequency', 'diet_quality', 'income_bracket', 'education_level'],
    Nominal=['diagnosis_code'],
)

In [4]:
# Univariate screening of every candidate feature against `target_col`:
# Pearson correlation for 'Continuous' features, one-way ANOVA (target values
# grouped by feature level) for every other category. One record per tested
# feature is collected in `alt_results` and turned into `alt_analysis_df`.
ALPHA = 0.05  # p-value threshold behind the 'Significant' Y/N flag

alt_results = []

for category, features in feature_map.items():
    # Keep only features that exist in the data and are not explicitly excluded.
    valid_features = [f for f in features if f in data.columns and f not in exclude_from_analysis]

    if not valid_features:
        continue

    print(f"\n--- {category} Features vs {target_col} ---")

    for feature in valid_features:
        # Pairwise deletion: use only rows where both feature and target are present.
        clean_df = data[[feature, target_col]].dropna()

        if category == 'Continuous':
            # An empty or constant column has no defined correlation: pearsonr
            # would raise or return NaN, and NaN would silently be flagged 'N'.
            if clean_df[feature].nunique() < 2 or clean_df[target_col].nunique() < 2:
                print(f"  {feature:25s} | skipped: needs at least 2 distinct values in feature and target")
                continue
            stat_val, p_value = stats.pearsonr(clean_df[feature], clean_df[target_col])
            test_type = 'Pearson r'
        else:
            groups = [group[target_col] for name, group in clean_df.groupby(feature)]
            # f_oneway needs at least two groups to compare.
            if len(groups) < 2:
                print(f"  {feature:25s} | skipped: fewer than 2 groups after dropping missing values")
                continue
            stat_val, p_value = stats.f_oneway(*groups)
            test_type = 'ANOVA F'

        significant = p_value < ALPHA
        alt_results.append({
            'Feature': feature,
            'Type': category,
            'Test': test_type,
            'Stat_Value': stat_val,
            'p-value': p_value,
            'Significant': 'Y' if significant else 'N'
        })

        print(f"  {feature:25s} | {test_type}: {stat_val:8.4f} | p: {p_value:.4f} | Sig: {alt_results[-1]['Significant']}")

# Tabular view of all collected test results (one row per tested feature).
alt_analysis_df = pd.DataFrame(alt_results)


--- Continuous Features vs alanine_aminotransferase ---
  age                       | Pearson r:  -0.0179 | p: 0.0736 | Sig: N
  bmi                       | Pearson r:   0.9998 | p: 0.0000 | Sig: Y
  medication_count          | Pearson r:   0.0030 | p: 0.7613 | Sig: N
  days_hospitalized         | Pearson r:   0.0064 | p: 0.5209 | Sig: N
  last_lab_glucose          | Pearson r:  -0.0059 | p: 0.5560 | Sig: N
  albumin_globulin_ratio    | Pearson r:   0.0046 | p: 0.6462 | Sig: N

--- Binary Features vs alanine_aminotransferase ---
  sex                       | ANOVA F:   0.0058 | p: 0.9392 | Sig: N
  smoker                    | ANOVA F:   0.0692 | p: 0.7925 | Sig: N
  readmitted                | ANOVA F:   6.6827 | p: 0.0097 | Sig: Y
  urban                     | ANOVA F:   0.0048 | p: 0.9450 | Sig: N

--- Ordinal Features vs alanine_aminotransferase ---
  exercise_frequency        | ANOVA F:   0.2767 | p: 0.7583 | Sig: N
  diet_quality              | ANOVA F:   0.1044 | p: 0.9009 | Sig

In [5]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import pickle

In [6]:
# Predictors are every column except the target and the excluded column.
dropped_columns = ['alanine_aminotransferase', 'chronic_obstructive_pulmonary_disease']
y = data['alanine_aminotransferase']
X = data.drop(columns=dropped_columns)

# Hold out 20% of the rows for the final test evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient-boosted regressor with squared-error objective.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Metrics collected per fold; the error metrics use the "neg_*" scorers, so
# their stored values are negative.
cv_scoring = {
    'r2': 'r2',
    'rmse': 'neg_root_mean_squared_error',
    'mae': 'neg_mean_absolute_error',
}

# NOTE(review): cross-validation runs on the full X, y (including the rows held
# out in X_test), not on the training split only — confirm this is intended.
cv_results = cross_validate(
    model,
    X,
    y,
    cv=5,
    scoring=cv_scoring,
    return_train_score=False,
    n_jobs=-1,
)

In [7]:
# Report mean ± standard deviation across folds for each CV metric.
# Each entry: (padded label, key in cv_results, sign applied to the mean).
# RMSE and MAE means are negated before printing so they show as positive
# errors; the standard deviation is unaffected by the sign.
cv_report = (
    ("R²:  ", 'test_r2', 1),
    ("RMSE:", 'test_rmse', -1),
    ("MAE: ", 'test_mae', -1),
)

print("\nCross-Validation Results (5-Fold):")
for label, key, sign in cv_report:
    fold_scores = cv_results[key]
    print(f"  {label} {sign * fold_scores.mean():.4f} ± {fold_scores.std():.4f}")


Cross-Validation Results (5-Fold):
  R²:   0.9990 ± 0.0001
  RMSE: 0.1513 ± 0.0112
  MAE:  0.0956 ± 0.0021


In [8]:
# Evaluate the fitted model on the held-out test split.
y_pred = model.predict(X_test)

# Same three metrics as reported for cross-validation: R², RMSE and MAE.
test_r2, test_rmse, test_mae = (
    metric(y_test, y_pred)
    for metric in (r2_score, root_mean_squared_error, mean_absolute_error)
)

report_lines = [
    "\nTest Set Results:",
    f"  R²:   {test_r2:.4f}",
    f"  RMSE: {test_rmse:.4f}",
    f"  MAE:  {test_mae:.4f}",
]
print("\n".join(report_lines))


Test Set Results:
  R²:   0.9992
  RMSE: 0.1402
  MAE:  0.0955


In [9]:
# Pair each predictor column with the importance reported by the fitted model,
# then rank from most to least important.
importance_table = pd.DataFrame(dict(Feature=X.columns, Importance=model.feature_importances_))
importance_df = importance_table.sort_values(by='Importance', ascending=False)

# Print as plain text without the row index.
print("\nFeature Importance:", importance_df.to_string(index=False), sep="\n")


Feature Importance:
               Feature  Importance
                   bmi    0.994655
        diagnosis_code    0.001030
          diet_quality    0.000794
        income_bracket    0.000576
      medication_count    0.000445
                 urban    0.000409
      last_lab_glucose    0.000373
            readmitted    0.000347
                   age    0.000319
albumin_globulin_ratio    0.000314
       education_level    0.000270
     days_hospitalized    0.000215
    exercise_frequency    0.000105
                   sex    0.000079
                smoker    0.000068


In [10]:
# Persist the fitted model as JSON, two directories above the working directory.
model_output_path = '../../model_alt.json'
model.save_model(model_output_path)

Our results demonstrate that BMI has an almost perfect correlation with "alanine_aminotransferase" (Pearson r ≈ 0.9998, feature importance ≈ 0.995), which is a strong indication of synthetic data: the target could be described by a function like "alanine_aminotransferase" = "BMI" + noise.