In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, QuantileTransformer

In [3]:
# Load the raw dataset. The path is relative, so it resolves against the
# notebook's current working directory.
DATASET_CSV = 'cardio_dataset-original.csv'
data = pd.read_csv(DATASET_CSV)

In [5]:
# Keep only the rows whose AGEIR lies strictly within `factor` standard
# deviations of the column mean. `factor` is reused by the TC / HDL / RISK
# filters in the following cells.
factor = 4
ageir = data['AGEIR']
ageir_mean, ageir_std = ageir.mean(), ageir.std()
upper_lim = ageir_mean + ageir_std * factor
lower_lim = ageir_mean - ageir_std * factor
# inclusive='neither' reproduces the strict `<` / `>` comparison.
data_new = data[ageir.between(lower_lim, upper_lim, inclusive='neither')]

In [7]:
# Same mean ± factor·std rule for TC, with the statistics taken from the rows
# currently held in `data_new`.
tc = data_new['TC']
tc_mean, tc_std = tc.mean(), tc.std()
upper_lim = tc_mean + tc_std * factor
lower_lim = tc_mean - tc_std * factor
# inclusive='neither' reproduces the strict `<` / `>` comparison.
data_new = data_new[tc.between(lower_lim, upper_lim, inclusive='neither')]

In [9]:
# Same mean ± factor·std rule for HDL, with the statistics taken from the rows
# currently held in `data_new`.
hdl = data_new['HDL']
hdl_mean, hdl_std = hdl.mean(), hdl.std()
upper_lim = hdl_mean + hdl_std * factor
lower_lim = hdl_mean - hdl_std * factor
# inclusive='neither' reproduces the strict `<` / `>` comparison.
data_new = data_new[hdl.between(lower_lim, upper_lim, inclusive='neither')]

In [11]:
# Same mean ± factor·std rule for RISK, with the statistics taken from the rows
# currently held in `data_new`.
risk = data_new['RISK']
risk_mean, risk_std = risk.mean(), risk.std()
upper_lim = risk_mean + risk_std * factor
lower_lim = risk_mean - risk_std * factor
# inclusive='neither' reproduces the strict `<` / `>` comparison.
data_new = data_new[risk.between(lower_lim, upper_lim, inclusive='neither')]


## Convert to category and encode


In [14]:
# Cast the four categorical columns to pandas' `category` dtype in a single
# `astype` call. `astype` returns a new DataFrame, so nothing is written into
# the boolean-filtered frame produced by the outlier cells; the per-column
# `data_new[col] = ...` form raises SettingWithCopyWarning on pandas versions
# without copy-on-write.
categorical_cols = ["SEX", "SMOKE_", "BPMED", "DIAB_noyes"]
data_new = data_new.astype({col: 'category' for col in categorical_cols})

In [16]:
# Replace each categorical column with its integer category codes.
# `assign` builds a new DataFrame (column positions are preserved) instead of
# writing into the filtered frame column by column, which avoids
# SettingWithCopyWarning on pandas versions without copy-on-write.
# The columns must already have `category` dtype for the `.cat` accessor to work.
category_codes = {
    col: data_new[col].cat.codes
    for col in ("SEX", "SMOKE_", "BPMED", "DIAB_noyes")
}
data_new = data_new.assign(**category_codes)

## Prepare data

In [19]:
# Convert the cleaned frame to a NumPy matrix: the first seven columns become
# the model inputs and the eighth column becomes the regression target.
# NOTE(review): the split is positional, so it relies on the CSV column order —
# confirm that column index 7 is RISK.
# NOTE(review): `data` is rebound here from the raw DataFrame to the feature
# matrix, so the first outlier cell cannot be re-run after this one.
dataset = data_new.to_numpy()
data, target = dataset[:, :7], dataset[:, 7]

In [21]:
# Map every feature column onto a standard-normal distribution via its
# empirical quantiles.
# NOTE(review): the transformer is fitted on all rows, before the train/test
# split further down, so the held-out rows influence the learned quantiles.
# Consider fitting on the training rows only.
model_qntl_data = QuantileTransformer(output_distribution='normal', random_state=0)
data_scaled = model_qntl_data.fit_transform(data)

In [23]:
# The target gets its own quantile transformer, configured like the feature one.
model_qntl_target = QuantileTransformer(output_distribution='normal', random_state=0)
target_column = target.reshape(-1, 1)  # the transformer expects a 2-D (n_samples, 1) array
target_scaled = model_qntl_target.fit_transform(target_column).ravel()  # back to a 1-D vector

## Polynomial Features

In [26]:
# Expand the scaled inputs with every polynomial and interaction term up to
# degree 3, without the constant bias column. For the 7 input columns this
# yields 119 output columns.
model_poly = PolynomialFeatures(degree=3, include_bias=False)
data_high = model_poly.fit_transform(data_scaled)

In [28]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the partition
# reproducible between runs.
split_parts = train_test_split(data_high, target_scaled, test_size=0.1, random_state=42)
train_data, test_data, train_target, test_target = split_parts

## Random Forest Regressor


In [31]:
# Hyper-parameters are collected in one mapping so they can be inspected or
# tweaked independently of the estimator construction.
rf_params = {
    "n_estimators": 100,       # number of trees in the forest
    "max_depth": None,         # no explicit cap on tree depth
    "min_samples_split": 2,    # minimum samples required to split a node
    "min_samples_leaf": 1,     # minimum samples required in a leaf
    "random_state": 42,        # fixed seed for reproducibility
    "n_jobs": -1,              # use all available cores
}
rf_model = RandomForestRegressor(**rf_params)

In [33]:
# Fit on the training partition. The fitted estimator is the cell's last
# expression, so the notebook still renders its parameter summary.
print("Training Random Forest model...")
rf_model.fit(X=train_data, y=train_target)

Training Random Forest model...


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [35]:
# Predictions are on the quantile-transformed target scale (the model was
# trained on `target_scaled`), not in the original RISK units.
predicted_target = rf_model.predict(X=test_data)

In [37]:
# Score the held-out predictions with each metric in turn; every metric takes
# (y_true, y_pred) in that order. Values are on the transformed target scale.
r2, mse, mae = (
    metric(test_target, predicted_target)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

In [39]:
# Assemble the report first, then emit it in one write; the text is identical
# to printing the four lines one after another.
report_lines = [
    "\nRandom Forest Results:",
    f"R² Score: {r2:.4f}",
    f"MSE: {mse:.4f}",
    f"MAE: {mae:.4f}",
]
print("\n".join(report_lines))


Random Forest Results:
R² Score: 0.8789
MSE: 0.1135
MAE: 0.2334


In [41]:
# Persist the fitted model together with the three preprocessing objects.
# Order matches the original sequence of dumps.
artifacts = [
    (rf_model, 'heart_risk_random_forest.sav'),
    (model_poly, 'model_poly.sav'),
    (model_qntl_data, 'model_qntl_data.sav'),
    (model_qntl_target, 'model_qntl_target.sav'),
]
saved_paths = [joblib.dump(obj, filename) for obj, filename in artifacts]
# Keep the last dump's return value as the cell's displayed result.
saved_paths[-1]

['model_qntl_target.sav']

In [43]:
# Confirmation message; it is printed unconditionally whenever this cell runs
# and does not itself check that the artefact files exist.
print("\nModels saved successfully!")


Models saved successfully!


In [45]:
# Rank the expanded polynomial features by the forest's impurity-based importance.
feature_importance = rf_model.feature_importances_
# A bare index into the polynomial expansion is not interpretable, so each
# index is labelled with the term it stands for (e.g. a product of input
# columns). The seven input names are the same leading columns of `data_new`
# that were sliced off as model inputs.
feature_names = model_poly.get_feature_names_out(data_new.columns[:7])
print("\nTop 10 most important features:")
# argsort is ascending: take the last ten and reverse for a descending ranking.
top_indices = np.argsort(feature_importance)[-10:][::-1]
for idx in top_indices:
    print(f"Feature {idx} ({feature_names[idx]}): {feature_importance[idx]:.4f}")


Top 10 most important features:
Feature 74: 0.1835
Feature 42: 0.0940
Feature 78: 0.0594
Feature 36: 0.0593
Feature 81: 0.0575
Feature 63: 0.0566
Feature 1: 0.0382
Feature 68: 0.0356
Feature 65: 0.0239
Feature 19: 0.0234
