In [1]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations and array handling
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For feature scaling and encoding categorical variables
from xgboost import XGBClassifier  # Import XGBoost classifier for model training
from sklearn.metrics import accuracy_score  # For evaluating model performance

In [2]:
# Step 1: Load the dataset
df = pd.read_csv('cardio_train.csv', sep=';')  # Read the dataset with a semicolon separator

In [3]:
# Step 2: Data Inspection
print(df.head())  # Display the first few rows of the dataset
print(df.isnull().sum())  # Check for missing values
print(df.describe())  # Get summary statistics of numerical features
print(df.dtypes)  # Check data types of all columns

   id    age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0   0  18393       2     168    62.0    110     80            1     1      0   
1   1  20228       1     156    85.0    140     90            3     1      0   
2   2  18857       1     165    64.0    130     70            3     1      0   
3   3  17623       2     169    82.0    150    100            1     1      0   
4   4  17474       1     156    56.0    100     60            1     1      0   

   alco  active  cardio  
0     0       1       0  
1     0       1       1  
2     0       0       1  
3     0       1       1  
4     0       0       0  
id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64
                 id           age        gender        height        weight  \
count  70000.000000  70000.000000  70000.000000  7

In [4]:
# Step 3: Data Cleaning
df.dropna(inplace=True)  # Remove rows with missing values
df.drop_duplicates(inplace=True)  # Remove duplicate rows to avoid redundant data

In [5]:
# Step 4: Feature Engineering
df['bmi'] = df['weight'] / (df['height'] / 100) ** 2  # Calculate Body Mass Index (BMI)
df['age_years'] = df['age'] / 365  # Convert age from days to years

In [6]:
# Step 5: Encoding Categorical Variables
label_encoder = LabelEncoder()  # Initialize LabelEncoder
df['gender'] = label_encoder.fit_transform(df['gender'])  # Encode gender as numeric values

In [7]:
# Step 6: Normalization/Standardization
scaler = StandardScaler()  # Initialize StandardScaler
numerical_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']  # Define numerical columns to scale
df[numerical_features] = scaler.fit_transform(df[numerical_features])  # Apply standardization

In [8]:
# Step 7: Splitting the Dataset
X = df.drop('cardio', axis=1)  # Define features (independent variables)
y = df['cardio']  # Define target variable (dependent variable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Split into training and testing sets

In [9]:
# Step 8: Train the XGBoost Model
# Initialize the XGBoost classifier
xgb_model = XGBClassifier(
    n_estimators=100,  # Number of boosting rounds
    learning_rate=0.1,  # Learning rate
    max_depth=5,  # Maximum depth of a tree
    random_state=42,  # Random seed for reproducibility
    objective='binary:logistic'  # Binary classification objective
)

In [10]:
# Train the model
xgb_model.fit(X_train, y_train)  # Fit the model to the training data

In [11]:
# Step 9: Evaluate the Model
# Predict on the test set
y_pred = xgb_model.predict(X_test)  # Generate predictions for test data

In [12]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)  # Compute model accuracy
print(f"Accuracy of the XGBoost model: {accuracy * 100:.2f}%")  # Print accuracy percentage

Accuracy of the XGBoost model: 74.08%


In [13]:
# Step 10: Save the Model (Optional)
import joblib  # For saving and loading trained models
joblib.dump(xgb_model, 'xgboost_cardio_model.pkl')  # Save the trained model to a file

['xgboost_cardio_model.pkl']