In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load and preprocess the data.
# The CSV is semicolon-delimited and indexed by its 'id' column. 'age' is divided
# by 365 and rounded (presumably days -> whole years; confirm against the data
# dictionary), then the terse source column names are replaced by readable ones.
COLUMN_RENAMES = {
    'ap_hi': 'systolic',
    'ap_lo': 'diastolic',
    'cholesterol': 'cholesterol_level',
    'gluc': 'glucose_level',
    'smoke': 'smoking',
    'alco': 'alcohol_intake',
    'active': 'physical_activity',
    'cardio': 'cardio_disease',
}
df = (
    pd.read_csv('../Lab/data/cardio_train.csv', sep=';', index_col='id')
    .assign(age=lambda frame: (frame['age'] / 365).round())
    .rename(columns=COLUMN_RENAMES)
)

# Exploratory Data Analysis (EDA)
# Class balance: the target column is summed to obtain the positive count, so it
# is assumed to be a 0/1 indicator (confirm against the dataset description).
positive_cases = df['cardio_disease'].sum()
negative_cases = len(df) - positive_cases
print(f"Positive cases: {positive_cases}")
print(f"Negative cases: {negative_cases}")

# Share of rows at each cholesterol level, expressed in percent.
cholesterol_counts = df['cholesterol_level'].value_counts(normalize=True).mul(100)
normal_pct, above_pct, well_above_pct = (cholesterol_counts[level] for level in (1, 2, 3))
print(
    f"Cholesterol Levels - Normal: {normal_pct:.2f}%, Above Normal: {above_pct:.2f}%, "
    f"Well Above Normal: {well_above_pct:.2f}%"
)

# The mean of the 'smoking' column scaled to percent (assumes a 0/1 flag).
smokers_ratio = 100 * df['smoking'].mean()
print(f"Percentage of Smokers: {smokers_ratio:.2f}%")

# Visualize age distribution: a 30-bin step histogram of the 'age' column,
# drawn on an explicitly created 16x6 figure.
age_fig, age_ax = plt.subplots(figsize=(16, 6))
sns.histplot(data=df, x='age', bins=30, element='step', color='blue', ax=age_ax)
age_ax.set(title='Age Distribution', xlabel='Age', ylabel='Count')
plt.show()

# Preprocessing: Handling Outliers
def _median_outside(values, lower, upper):
    """Return an array equal to `values`, except that entries strictly below
    `lower` or strictly above `upper` are replaced by the median of `values`.

    The median is taken over the series as passed in, i.e. before any
    replacement, so the out-of-range entries themselves contribute to it.
    """
    out_of_range = (values < lower) | (values > upper)
    return np.where(out_of_range, values.median(), values)


# NOTE(review): the 60 lower bound on weight may be replacing plausible adult
# values rather than true outliers -- confirm the intended thresholds.
df['weight'] = _median_outside(df['weight'], 60, 140)
df['height'] = _median_outside(df['height'], 150, 200)

# Feature Engineering: BMI Calculation
# weight / (height / 100)^2, rounded to one decimal. The division by 100
# presumably converts centimetres to metres (with weight in kg) -- confirm.
height_m = df['height'] / 100
df['bmi'] = (df['weight'] / height_m ** 2).round(1)

# Correlation Matrix: annotated heatmap of the pairwise correlations between
# all columns of df, with coefficients shown to one decimal place.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, fmt=".1f", cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
plt.show()

# Model Design and Evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Prepare data: the target is 'cardio_disease'; every other column is a feature.
y = df['cardio_disease']
X = df.drop(columns='cardio_disease')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Model pipeline: standardize the features, then fit a random forest.
# The forest is seeded (same seed as the train/test split) so that the grid
# search, the selected hyper-parameters, the reported metrics and the saved
# model are reproducible across Restart & Run All; without a seed every run
# trains a different forest.
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

# Hyper-parameters searched; the 'randomforestclassifier__' prefix is the step
# name that make_pipeline derives from the estimator's class name.
param_grid = {'randomforestclassifier__n_estimators': [100, 200],
              'randomforestclassifier__max_depth': [10, 20, None]}

# 5-fold cross-validated search over the grid, fitted on the training split only.
grid = GridSearchCV(pipeline, param_grid, cv=5)
grid.fit(X_train, y_train)

# Results: evaluate the best pipeline found by the grid search on the held-out
# test split. Both metrics are computed first, then printed.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)

# Save the model
# best_model is the grid search's best estimator, i.e. the whole fitted pipeline
# (scaler + random forest), so the saved file can be applied to unscaled features.
# The file is written to 'cardio_model.pkl' relative to the current working directory.
# NOTE: joblib files are pickle-based; only load this artifact from trusted sources.
import joblib
joblib.dump(best_model, 'cardio_model.pkl')


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


FileNotFoundError: [Errno 2] No such file or directory: '../Lab/data/cardio_train.csv'