In [1]:
import pandas as pd
df = pd.read_csv('heart.csv')
print(df.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  


Column /	    Description	Type /                      Values
age-        Age of the patient	                Numeric (years)
sex-        Gender of the patient            	1 = male, 0 = female
cp-         Chest pain                          type	1 = typical angina, 2 = atypical angina, 3 = non-anginal pain, 4 = asymptomatic
trestbps-	Resting blood pressure	            Numeric (mm Hg)
chol-	    Serum cholesterol	                Numeric (mg/dl)
fbs-	    Fasting blood sugar > 120 mg/dl  	1 = true, 0 = false
restecg-	Resting electrocardiographic results	0 = normal, 1 = ST-T wave abnormality, 2 = left ventricular hypertrophy
thalach-	Maximum heart rate achieved	        Numeric
exang-    	Exercise-induced angina         	1 = yes, 0 = no
oldpeak-	ST depression induced by exercise relative to rest	/Numeric (continuous)
slope-   	Slope of the peak exercise ST segment   /1 = upsloping, 2 = flat, 3 = downsloping
ca-	        Number of major vessels colored by fluoroscopy /	0–3 (integer)
thal-	    Thalassemia type	                3 = normal, 6 = fixed defect, 7 = reversible defect
target-   	Presence of heart disease	        1 = disease, 0 = no disease   

In [2]:
import pandas as pd
import numpy as np

nan_counts = df.isnull().sum()
print(nan_counts)


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


Initial Model Development

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define features and target
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict and evaluate
predictions = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))

Accuracy: 0.9853658536585366


 Model Fine-Tuning

In [8]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

Best parameters found: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation accuracy: 0.9853658536585366


Calculating ( R^2 ) using Scikit-learn

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Assuming X is your feature dataframe and y is your target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)

# Alternatively, using the model's score method:
r2_alternative = model.score(X_test, y_test)
print("R^2 Score using model's method:", r2_alternative)

R^2 Score: 0.4096080106078557
R^2 Score using model's method: 0.4096080106078557


In [11]:
%pip install xgboost

import xgboost as xgb
from sklearn.metrics import accuracy_score

# Use existing X_train, X_test, y_train, y_test

# Initialize the model
model = xgb.XGBClassifier()

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
predictions = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.4-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   -- ------------------------------------- 3.9/56.8 MB 26.0 MB/s eta 0:00:03
   -------- ------------------------------- 11.8/56.8 MB 28.3 MB/s eta 0:00:02
   ------------- -------------------------- 18.6/56.8 MB 30.1 MB/s eta 0:00:02
   ------------------ --------------------- 25.7/56.8 MB 30.7 MB/s eta 0:00:02
   ---------------------- ----------------- 32.5/56.8 MB 31.2 MB/s eta 0:00:01
   --------------------------- ------------ 39.6/56.8 MB 31.1 MB/s eta 0:00:01
   -------------------------------- ------- 46.4/56.8 MB 31.4 MB/s eta 0:00:01
   ------------------------------------- -- 53.5/56.8 MB 31.5 MB/s eta 0:00:01
   ---------------------------------------  56.6/56.8 MB 31.4 MB/s eta 0:00:01
   ---------------------------------------  56.6/56.8 MB 31.4 MB/s eta 0:

In [12]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = xgb.XGBClassifier()

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
predictions = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))

Accuracy: 0.9853658536585366


In [13]:
from sklearn.model_selection import cross_val_score

# Evaluate using cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validated accuracy:", cv_scores.mean())

Cross-validated accuracy: 0.9804878048780488


Feature Importance in Tree-based Models

In [14]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

model = RandomForestClassifier()
model.fit(X_train, y_train)

# Extracting feature importances
feature_importances = pd.DataFrame(model.feature_importances_, index=X.columns, columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances)

          importance
ca          0.134592
cp          0.130877
thalach     0.127453
thal        0.117010
oldpeak     0.110621
chol        0.082608
age         0.077983
trestbps    0.070774
exang       0.051797
slope       0.044339
sex         0.026600
restecg     0.016739
fbs         0.008608


Using Logistic Regression with Feature Coefficients

In [15]:
from sklearn.linear_model import LogisticRegression
import numpy as np

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Coefficients of the features
coefficients = pd.DataFrame(model.coef_.T, index=X.columns, columns=['coefficient'])
coefficients['absolute_coefficient'] = np.abs(coefficients['coefficient'])
coefficients.sort_values('absolute_coefficient', ascending=False, inplace=True)

print(coefficients)

          coefficient  absolute_coefficient
sex         -1.717145              1.717145
thal        -1.071030              1.071030
cp           0.846679              0.846679
exang       -0.833437              0.833437
ca          -0.814664              0.814664
oldpeak     -0.679565              0.679565
slope        0.536938              0.536938
restecg      0.260720              0.260720
fbs         -0.160987              0.160987
thalach      0.028928              0.028928
trestbps    -0.018156              0.018156
chol        -0.008878              0.008878
age          0.000678              0.000678


Correlation Analysis

In [17]:
correlation_matrix = X.join(y).corr()
print(correlation_matrix['target'].sort_values(ascending=False))

target      1.000000
cp          0.434854
thalach     0.422895
slope       0.345512
restecg     0.134468
fbs        -0.041164
chol       -0.099966
trestbps   -0.138772
age        -0.229324
sex        -0.279501
thal       -0.337838
ca         -0.382085
exang      -0.438029
oldpeak    -0.438441
Name: target, dtype: float64


Chest pain type, 	Maximum heart rate achieved, Slope of the peak exercise ST segment, Resting electrocardiographic results and ST depression induced by exercise relative to rest are indicators then when high or combined will lead to heart disease or better said heart disese is already present.