In [1]:
import pandas as pd  # Used for data manipulation and analysis, ideal for working with tabular data.
import numpy as np  # Adds support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.
import matplotlib.pyplot as plt # Used for creating static, animated, and interactive visualizations in Python.
import seaborn as sns # Used for making statistical graphics in Python.
from sklearn.model_selection import train_test_split # Used to split the dataset into training and testing data.
from sklearn.model_selection import GridSearchCV # Used to perform hyperparameter tuning of the model.
from sklearn.svm import SVC # Used to implement the Support Vector Classification model.
from sklearn.metrics import accuracy_score # Used to calculate the accuracy of the model.

In [2]:
# Load the resampled heart-attack CSV (the output of the preprocessing step)
# into a DataFrame that the following cells split and model.
dataset = pd.read_csv(filepath_or_buffer='data/2_resampled_heart_attack_data.csv')
dataset  # bare last expression so the notebook renders the frame

Unnamed: 0,Sex,Cholesterol,Diabetes,Family History,Smoking,Alcohol Consumption,Exercise Hours Per Week,Diet,Previous Heart Problems,Medication Use,...,Triglycerides,Activity per Week,Sleeping Hours,Country,Systolic,Diastolic,Age_HeartRate,Obesity_SedentaryHours,Continent_Hemisphere,Heart Attack Risk
0,0.730390,-0.666696,-1.235978,-0.884788,0.373960,-1.088738,-1.051743,-1.162827,-0.886866,-0.896666,...,-0.608436,-1.543183,-0.476434,-1.661879,0.898740,0.211343,0.416582,0.036891,1.949352,0
1,0.730390,1.658948,0.809076,1.130214,0.373960,0.918495,-1.475444,1.356333,1.127566,-0.896666,...,-0.843166,-1.089327,0.046286,-1.124857,1.174744,0.566306,-0.993175,-0.156160,1.034020,0
2,-1.369131,0.823772,0.809076,-0.884788,-2.674084,-1.088738,-1.427745,0.096753,1.127566,1.115243,...,0.776937,0.272241,-1.521874,-0.587836,1.529607,0.992261,-1.271456,0.880876,0.118689,0
3,0.730390,1.581855,0.809076,1.130214,0.373960,0.918495,-0.033410,-1.162827,1.127566,-0.896666,...,-0.185000,-0.181615,-1.521874,-1.124857,1.095886,1.063253,1.083234,0.343258,1.034020,0
4,0.730390,0.746679,0.809076,1.130214,0.373960,-1.088738,-0.757375,1.356333,1.127566,-0.896666,...,-0.861577,-1.089327,-0.999154,1.202234,-1.743019,0.211343,1.086292,-1.177987,-0.796643,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11243,-1.369131,-0.576754,0.809076,1.130214,0.373960,-1.088738,0.199719,0.096753,-0.886866,-0.896666,...,-0.520987,0.272241,0.569007,-1.124857,-0.875576,0.282335,0.846746,-0.669007,-0.338977,1
11244,0.730390,0.502551,0.809076,1.130214,0.373960,0.918495,-1.178101,0.096753,1.127566,-0.896666,...,0.712501,-1.543183,-1.521874,0.128192,1.017027,-0.214612,0.576109,-0.902567,-0.338977,1
11245,-1.369131,0.502551,-1.235978,-0.884788,0.373960,0.918495,-0.491387,0.096753,-0.886866,-0.896666,...,0.556013,0.726097,0.569007,1.023227,0.977598,-1.066523,0.266228,-0.010180,-1.254308,1
11246,-1.369131,1.427668,-1.235978,-0.884788,0.373960,-1.088738,-0.717980,0.096753,-0.886866,-0.896666,...,-0.493371,-1.543183,1.091727,1.023227,-1.112151,0.921268,1.157646,-0.137321,-0.796643,1


In [3]:
# Separate the feature matrix from the target column.
X = dataset.drop(columns='Heart Attack Risk')   # X is the input data: every column except the target
y = dataset['Heart Attack Risk']                # y is the output data: the class label to predict

# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of `y` the same in the train and test parts, so the reported
# test accuracy is not skewed by an unlucky draw; `random_state` makes the
# split reproducible across kernel restarts.
# NOTE(review): the file name suggests the data was resampled before this
# split. If that resampling duplicated or synthesised minority rows, copies of
# the same sample can land in both train and test and inflate the score.
# Confirm against the preprocessing notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [4]:
# Hyperparameter search for the SVM classifier.
#
# The grid is a list of dicts, one per kernel, so that each kernel is only
# crossed with the parameters it actually uses: `gamma` has no effect on the
# linear kernel and `degree` only affects the polynomial kernel. A single flat
# dict would cross all four lists (4 * 3 * 4 * 3 = 144 candidates) and refit
# many identical models; this grid covers the same distinct models with
# 4 + 16 + 48 = 68 candidates.
c_values = [0.1, 1, 10, 100]          # Regularization parameter
gamma_values = [0.001, 0.01, 0.1, 1]  # Kernel coefficient (rbf / poly)
degree_values = [2, 3, 4]             # Degree of the polynomial kernel

param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
    {'kernel': ['poly'], 'C': c_values, 'gamma': gamma_values, 'degree': degree_values},
]

# Initialize the SVM model
svm_model = SVC()

# 5-fold cross-validated grid search scored on accuracy.
# n_jobs=-1 runs the candidate fits on all available cores; it does not change
# which candidate wins.
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the training data only; the test split stays unseen.
grid_search.fit(X_train, y_train)

# Best hyperparameters. Only the keys relevant to the winning kernel are
# reported (e.g. no 'degree' entry when the best kernel is 'rbf').
best_params = grid_search.best_params_
print(f'Best hyperparameters: {best_params}') # Display the best hyperparameters

# Best estimator: the winning configuration refit on the whole training split.
best_svm_model = grid_search.best_estimator_

# Make predictions on the held-out test split with the best model
y_pred = best_svm_model.predict(X_test)

# Calculate the accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the best SVM model: {accuracy:.2f}') # Display the accuracy of the model

In [None]:
# Exploratory look at the target and how the features relate to it.
# pandas, seaborn and matplotlib are already imported in the first cell; the
# frame loaded earlier is named `dataset` and its target column is
# 'Heart Attack Risk' (the names used by the split cell above).
target_col = 'Heart Attack Risk'

# Analyze the distribution of the target variable
print(dataset[target_col].value_counts())

# Visualize the distribution of the target variable
sns.countplot(x=target_col, data=dataset)
plt.show()

# Analyze the correlation between different features and the target variable
correlation = dataset.corr()
print(correlation[target_col])

# Visualize these correlations using a heatmap.
# The frame has more than twenty columns, so the figure is enlarged to keep
# the per-cell annotations legible.
plt.figure(figsize=(16, 14))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.show()

# Visualize the distribution of key features with respect to the target variable.
# These columns all appear in the displayed dataset; extend the list as needed.
key_features = ['Cholesterol', 'Triglycerides', 'Systolic', 'Diastolic']
for feature in key_features:
    sns.boxplot(x=target_col, y=feature, data=dataset)
    plt.show()