<a href="https://colab.research.google.com/github/MileneBedouhene/Heart-Disease-Prediction/blob/main/Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only: opens a browser upload dialog. The following cell copies
# kaggle.json out of the working directory, so that is presumably the file
# to select here.
from google.colab import files

# files.upload() returns the uploaded files as a {filename: bytes} dict. As a
# bare last expression that dict would be echoed into the cell output (and
# saved with the notebook), exposing the file contents. Binding it to a name
# keeps the contents out of the rendered output.
uploaded = files.upload()

In [None]:
# NOTE(review): `os` is not referenced by any cell visible in this notebook.
import os
# Copy kaggle.json from the working directory into ~/.kaggle/ (created if
# missing) -- presumably where the `kaggle` CLI used below looks for it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Fetch the sulianova/cardiovascular-disease-dataset archive with the Kaggle CLI.
!kaggle datasets download -d sulianova/cardiovascular-disease-dataset

In [None]:
# Extract the downloaded archive into ./data.
# NOTE(review): later cells read 'cardio_train.csv' / 'cardio_test.csv' from the
# working directory, not from ./data -- confirm where those files come from.
!unzip cardiovascular-disease-dataset.zip -d ./data

#### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Import Data

In [None]:
# Load the training table from the working directory and preview the first rows.
train = pd.read_csv('cardio_train.csv')
train.head()

In [None]:
# Column names, dtypes and non-null counts of the raw training frame.
train.info()

In [None]:
# Summary statistics of the numeric columns.
train.describe()

#### Data Preprocessing

In [None]:
# Remove the two columns that are not used as model features, one at a time
# and in place.
for unused_column in ('Unnamed: 0', 'id'):
    train.drop(columns=unused_column, inplace=True)

In [None]:
# New "Age" column: the raw "age" value divided by 365 and rounded to a whole
# number (presumably days -> years; confirm the unit of "age").
train["Age"] = train["age"].div(365).round(0)

# Write the current state of the frame to disk, without the index.
train.to_csv("train_data.csv", index=False)

In [None]:
# The raw "age" column is superseded by "Age"; remove it in place.
train.drop(columns='age', inplace=True)

In [None]:
# Exclude "gender" from the feature set (in place).
train.drop(columns='gender', inplace=True)

In [None]:
# Number of missing values per column.
train.isna().sum()

In [None]:
# Discard every row that has at least one missing value (in place).
train.dropna(axis=0, how='any', inplace=True)

In [None]:
# Round every column to the nearest whole number and store it as int.
# Runs after the NaN rows were dropped, so the integer cast has no missing
# values to trip over.
train = train.round().astype(int)


In [None]:
# Preview the frame after the preprocessing cells above.
train.head()

#### Data Visualization

In [None]:
# Scatter of the 'ap_hi' reading against the 'cardio' label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=train, x='cardio', y='ap_hi', ax=ax)
ax.set_title('Blood pressure according to cardio')
ax.set_xlabel('Cardio')
ax.set_ylabel('Blood Pressure')
plt.show()

In [None]:
# Scatter of the derived 'Age' column against the 'cardio' label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=train, x='cardio', y='Age', ax=ax)
ax.set_title('Age according to cardio')
ax.set_xlabel('Cardio')
ax.set_ylabel('Age')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of the
# preprocessed training frame.
plt.figure(figsize=(12, 10))
sns.heatmap(train.corr(), annot=True, cmap='coolwarm')
# Title spelling corrected (was 'Correlation Metrix').
plt.title('Correlation Matrix')
plt.show()

#### Handling Outliers

In [None]:
# IQR rule on 'ap_hi': rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed.
ap_hi = train['ap_hi']
Q1 = ap_hi.quantile(0.25)
Q3 = ap_hi.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (ap_hi < lower_fence) | (ap_hi > upper_fence)

# NOTE(review): `train` is rebound to the filtered frame, so re-running this
# cell recomputes the quartiles on already-filtered data.
train = train[~is_outlier]

#### Splitting The Data

In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'cardio' column; every other remaining column is a feature.
y = train['cardio']
X = train.drop(columns='cardio')

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Training The Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Random forest with 100 trees and a fixed seed, fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions on the held-out split, used by the evaluation cell.
y_pred = model.predict(X_test)

In [None]:
# Hold-out evaluation: confusion matrix and plain accuracy.
result = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:", result, sep="\n")

print("Accuracy:", accuracy)

In [None]:
# Load the table to predict on from the working directory and preview it.
test = pd.read_csv('cardio_test.csv')
test.head()

In [None]:
# Keep the 'id' column before it is dropped from the features; it is reused
# as the 'ID' column of the submission file.
# NOTE(review): this name shadows Python's builtin `id()` for the rest of the
# session; renaming it requires updating the submission cell as well.
id = test['id']

In [None]:
# Remove, one at a time and in place, the same non-feature columns that were
# removed from the training frame.
for unused_column in ('Unnamed: 0', 'id', 'gender'):
    test.drop(columns=unused_column, inplace=True)

In [None]:
# Preview the test frame after the column drops.
test.head()

In [None]:
# New "Age" column: the raw "age" value divided by 365 and rounded to a whole
# number, mirroring the transformation applied to the training frame.
test["Age"] = (test["age"] / 365).round(0)

# Save the intermediate frame under its own name. Writing it back to
# "cardio_test.csv" overwrote the raw input with a frame that no longer has
# the 'id' column, so a second run of the notebook failed at `test['id']`.
test.to_csv("test_data.csv", index=False)

# The raw "age" column is no longer needed once "Age" exists.
test.drop('age', axis=1, inplace=True)

In [None]:
# The label column must not be passed to the model as a feature; remove it in place.
test.drop(columns='cardio', inplace=True)

In [None]:
# Round every feature column to the nearest whole number and store it as int,
# matching the dtype treatment of the training frame.
# NOTE(review): no dropna is applied to `test` in the visible cells; the
# integer cast will raise if any feature value is missing.
test = test.round().astype(int)

In [None]:
# Predict the label for every row of the prepared test frame.
test_predictions = model.predict(test)

# Two-column submission table: the saved ids plus the predicted label.
# The frame takes its index from the `id` Series; predictions are attached
# by position.
submission = pd.DataFrame({'ID': id})
submission['cardio'] = test_predictions

submission.to_csv('submission.csv', index=False)