In [None]:
pip install kaggle

In [None]:
# Create the Kaggle config directory; -p keeps the cell re-runnable when the
# directory already exists (plain mkdir errors out in that case).
!mkdir -p ~/.kaggle

In [None]:
# Copy kaggle.json from the current working directory into ~/.kaggle/
# (presumably the Kaggle API credentials file - it must be uploaded beforehand).
! cp kaggle.json ~/.kaggle/

In [None]:
# Restrict kaggle.json to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive explicitly into /content, which is where the
# unzip step looks for it, instead of relying on the current working directory.
!kaggle datasets download -d andrewmvd/heart-failure-clinical-data -p /content

In [None]:
# Extract the archive. -o overwrites existing files without asking, so a re-run
# does not stall on unzip's interactive "replace ...?" prompt.
!unzip -o /content/heart-failure-clinical-data.zip -d /content/heartfailure/

Archive:  /content/heart-failure-clinical-data.zip
replace /content/heartfailure/heart_failure_clinical_records_dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import numpy as np
import pandas as pd

# Load the extracted CSV into the working DataFrame.
csv_path = "/content/heartfailure/heart_failure_clinical_records_dataset.csv"
df = pd.read_csv(csv_path)

In [None]:
# Display the loaded DataFrame.
df

In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Pairwise correlation matrix of the columns of df.
df.corr()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Scatter of the outcome flag against age. Drawing on the axes created here
# (rather than via the pyplot state machine) and labelling them lets the figure
# be read on its own.
fig, ax = plt.subplots(figsize=(4, 5))
ax.scatter(df['age'], df['DEATH_EVENT'])
ax.set_xlabel('age')
ax.set_ylabel('DEATH_EVENT')
ax.set_title('DEATH_EVENT vs. age')
plt.show()

In [None]:
# Bar plot of DEATH_EVENT aggregated per distinct age value.
sns.barplot(data=df, x='age', y='DEATH_EVENT')

In [None]:
# List the column labels of df.
df.columns

In [None]:
# Bar plot of sex per age value, split into bars by DEATH_EVENT.
sns.barplot(data=df, x='age', y='sex', hue='DEATH_EVENT')

In [None]:
# Print column dtypes and non-null counts.
df.info()

In [None]:
# Per-column flag: True when the column contains at least one missing value.
df.isnull().any()

In [None]:
# Summary statistics for the columns of df (same call as earlier in the notebook).
df.describe()

In [None]:
# Annotated heatmap of the correlation matrix. plt.subplots returns a
# (figure, axes) pair, so unpack it and draw on the axes explicitly instead of
# binding the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(df.corr(), annot=True, fmt='.1f', ax=ax)

In [None]:
# Remove the columns excluded from modelling. Reassigning instead of using
# inplace=True, together with errors='ignore', keeps the cell idempotent when it
# is re-run; columns= replaces axis=True, which only worked because True == 1.
df = df.drop(columns=['time', 'ejection_fraction', 'serum_sodium'], errors='ignore')

In [None]:
# Preview the first five rows of df.
df.head(5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the frame into the target column and the remaining feature columns.
y = df['DEATH_EVENT']
x = df.drop(columns='DEATH_EVENT')

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Load Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb



In [None]:
# Model evaluation metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [None]:
from pandas.plotting import scatter_matrix
from matplotlib import pyplot

In [None]:
from xgboost.sklearn import XGBClassifier

# Candidate classifiers as (short label, unfitted estimator) pairs, compared by
# cross-validation further down.
# LogisticRegression no longer passes multi_class='ovr': the parameter is
# deprecated in recent scikit-learn, and with solver='liblinear' the default
# already resolves to one-vs-rest, so behaviour is unchanged.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
    ('ADA', AdaBoostClassifier()),
    ('XGB', XGBClassifier()),
]

In [None]:
# Score every candidate with the same 4-fold stratified, shuffled split of the
# training data; keep the per-fold accuracies and labels for the comparison plot.
results, names = [], []
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)

for name, model in models:
    fold_scores = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=kfold)
    results.append(fold_scores)
    names.append(name)
    print("cv_result", fold_scores)
    # label: mean accuracy (standard deviation across folds)
    print('%s: %f (%f)' % (name, fold_scores.mean(), fold_scores.std()))
    print("")

In [None]:
# Compare algorithms: one box per model, built from its per-fold CV accuracies.
# Tick labels are set on the axes rather than through boxplot's `labels=`
# keyword, which is deprecated in recent Matplotlib releases.
fig, ax = pyplot.subplots()
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel('Cross-validated accuracy')
ax.set_title('Algorithm Comparison')
pyplot.show()

In [None]:
# Standardize the training features and fit a logistic regression on them.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
model = LogisticRegression()
model.fit(x_train_scaled, y_train)

In [None]:
# The model was fitted on standardized features, so the test features must go
# through the same fitted scaler before prediction; predicting on raw x_test
# feeds the model values on a different scale than it was trained on.
prediction = model.predict(scaler.transform(x_test))

In [None]:
# Report accuracy, the confusion matrix and the per-class report for the
# test-set predictions, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, prediction))

In [None]:
# Standardize the training features and fit a seeded 200-tree random forest on them.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
model = RandomForestClassifier(random_state=42, n_estimators=200)
model.fit(x_train_scaled, y_train)

In [None]:
# The forest was fitted on standardized features, so transform the test features
# with the same fitted scaler; raw x_test would be compared against split
# thresholds learned on the standardized scale.
prediction = model.predict(scaler.transform(x_test))

In [None]:
# Report accuracy, the confusion matrix and the per-class report for the
# test-set predictions, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, prediction))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Exhaustive 3-fold search over forest size, depth and split threshold, fitted
# on the standardized training features. The base estimator is seeded so the
# selected hyper-parameters are reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
param_grid = [
    {
        "n_estimators": [10, 100, 200, 500],
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 3, 4],
    }
]

grid_search = GridSearchCV(clf, param_grid, cv=3, scoring="accuracy", return_train_score=True)

grid_search.fit(x_train_scaled, y_train)


In [None]:
# Keep the estimator that the grid search selected as best.
prod_final_clf = grid_search.best_estimator_

In [None]:
# Display the selected estimator.
prod_final_clf

In [None]:
# The grid search was fitted on x_train_scaled, so the test features must pass
# through the same fitted scaler before prediction instead of being used raw.
predictions = prod_final_clf.predict(scaler.transform(x_test))

In [None]:
# Display the predicted labels for the test set.
predictions

In [None]:
# Sweep the forest size and record test-set accuracy for each setting.
# Features are used unscaled for both fitting and prediction here.
n_estimators_values = [10, 50, 100, 150, 200]
accuracy_scores = []
for n_estimators in n_estimators_values:
    # Seeded forest with the current number of trees, fitted on the training split
    rf_classifier = RandomForestClassifier(random_state=42, n_estimators=n_estimators)
    rf_classifier.fit(x_train, y_train)

    # Score on the held-out split and collect the result
    y_pred = rf_classifier.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

# Accuracy as a function of the number of trees
plt.plot(n_estimators_values, accuracy_scores, marker='o')
plt.title('Accuracy vs. Number of Trees in Random Forest')
plt.xlabel('Number of Trees (n_estimators)')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()