In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [2]:
# Load the heart-failure CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/heart-failure-prediction/heart.csv')
df.head()

In [5]:
def data_explore(dataframe):
    """Print a plain-text exploratory summary of ``dataframe``.

    Sections, in order: shape, ``DataFrame.info()`` report, ``describe()``
    with pandas' default column selection, ``describe()`` restricted to the
    columns that are neither ``float`` nor ``int64``, missing-value counts,
    and missing-value percentages (both sorted in descending order).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    separator = '*' * 70
    non_numeric_dtypes = ['float', 'int64']  # dtypes excluded from the categorical summary

    print("DATA EXPLORATION")
    print(separator)
    print("Shape of dataset : ", dataframe.shape)
    print(separator)
    # info() writes its report itself and returns None; wrapping it in
    # print() would emit a stray "None" line after the report.
    dataframe.info()
    print(separator)
    print("STATISTICAL ANALYSIS OF NUMERICAL DATA")
    print(separator)
    print(dataframe.describe().T)
    print(separator)
    print("STATISTICAL ANALYSIS OF CATEGORICAL DATA")
    print(separator)
    # describe(exclude=...) raises ValueError when the exclusion leaves no
    # columns, so check the selection first and report its absence instead.
    if dataframe.select_dtypes(exclude=non_numeric_dtypes).shape[1] == 0:
        print("No non-numeric columns to describe")
    else:
        print(dataframe.describe(exclude=non_numeric_dtypes).T)
    print(separator)
    print("MISSING VALUES")
    print(separator)
    print(dataframe.isna().sum().sort_values(ascending=False))
    print(separator)
    print("MISSING VALUES IN %")
    print(separator)
    missing_share = dataframe.isnull().sum() / len(dataframe)
    print(round(100 * missing_share.sort_values(ascending=False), 2))
    print(separator)

In [6]:
# Print the exploratory summary (shape, info, describe, missing values) for the loaded frame.
data_explore(df)

In [7]:
# Names of the columns stored with object dtype (the categorical features).
cat_cols = [name for name, dtype in df.dtypes.items() if dtype == 'O']

In [8]:
# One count plot per categorical feature on a 2x2 grid, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(15,10))
for column, axis in zip(['RestingECG', 'ChestPainType', 'ExerciseAngina', 'Sex'], axes.flat):
    sns.countplot(x = df[column], data = df, palette='husl', ax=axis)
plt.show()

In [9]:
# Names of the columns whose dtype is anything other than object (the numeric features).
num_col = [name for name, dtype in df.dtypes.items() if dtype != 'O']

In [10]:
# Annotated heatmap of the pairwise correlations.
# The frame still contains string columns here (they are one-hot encoded in a
# later cell), and DataFrame.corr() raises on non-numeric data in pandas >= 2.0,
# so restrict the computation to numeric/boolean columns explicitly.
plt.figure(figsize=(6,3), dpi=150)
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True)

In [11]:
# Correlation of every numeric feature with the target, strongest positive first.
# Non-numeric columns are filtered out first because DataFrame.corr() raises
# on string data in pandas >= 2.0.
df.select_dtypes(include=['number', 'bool']).corr()['HeartDisease'].sort_values(ascending=False)

In [12]:
# Draw one histogram per numeric feature, each on its own small figure.
for feature in num_col:
    hist_fig, hist_ax = plt.subplots(figsize=(5, 3), dpi=150)
    sns.histplot(df[feature], ax=hist_ax)

In [15]:
# One-hot encode the categorical columns with pandas' default get_dummies settings.
# NOTE: this rebinds `df`; cells above that read the original categorical columns
# (e.g. the count plots) must not be re-run after this cell without reloading the data.
df=pd.get_dummies(df)

In [17]:
# Separate the target column from the predictor columns.
y = df['HeartDisease']
X = df.drop(columns=['HeartDisease'])

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, plot_confusion_matrix,classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import GridSearchCV

In [23]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

In [26]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to the test split.
pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])
scaled_X_train = pipeline.fit_transform(X_train)
scaled_X_test = pipeline.transform(X_test)

In [28]:
# Baseline support-vector classifier with default hyper-parameters,
# trained on the standardised training split (fit() returns the estimator itself).
model_svc = SVC().fit(scaled_X_train, y_train)
model_svc

In [29]:
# Show the hyper-parameters the baseline SVC was created with.
model_svc.get_params()

In [30]:
# Score the baseline SVC on the standardised test split.
pred_svc = model_svc.predict(scaled_X_test)
accuracy_score(y_true=y_test, y_pred=pred_svc)

In [31]:
# Per-class precision / recall / F1 for the baseline SVC predictions.
print(classification_report(y_test,pred_svc))

In [41]:
# Hyper-parameter search space for the SVC.
# `gamma` is a kernel coefficient that the linear kernel does not use, so
# crossing it with kernel='linear' would refit the identical linear model once
# per gamma value. Each kernel therefore gets its own sub-grid: C x gamma for
# 'rbf', C only for 'linear'. When a linear model wins, best_params_ carries no
# 'gamma' entry.
svm = SVC()
param_grid = [
    {'kernel': ['rbf'],
     'C': [0.1, 1, 10, 100, 1000],
     'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
    {'kernel': ['linear'],
     'C': [0.1, 1, 10, 100, 1000]},
]
grid = GridSearchCV(svm,param_grid)

In [35]:
# Run the cross-validated grid search on the standardised training split.
grid.fit(scaled_X_train,y_train)

In [36]:
# Score the fitted grid search on the standardised test split.
grid_svc = grid.predict(scaled_X_test)
accuracy_score(y_true=y_test, y_pred=grid_svc)

In [37]:
# Per-class precision / recall / F1 for the grid-search predictions.
print(classification_report(y_test,grid_svc))

In [38]:
# Report the winning grid point, then the full parameter set of the selected estimator.
print(grid.best_params_)
best_svc_settings = grid.best_estimator_.get_params()
print(best_svc_settings)