In [None]:
import pandas as pd
import joblib
import seaborn as sns
from sklearn.model_selection import train_test_split
from tkinter import *
import matplotlib.pyplot as plt

In [None]:
# Load heart.csv (path is relative, so it is resolved against the current
# working directory) and preview the first rows.
data = pd.read_csv('heart.csv')
data.head()

In [None]:
# Column names, dtypes and non-null counts for the loaded frame.
data.info()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Does the frame contain at least one fully duplicated row?
duplicate_mask = data.duplicated()
data_dup = duplicate_mask.any()
print(data_dup)

In [None]:
# Remove duplicated rows, keeping the first occurrence of each.
data = data.drop_duplicates(keep='first')

In [None]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) per numeric column.
data.describe() #statistical summary

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
corr_matrix = data.corr()
plt.figure(figsize=(17, 6))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# How many rows fall into each value of the target (disease) column.
data['target'].value_counts() #to check how many people actually have disease

In [None]:
# Bar chart of the number of rows per target value.
sns.countplot(data=data, x="target")

In [None]:
# Number of rows per value of the sex column (male / female totals).
data['sex'].value_counts() #counting total number of males and females

In [None]:
# Bar chart of row counts per sex; the 0/1 codes are shown as Female/Male.
ax = sns.countplot(data=data, x="sex")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Female', 'Male'])
plt.show()

In [None]:
# Gender distribution split by target: one bar per (sex, target) combination.
ax = sns.countplot(data=data, x='sex', hue='target')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Female', 'Male'])
ax.legend(labels=['No Disease', 'Disease'])
plt.show()

In [None]:
# Age distribution: density-normalised histogram with a KDE curve overlaid.
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# histplot with kde=True, stat='density' and a KDE cut of 3 is the documented
# axes-level replacement that reproduces distplot's default figure.
sns.histplot(data['age'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.title('Age distribution')
plt.show()

In [None]:
#Check chest pain type
#0: Typical angina
#1: Atypical angina
#2: Non-anginal pain
#3: Asymptomatic

In [None]:
# Row counts per chest pain type, with the 0-3 codes replaced by readable names.
cp_labels = ['Typical angina','Atypical angina','Non-anginal pain','Asymptomatic']
sns.countplot(data=data, x="cp")
plt.xticks([0, 1, 2, 3], cp_labels, rotation=25)
plt.show()

In [None]:
# Chest pain type counts split by target (disease / no disease).
cp_labels = ['Typical angina','Atypical angina','Non-anginal pain','Asymptomatic']
ax = sns.countplot(data=data, x='cp', hue='target')
plt.xticks([0, 1, 2, 3], cp_labels, rotation=25)
ax.legend(labels=['No Disease', 'Disease'])
plt.show()

In [None]:
# Fasting blood sugar (fbs) counts split by target; the 0/1 codes are shown
# as below / above 120 mg/dl.
ax = sns.countplot(data=data, x='fbs', hue='target')
ax.set_xticks([0, 1])
ax.set_xticklabels(['<120 mg/dl', '>120 mg/dl'])
ax.legend(labels=['No Disease', 'Disease'])
plt.show()

In [None]:
# Histogram of resting blood pressure (trestbps).
data['trestbps'].hist() #resting blood pressure

In [None]:
# Resting blood pressure (trestbps) density, one filled KDE curve per sex.
# FacetGrid draws the hue levels in hue_order, and plt.legend(labels=...) assigns
# its labels to the curves in that same drawing order. With sex coded
# 0 = Female, 1 = Male (the coding used for the tick labels of the sex count
# plots), the labels must therefore be listed as Female first, then Male.
g = sns.FacetGrid(data, hue='sex', hue_order=[0, 1], aspect=4)
g.map(sns.kdeplot, 'trestbps', fill=True)
plt.legend(labels=['Female', 'Male'])

In [None]:
# Histogram showing the distribution of cholesterol (chol).
data['chol'].hist()

In [None]:
# Separate categorical and continuous variables: a column with at most 10
# distinct values is treated as categorical, anything else as continuous.
categorical_val, cont_val = [], []
for column in data.columns:
    is_categorical = data[column].nunique() <= 10
    bucket = categorical_val if is_categorical else cont_val
    bucket.append(column)

In [None]:
# Columns classified as categorical (10 or fewer distinct values).
categorical_val

In [None]:
# Columns classified as continuous (more than 10 distinct values).
cont_val

In [None]:
# One histogram per continuous column, laid out on a single figure.
data.hist(column=cont_val, figsize=(10, 6))
plt.tight_layout()

In [None]:
#Encoding categorical data by creating dummy variables, e.g.:
'''
cp: 0 1 2 3
    1 0 0 0
    0 1 0 0
    0 0 1 0
    0 0 0 1
Dummy variable trap: a scenario in which two or more variables are highly correlated; in simple terms, one variable can be predicted from the others. To prevent this, we remove the first column from each set of dummy variables.
'''

In [None]:
# Exclude two columns from encoding: 'target' is the disease outcome column and
# 'sex' is already a single 0/1 column, so neither needs dummy variables.
# The list is rebuilt instead of calling .remove(), so re-running this cell does
# not raise ValueError once the names are already gone.
categorical_val = [col for col in categorical_val if col not in ('target', 'sex')]

# One-hot encode the remaining categorical columns; drop_first=True drops one
# level per feature to avoid the dummy variable trap.
# Only columns still present in `data` are encoded: after the first run the
# originals have been replaced by their dummies, so a second run is a no-op
# rather than a KeyError.
columns_to_encode = [col for col in categorical_val if col in data.columns]
if columns_to_encode:
    data = pd.get_dummies(data, columns=columns_to_encode, drop_first=True)

In [None]:
# Preview the frame after dummy-variable encoding.
data.head()