In [1]:
#Importing required Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

### 1. Importing, Understanding, and Inspecting Data

In [2]:
#Importing the Data
df = pd.read_excel("C:/Users/MK064837/Google Drive/Simplilearn/DataSets/Heartattack/data.xlsx")
df.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


##### Variable Description
- age--------age in years
- sex--------(1 = male; 0 = female)
- cp---------chest pain type
- trestbps---resting blood pressure (in mm Hg on admission to the hospital)
- chol-------serum cholestoral in mg/dl
- fbs--------fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- restecg----resting electrocardiographic results
- thalach----maximum heart rate achieved
- exang------exercise induced angina (1 = yes; 0 = no)
- oldpeak----ST depression induced by exercise relative to rest
- slope------the slope of the peak exercise ST segment
- ca---------number of major vessels (0-3) colored by flourosopy
- thal-------3 = normal; 6 = fixed defect; 7 = reversable defect

In [3]:
#Structure of Data
df.shape

(303, 14)

In [4]:
#Checking Null values
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [5]:
#Checking duplicates value
print("Total no of Duplicate values: ",df.duplicated().sum()

SyntaxError: unexpected EOF while parsing (<ipython-input-5-3105e30982db>, line 2)

In [None]:
#Dropping duplicates
df = df.drop_duplicates()
df.duplicated().sum()

In [None]:
#Statistical analysis of dataset
df.describe()

### Conclusions
- The structure of data say it has 14 features and 303 records
- The data set has no missing values
- 1 duplicate record is found adn was dropped

### 2. Performing EDA and Modeling:



In [None]:
#Data preprocessing for plotting
df2 = df.copy()
def chng(sex):
    if sex == 0:
        return "Female"
    else:
        return "Male"
df2['sex'] = df2['sex'].apply(chng)
def chng2(prob):
    if prob == 0:
        return "HeartDisease"
    else:
        return "No Heart Disease"
df2['target'] = df2['target'].apply(chng2)

In [None]:
#Categorical value analysis
f, axes = plt.subplots(2,4,figsize=(17,6))
sns.histplot(data=df['sex'], ax=axes[0,0])
sns.histplot(data=df['cp'], ax=axes[0,1])
sns.histplot(data=df['fbs'], ax=axes[0,2])
sns.histplot(data=df['restecg'], ax=axes[0,3])
sns.histplot(data=df['exang'], ax=axes[1,0])
sns.histplot(data=df['slope'], ax=axes[1,1])
sns.histplot(data=df['ca'], ax=axes[1,2])
sns.histplot(data=df['thal'], ax=axes[1,3])
plt.show()

In [None]:
#CVD across different ages
plt.figure(figsize=(20,6))
sns.countplot(data=df, x='age', hue='target')

#### Conclusion
- CVD is more prominent in people whose age lies between 40-60

In [None]:
#Identifying outliers in resting blood pressure
plt.figure(figsize=(20,4))
sns.boxplot(x=df['trestbps'])

#### Conclusion
- Outliers are observed after 170mg HG

In [None]:
#Heart attack based on anomalies in resting blood pressure of the patient
plt.figure(figsize=(20,4))
print("Total affected in trestbps outliers :",sum((df.trestbps[df.target==1]>=170)))
sns.countplot(x=df.trestbps[df.target==1])
plt.show()

### Conclusion
- We wont be able to to precisely detect the heart attack on anamalies in resting blood pressure as the records in that range(170-200) are insignificant in detection adn cannot confirm that cvd is prominent in that range

In [None]:
#Composition of overall patients w.r.t gender
print("Total womens affected :",sum((df.sex[df.target==1]==0)))
print("Total Mens affected :",sum((df.sex[df.target==1]==1)))
sns.countplot(data= df2, x='sex',hue='target')
plt.title('Gender v/s Target\n')
plt.show()

#### Conclusion
- Mens are more affected by CVD

In [None]:
#Relationship between cholesterol levels and our target variable.
sns.displot(data=df, x='chol',hue='target',bins=20)
plt.show()

### Conclusion
- People with low cholestrol level(200-300) were are more prone to CVD

In [None]:
#Relationship between peak exercising and occurrence of heart attack
sns.countplot(data=df2, x='slope', hue='target')
print("Correlation of peak exercising and CVD :", df.slope.corr(df.target))
plt.show()

#### Conclusion
- People who do more exercise have less chances of getting CVD

In [None]:
#thalassemia a major cause of CVD?
sns.countplot(data=df2, x='thal', hue='target')
print("Corelation between thal and occurance of CVD :", df['thal'].corr(df['target']))
plt.show()

#### Conclusion
- Thalesemia is not a major cause of CVD and the correlation between the target variable and thal is negative

In [None]:
#Relationship between all the given variable
plt.figure(figsize=(20,12))
print("The below heatmap shows the correlation of each variable wrt to other features \n")
sns.heatmap(df.corr(), annot=True)
plt.show()

In [None]:
#Pairplot
sns.pairplot(df)

In [None]:
#Data Scaling (Since the input features have numeric range variation we will use scaling )
Scale = StandardScaler()
X = df.drop(columns='target')
y = df['target']
X = Scale.fit_transform(X)
y = df['target']

In [None]:
# Train and test splitting data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
Model = LogisticRegression()
Model.fit(X_train,y_train)

In [None]:
#predicting on test data
y_pred = Model.predict(X_test)
print("Prediction on test data : \n" , y_pred)

In [None]:
#Validation
print("Confusion Matrix is :")
print(confusion_matrix(y_test,y_pred))
print("Accurancy Score is -", accuracy_score(y_test,y_pred))