In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statistics as st
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from the Excel workbook (path is relative to the working directory).
df = pd.read_excel(io='1645792390_cep1_dataset.xlsx')

## (1)PRELIMINARY DATA ANALYSIS

In [None]:
# Display the loaded DataFrame as this cell's output.
df

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# First 15 rows of the dataset.
df.head(n=15)

In [None]:
# Last rows of the dataset (5 is the pandas default for tail()).
df.tail(n=5)

## (2A)PRELIMINARY STATISTICAL SUMMARY

In [None]:
# Summary statistics produced by DataFrame.describe() for the dataset.
df.describe()

## (2B)IDENTIFICATION OF CATEGORICAL AND NUMERICAL COLUMNS

#### From the given information, the categorical columns are 'sex', 'exang', 'thal' and 'target'.

In [None]:
# Count of patients per 'sex' category.
# The column is passed by keyword: on seaborn >= 0.12 the first positional
# argument is `data`, not `x`, so the positional form no longer means "count this column".
sns.countplot(x='sex', data=df)

In [None]:
# Count of patients per 'exang' category.
# The column is passed by keyword: on seaborn >= 0.12 the first positional
# argument is `data`, not `x`, so the positional form no longer means "count this column".
sns.countplot(x='exang', data=df)

In [None]:
# Count of patients per 'thal' category.
# The column is passed by keyword: on seaborn >= 0.12 the first positional
# argument is `data`, not `x`, so the positional form no longer means "count this column".
sns.countplot(x='thal', data=df)

In [None]:
# Count of patients per 'target' category.
# The column is passed by keyword: on seaborn >= 0.12 the first positional
# argument is `data`, not `x`, so the positional form no longer means "count this column".
sns.countplot(x='target', data=df)

## (2C)STUDY OF CVD ACROSS AGE CATEGORY

In [None]:
# K holds the 'age' column and k the 'target' column; both are plotted in the next cell.
K, k = df['age'], df['target']

In [None]:
# Create and size the figure BEFORE drawing. The original called plt.Figure(...)
# (capital F) after the plot, which only builds an unattached Figure object and
# never resizes the chart. The dimensions are the ones the author specified.
plt.figure(figsize=(25,35))
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so sns.boxplot(k, K) raises TypeError there.
sns.boxplot(x=k,y=K)

## (2D)COMPOSITION OF PATIENTS WITH RESPECT TO SEX CATEGORY

In [None]:
# Number of rows for each distinct value of the 'sex' column.
data = df.loc[:, 'sex'].value_counts()

In [None]:
# Display the per-category counts computed from the 'sex' column.
data

In [None]:
# Pie chart of the patient split by sex.
# value_counts() orders its result by frequency, so each wedge label is derived
# from its category code instead of assuming a fixed ('male', 'female') order.
# NOTE(review): assumes the encoding 1 = male, 0 = female - confirm against the data dictionary.
sex_names = {1: 'male', 0: 'female'}
plt.pie(data, labels=[sex_names.get(code, code) for code in data.index], autopct='%1.0f%%')

# HEATMAP SHOWING THE CORRELATION OF VARIOUS FEATURES

In [None]:
# Annotated heatmap of the pairwise correlations between the dataset's columns.
plt.figure(figsize=(20, 15))
sns.heatmap(data=df.corr(), annot=True)

## (2E)RELATION BETWEEN CVD AND BLOOD PRESSURE AT REST

In [None]:
# X is the 'target' column and Y the resting blood pressure column ('trestbps').
X, Y = df['target'], df['trestbps']

In [None]:
# Distribution of Y for each value of X.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so sns.boxplot(X, Y) raises TypeError there.
sns.boxplot(x=X,y=Y)

#### THIS SHOWS THAT THE RESTING BLOOD PRESSURE DOESN'T HAVE MUCH EFFECT ON CVD. ALSO, THE CORRELATION BETWEEN REST BP AND TARGET FROM THE HEATMAP IS 0.14, WHICH SHOWS THAT ONE CANNOT DETECT HEART ATTACKS PURELY BASED ON THE BLOOD PRESSURE AT REST

## (2F)RELATION BETWEEN CVDS AND CHOLESTEROL LEVELS

In [None]:
# X is the 'target' column and Y the 'chol' column.
X, Y = df['target'], df['chol']

In [None]:
# Size the figure BEFORE drawing. Calling plt.figure(...) after the plot opens a
# second, empty figure and leaves the boxplot at the default size.
plt.figure(figsize=(35,25))
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so sns.boxplot(X, Y) raises TypeError there.
sns.boxplot(x=X,y=Y)

#### HENCE, FROM THE ABOVE PLOT AND THE CORRELATION (HEATMAP = 0.085), IT CAN BE CONCLUDED THAT THE RISK OF CVDs IS LARGELY UNAFFECTED BY CHOLESTEROL LEVELS IN THE NORMAL RANGE, BUT IN THE HIGHER CHOLESTEROL RANGES PATIENTS ARE MORE PRONE TO THE RISK

## (2G)RELATION BETWEEN PEAK EXERCISING AND CVDs

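#### In the data, oldpeak is the ST depression caused by peak exercising (it is not a change in heart rate; thalach is the maximum heart rate observed). It is noticeably correlated with the risk of CVDs (target) in a negative manner (c = -0.43), so the two variables are negatively correlated. Note that this is an association in this dataset; on its own it does not establish that exercising reduces the risk of CVDs.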

In [None]:
# X is the 'target' column and Y the 'oldpeak' column; both are reused by the next cell.
X=df['target']
Y=df['oldpeak']
# Draw markers only: each row is a separate observation, so joining consecutive
# rows with line segments (the default for plt.plot) conveys nothing.
plt.plot(X,Y,'o')

In [None]:
# Distribution of Y for each value of X.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so sns.boxplot(X, Y) raises TypeError there.
sns.boxplot(x=X,y=Y)

## (2H)RELATION BETWEEN THALASSEMIA AND CVDs

#### FROM THE HEATMAP, THE CORRELATION BETWEEN THE TWO VARIABLES IS -0.34, i.e. THALASSEMIA TYPE IS NOTICEABLY (NEGATIVELY) ASSOCIATED WITH CVDs

## (2I)OTHER FACTORS AFFECTING CVDs

##### From the heatmap, the various factors and their effect on CVDs can be read off. The factors that affect cardiovascular health the most are:
###### POSITIVE FACTORS(NEGATIVE CORRELATION)
1. exang-exercise induced angina
2. oldpeak-st depression caused due to peak exercising
3. ca-number of major vessels
4. thal-thalassemia type
5. age-age of patients
###### NEGATIVE FACTORS(POSITIVE CORRELATION)
1. cp-chest pain type
2. thalach-max heartrate observed
3. slope-Slope of the peak exercise ST segment

## (2J)PAIRPLOTS

In [None]:
# Grid of pairwise plots across the dataset's columns.
sns.pairplot(data=df)

# MODEL BUILDING USING A MAJORITY-VOTE ENSEMBLE (DECISION TREE, K-NEAREST NEIGHBOURS, LOGISTIC REGRESSION) AND LOGISTIC REGRESSION ALONE, AND THEIR COMPARISON

In [None]:
# Display the DataFrame again before splitting it into features and label.
df

In [None]:
# Features: every column except the last. Label: the last column.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
from sklearn.tree import  DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# The three base classifiers that are later combined by majority vote.
# Fixed seed: an unseeded decision tree can differ between runs, which makes the
# reported metrics non-reproducible.
model1=DecisionTreeClassifier(random_state=0)
model2=KNeighborsClassifier()
# Raised iteration cap: the features are passed in unscaled, so the solver can
# stop at the default limit before converging. The resulting ConvergenceWarning
# would be hidden by the notebook's global warnings filter.
model3=LogisticRegression(max_iter=1000)

In [None]:
# Train all three classifiers on the training split.
for estimator in (model1, model2):
    estimator.fit(x_train, y_train)
# Fitted last and left as the final expression, so the cell still shows this model's repr.
model3.fit(x_train, y_train)

In [None]:
# Test-set predictions of each classifier, in the order model1, model2, model3.
pred1, pred2, pred3 = [
    estimator.predict(x_test) for estimator in (model1, model2, model3)
]

In [None]:
# Majority vote: for every test row take the most common label among the three
# classifiers' predictions.
# Built in one pass. The original grew the array with np.append inside the loop,
# which copies it each iteration, and started from np.array([]), which is float64
# and so coerced the class labels to floats.
final_predict=np.array([st.mode(votes) for votes in zip(pred1,pred2,pred3)])
print(final_predict)

In [None]:
from sklearn import metrics
# Confusion matrix of the ensemble predictions against the true test labels,
# wrapped in a display object and drawn.
confusion_matrix = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_test, final_predict),
    display_labels=[False, True],
)
confusion_matrix.plot()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the ensemble predictions on the test split.
accuracy_score(y_true=y_test, y_pred=final_predict)

In [None]:
from sklearn.metrics import precision_score
# Per-class precision of the ensemble predictions (average=None returns one score per class).
precision_score(y_true=y_test, y_pred=final_predict, average=None)

In [None]:
# F1 score of the ensemble predictions on the test split.
f1score = metrics.f1_score(y_true=y_test, y_pred=final_predict)
print(f1score)

In [None]:
# Display the test-set predictions of model3 (the LogisticRegression model).
pred3

In [None]:
# Confusion matrix of model3's predictions against the true test labels,
# wrapped in a display object and drawn.
confusion_matrix = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_test, pred3),
    display_labels=[False, True],
)
confusion_matrix.plot()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of model3's predictions on the test split.
accuracy_score(y_true=y_test, y_pred=pred3)

In [None]:
from sklearn.metrics import precision_score
# Per-class precision of model3's predictions (average=None returns one score per class).
# Scores pred3, like the other metrics in this section. The original passed
# final_predict here, which just repeated the ensemble's precision.
precision_score(y_test,pred3, average=None)

In [None]:
# F1 score of model3's predictions on the test split.
f1score = metrics.f1_score(y_true=y_test, y_pred=pred3)
print(f1score)