In [None]:
#libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
#preprocess
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
#models
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
#check
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score, precision_score
#save
import pickle as pk

In [None]:
#!pip install xgboost

In [None]:
#pip install imbalanced-learn

In [None]:
#!pip install seaborn

In [None]:
# Load the heart-disease CSV into the working frame `df`.
# NOTE(review): absolute Kaggle input path — presumably only resolves inside a Kaggle
# kernel with this dataset attached; confirm before running elsewhere.
df = pd.read_csv('/kaggle/input/hart-deasease-dataset/Heart Disease.csv')

In [None]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

In [None]:
# Preview the last rows as well, to check the end of the file parsed cleanly.
df.tail()

In [None]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

In [None]:
# Take three independent snapshots of the raw frame before `df` is modified by
# the cleaning and encoding cells further down.
# NOTE(review): df1/df2/df3 are not referenced anywhere in the visible notebook.
df1, df2, df3 = (df.copy() for _ in range(3))

# Explore Data

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Column labels, for reference when selecting columns in the cells below.
df.columns

# Check Null Values

In [None]:
# Number of null values in each column.
df.isna().sum()

In [None]:
# Single boolean for the whole frame: True if at least one cell is null.
df.isna().values.any()

In [None]:
# Class balance of the target column.
df['HeartDisease'].value_counts()

# Duplicated values

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# This mutates `df` in place, so frames displayed by earlier cells no longer
# reflect the current contents of `df` once this cell has run.
df.drop_duplicates(inplace=True)

In [None]:
# Shape after de-duplication, for comparison with the raw shape shown earlier.
df.shape

# Data Analysis & Visualization using Seaborn and Matplotlib (pyplot)

In [None]:
# Pie chart of the share of each HeartDisease class.
# The counts are computed once and reused for both the wedge sizes and the labels,
# so the two can never get out of step.
disease_counts = df['HeartDisease'].value_counts()
plt.pie(x=disease_counts, autopct='%1.2f%%', labels=disease_counts.index)
plt.title('the Ratio of Heart Disease')
# Must be a call: a bare `plt.show` only evaluates the function object and
# never renders/flushes the figure.
plt.show()

In [None]:
# Bar chart of the number of respondents of each sex.
sex_column = df['Sex']
sns.countplot(x=sex_column, palette='GnBu_d')
plt.show()

In [None]:
# Row count for every (Sex, HeartDisease) combination.
by_sex_and_disease = df.groupby(['Sex', 'HeartDisease'])
by_sex_and_disease['HeartDisease'].count()

In [None]:
# Heart-disease frequency within each sex, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=df['Sex'], data=df, hue='HeartDisease', palette="winter", ax=ax)
ax.set_title('Heart Disease Frequency for Sex')
ax.set_xlabel('Sex')
ax.set_ylabel('Frequency')
# Replace the hue legend text with readable labels.
# NOTE(review): assumes the first hue level is the "no disease" class — confirm.
ax.legend(["Haven't Disease", "Have Disease"])
plt.show()

In [None]:
# Number of respondents in each age bracket.
age_bracket_counts = df['AgeCategory'].value_counts()
age_bracket_counts

In [None]:
# Row count for every (AgeCategory, HeartDisease) combination.
by_age_and_disease = df.groupby(['AgeCategory', 'HeartDisease'])
by_age_and_disease['HeartDisease'].count()

In [None]:
# Heart-disease frequency within each age bracket, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(x='AgeCategory', data=df, hue='HeartDisease', palette="winter", ax=ax)
ax.set_title('Heart Disease Frequency for AgeCategory')
ax.set_xlabel('AgeCategory')
ax.set_ylabel('Frequency')
# NOTE(review): assumes the first hue level is the "no disease" class — confirm.
ax.legend(["Haven't Disease", "Have Disease"])
plt.show()

In [None]:
# Number of respondents per Smoking value.
smoking_counts = df['Smoking'].value_counts()
smoking_counts

In [None]:
# Row count for every (Sex, AgeCategory, Smoking, HeartDisease) combination.
smoking_group_keys = ['Sex', 'AgeCategory', 'Smoking', 'HeartDisease']
df.groupby(smoking_group_keys)['HeartDisease'].count()

In [None]:
# Heart-disease frequency for each Smoking value, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Smoking', data=df, hue='HeartDisease', palette='winter', ax=ax)
ax.set_title('Heart Disease of Smoking people')
ax.set_xlabel('Smoking')
# NOTE(review): assumes the first hue level is the "no disease" class — confirm.
ax.legend(["Haven't Disease", "Have Disease"])
plt.show()

In [None]:
# Number of respondents per AlcoholDrinking value.
alcohol_counts = df['AlcoholDrinking'].value_counts()
alcohol_counts

In [None]:
# Row count for every (AlcoholDrinking, HeartDisease) combination.
by_alcohol_and_disease = df.groupby(['AlcoholDrinking', 'HeartDisease'])
by_alcohol_and_disease['HeartDisease'].count()

In [None]:
# Row count for every combination of sex, age bracket, alcohol use,
# physical activity and heart-disease status.
lifestyle_group_keys = ['Sex', 'AgeCategory', 'AlcoholDrinking', 'PhysicalActivity', 'HeartDisease']
df.groupby(lifestyle_group_keys)['HeartDisease'].count()

In [None]:
# Heart-disease frequency for each AlcoholDrinking value, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=df['AlcoholDrinking'], data=df, hue='HeartDisease', palette='winter', ax=ax)
ax.set_title('people drinking the AlcoholDrinking')
ax.set_xlabel('AlcoholDrinking')
# NOTE(review): assumes the first hue level is the "no disease" class — confirm.
ax.legend(["Haven't Disease", "Have Disease"])
plt.show()

In [None]:
# Number of respondents per KidneyDisease value.
kidney_counts = df['KidneyDisease'].value_counts()
kidney_counts

In [None]:
# Row count for every (KidneyDisease, HeartDisease) combination.
by_kidney_and_disease = df.groupby(['KidneyDisease', 'HeartDisease'])
by_kidney_and_disease['HeartDisease'].count()

In [None]:
# Heart-disease frequency for each KidneyDisease value, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='KidneyDisease', data=df, hue='HeartDisease', palette=['blue', 'green'], ax=ax)
ax.set_title('people have KidneyDisease')
# NOTE(review): assumes the first hue level is the "no disease" class — confirm.
ax.legend(["Haven't Disease", "Have Disease"])
plt.show()

In [None]:
# Row count for every (Sex, KidneyDisease) combination.
by_sex_and_kidney = df.groupby(['Sex', 'KidneyDisease'])
by_sex_and_kidney['KidneyDisease'].count()

In [None]:
# Number of respondents per SkinCancer value.
skin_cancer_counts = df['SkinCancer'].value_counts()
skin_cancer_counts

In [None]:
# Row count for every (Sex, SkinCancer) combination.
by_sex_and_skin_cancer = df.groupby(['Sex', 'SkinCancer'])
by_sex_and_skin_cancer['SkinCancer'].count()

In [None]:
# Skin-cancer frequency within each sex, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Sex', data=df, hue='SkinCancer', palette='winter', ax=ax)
ax.set_title('people have SkinCancer')
plt.show()

In [None]:
# Row count for every (Sex, SkinCancer, HeartDisease) combination.
skin_cancer_group_keys = ['Sex', 'SkinCancer', 'HeartDisease']
df.groupby(skin_cancer_group_keys)['HeartDisease'].count()

In [None]:
# Heart-disease frequency for each SkinCancer value; countplot returns the Axes
# it drew on, which is then used for the title and legend.
ax = sns.countplot(x='SkinCancer', data=df, hue='HeartDisease')
ax.set_title('people have SkinCancer and HeartDisease')
# NOTE(review): assumes the first hue level is the "no disease" class — confirm.
ax.legend(["Haven't Disease", "Have Disease"])
plt.show()

# Check the Distribution and skewness of the features

In [None]:
# Pairwise relationships between the numeric columns, coloured by HeartDisease.
# pairplot adds its own legend for the hue variable, so no extra legend call is
# needed. (Passing the bare string 'HeartDisease' to plt.legend is read as a
# sequence of labels — one per character — and produces a meaningless legend.)
sns.pairplot(data=df, hue='HeartDisease')
plt.show()

In [None]:
# Label-encode only the non-numeric columns so the models receive integer codes.
# Numeric columns are left as they are: running LabelEncoder over them replaces
# each measurement with the rank of its distinct value instead of the value itself.
# One fitted encoder is kept per column so codes can be mapped back later with
# encoders[col].inverse_transform(...).
encoders = {}
for col in df.select_dtypes(exclude='number').columns:
    label = LabelEncoder()
    df[col] = label.fit_transform(df[col])
    encoders[col] = label

# Show a sample of the encoded frame rather than the whole thing.
df.head()

In [None]:
# Feature matrix: every column except the target.
x = df.drop(columns='HeartDisease')
# Target as a 1-D Series. A one-column DataFrame (df[['HeartDisease']]) makes several
# scikit-learn estimators warn that a column vector was passed where a 1-D array
# was expected.
y = df['HeartDisease']

In [None]:
# First 75/25 hold-out split of the encoded data (seeded).
# NOTE(review): xtrain/xtest/ytrain/ytest are reassigned by a later split before any
# model is fitted on them, so this split is never actually used.
initial_split = train_test_split(x, y, train_size=.75, random_state=42)
xtrain, xtest, ytrain, ytest = initial_split

# Over Sampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Seeded random over-sampler, applied to the complete feature matrix and target.
over = RandomOverSampler(random_state=42)

# fit_resample returns the (features, target) pair after resampling.
# NOTE(review): because the whole dataset is resampled here, any later train/test
# split of x_new/y_new can place copies of the same original row on both sides.
resampled = over.fit_resample(x, y)
x_new, y_new = resampled

# Hold Out Cross Validation

In [None]:
# Hold out 25% of the ORIGINAL rows as the test set, keeping the class ratio the
# same on both sides. Splitting before oversampling matters: if the oversampled
# data is split instead, copies of one minority row can end up in both the training
# and the test set, and the test accuracy is inflated.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, train_size=0.75, random_state=42, stratify=y
)

# Balance the classes in the training portion only; the test rows stay untouched.
xtrain, ytrain = RandomOverSampler(random_state=42).fit_resample(xtrain, ytrain)

# Decision tree baseline, seeded so repeated runs give the same tree.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(xtrain, ytrain)

# Accuracy on the untouched hold-out rows.
test_score = dt.score(xtest, ytest)
print("Accuracy on test data:", test_score)

# Accuracy on the (oversampled) training rows, to gauge over-fitting.
train_score = dt.score(xtrain, ytrain)
print("Accuracy on training data:", train_score)


# RandomForestClassifier

In [None]:
# Random forest on the current training split. RandomForestClassifier is already
# imported in the first cell, so it is not re-imported here.
# Seeded: an unseeded forest gives different scores on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(xtrain, ytrain)

# Accuracy on the hold-out rows.
test_score = rf.score(xtest, ytest)
print("Accuracy on test data:", test_score)

# Accuracy on the training rows, to gauge over-fitting.
train_score = rf.score(xtrain, ytrain)
print("Accuracy on training data:", train_score)

In [None]:
# Gradient-boosted trees (XGBoost) on the current training split.
# NOTE: the name `xgb` rebinds the `xgboost` module alias imported in the first
# cell. It is kept because the cross-validation cell further down passes `xgb`
# as its estimator.
xgb = XGBClassifier()
xgb.fit(xtrain, ytrain)

# Accuracy on the training rows.
train_score = xgb.score(xtrain, ytrain)
print("Training accuracy:", train_score)

# Accuracy on the hold-out rows. The value printed under this label must be
# test_score, not train_score.
test_score = xgb.score(xtest, ytest)
print("Testing accuracy:", test_score)

# Without sampling

In [None]:
# Baseline without oversampling: 75/25 seeded split of the data as-is.
unsampled_split = train_test_split(x, y, train_size=0.75, random_state=42)
xtrain1, xtest1, ytrain1, ytest1 = unsampled_split

# Re-fit the existing decision tree `dt` on the unsampled training rows
# (this replaces whatever `dt` had learned before).
dt.fit(xtrain1, ytrain1)

# Accuracy on the unsampled, imbalanced hold-out rows.
test_score = dt.score(xtest1, ytest1)
print("Accuracy on testing data:", test_score)

# K Fold Cross Validation

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from imblearn.pipeline import make_pipeline

# 5-fold CV with shuffling; seeded so the fold assignment (and therefore the
# scores) are the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Oversample inside each training fold only. Cross-validating on data that was
# oversampled up front puts copies of the same minority row in both the training
# and the validation fold, which inflates every score. The imblearn pipeline
# applies the sampler during fit and skips it when scoring.
cv_model = make_pipeline(RandomOverSampler(random_state=42), xgb)
result = cross_val_score(cv_model, x, y, cv=kfold)
print('result:', result)

# Summary statistics over the five fold scores.
mean_score = result.mean()
print('mean score:', mean_score)

max_score = result.max()
print('max score:', max_score)

min_score = result.min()
print('min score:', min_score)

In [None]:
# A second decision tree on the current training split (seeded for repeatable results).
Clf = DecisionTreeClassifier(random_state=42)
Clf.fit(xtrain, ytrain)

# Accuracy on the hold-out rows.
test_score = Clf.score(xtest, ytest)
print("Accuracy on testing data:", test_score)

# x/y are the complete pre-split dataset, which includes rows the tree was trained
# on — so this is not a test score and must not be labelled as one.
score = Clf.score(x, y)
print("Accuracy on full original data:", score)

# Post-Pruning

In [None]:
from sklearn import tree

In [None]:
# Unpruned reference tree for the cost-complexity pruning study.
# Seeded with the same random_state as the pruned trees fitted later, so the
# pruning path computed from this tree matches the trees it is compared against.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(xtrain, ytrain)

In [None]:
# Draw only the top levels of the fitted tree: `clf` has no depth limit, and
# rendering every node of an unrestricted tree can be very slow and illegible.
plt.figure(figsize=(15, 10))
tree.plot_tree(
    clf,
    filled=True,
    # Column names of the data the tree was fitted on, as a plain list
    # (some scikit-learn releases reject a pandas Index here).
    feature_names=list(xtrain.columns),
    max_depth=3,
)
plt.show()

In [None]:
# Cost-complexity pruning path of the decision tree `clf` on the training split.
pruning_path = clf.cost_complexity_pruning_path(X=xtrain, y=ytrain)
# The returned Bunch also supports key access; take the candidate alpha values.
ccp_alpha_values = pruning_path["ccp_alphas"]

In [None]:
# Inspect the candidate ccp_alpha values produced by the pruning path.
ccp_alpha_values

In [None]:
# Fit one seeded decision tree per candidate ccp_alpha and collect them in order.
clfs_list = []

# The loop variable is `alpha`, not `x`: `x` is the feature matrix, and reusing the
# name here would silently replace it with a float for any cell run afterwards.
# Each tree gets its own name so the unpruned `clf` is not overwritten either.
for alpha in ccp_alpha_values:
    pruned_clf = DecisionTreeClassifier(random_state=42, ccp_alpha=alpha)
    pruned_clf.fit(xtrain, ytrain)
    clfs_list.append(pruned_clf)
    


In [None]:
# Inspect the fitted trees, one per candidate ccp_alpha.
clfs_list

# Compare

In [None]:
# Accuracy of every pruned tree on the training rows and on the hold-out rows,
# collected in the same order as clfs_list.
train_scores, test_scores = [], []
for fitted_tree in clfs_list:
    train_scores.append(fitted_tree.score(xtrain, ytrain))
    test_scores.append(fitted_tree.score(xtest, ytest))

In [None]:
# Training accuracy for each pruned tree, in ccp_alpha order.
train_scores

In [None]:
# Hold-out accuracy for each pruned tree, in ccp_alpha order.
test_scores

In [None]:
# Accuracy as a function of ccp_alpha, one step curve per data split.
fig, ax = plt.subplots()
ax.set(xlabel="Alpha", ylabel="Accuracy")

accuracy_curves = (
    ("training accuracy", train_scores),
    ("testing accuracy", test_scores),
)
for curve_label, curve_scores in accuracy_curves:
    ax.plot(ccp_alpha_values, curve_scores, marker='o', label=curve_label, drawstyle="steps-post")
ax.legend()

In [None]:
# clf2 = DecisionTreeClassifier(random_state=42, ccp_alpha=0.070)

In [None]:
#clf2.fit(xtrain,ytrain)

In [None]:
#pred=clf.predict(xtest)

In [None]:
#accuracy_score(ytest, pred)