<a href="https://colab.research.google.com/github/Mahalakshmi-2-9/TNSDC_Generative-AI/blob/main/Heart_Disease_Regression_and_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'personal-key-indicators-of-heart-disease:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1936563%2F6674905%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240330%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240330T063445Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D640c54a424962f0c786af74f21b8ea37614f0bbb1e8750230f43bd813fa37109c84f01d9ab5ba4f71e74a8e4c7166dfb288ba8f210e57daed81729414b7a2bba930718b22fda2b3f0771c28784622a7613508512b771bb0af40d74dbe3d78a3c3ea2b3c59d5e55e6015c9c2c0a95a120826884c934309a698907214a975dd6de48c3c1df59646cbef0e6137dc0e31aa9fb9079b55cbb0846df31acd4c03c676b3c6406ba99c9586a988cb43413befd1dc207ada16f69e61561684b9166e574324d911554e705f865a2d1a43180798215cafc1053d3f51da37ec4ca1e9c0a4676d2691c64f4baca3d8ffcc21ecc02eeadba40c3ac0bd328f5c45565f719379f26'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded download URL>" entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a URL that still contains a literal ':' stays intact.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # Content-Length is optional; without it the progress bar simply stays empty
            # instead of the download failing on int(None).
            expected_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch re-opens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # A distinct name keeps the `tarfile` module from being rebound by the `with` target.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        # The download URLs are signed and time-limited, so an HTTP error usually means expiry.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# Regression and Analysis of the Heart Disease Data

This has 3 parts
1. Build a logistic model to predict the heart disease based on the dataset provided.
2. Analyse the correlation between Heart disease, kidney disease, diabetes and skin cancer. See if there is a relation between them.
3. Understand the influence of BMI on the diseases.

## 1. Build a logistic model to predict the heart disease based on the dataset provided.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-disease CSV from the Kaggle input directory and preview its first 5 rows
heart = pd.read_csv("/kaggle/input/personal-key-indicators-of-heart-disease/heart_2020_cleaned.csv")
heart.head()

In [None]:
# Check for null values and report the dataset dimensions.
# A notebook cell only displays its last expression, so the null check is printed explicitly.
print(heart.isnull().any())
heart.shape

In [None]:
# Count how often each answer occurs in the Diabetic column
heart.Diabetic.value_counts()

In [None]:
# Fold the two intermediate answers into 'No' so that Diabetic becomes a strict Yes/No column.
intermediate_answers = ['No, borderline diabetes', 'Yes (during pregnancy)']
heart['Diabetic'] = heart['Diabetic'].mask(heart['Diabetic'].isin(intermediate_answers), 'No')
heart.Diabetic.value_counts()

In [None]:
# Check for null values again now that the Diabetic column has been recoded
heart.isnull().any()

### Data analysis of categorical features

In [None]:
# Several categorical columns hold only Yes/No answers; encode them as 1/0.
column_yesno = [
    "HeartDisease", "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking",
    "Diabetic", "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer",
]

yes_no_codes = {'Yes': 1, 'No': 0}
for flag_column in column_yesno:
    heart[flag_column] = heart[flag_column].map(yes_no_codes)
heart.head()

In [None]:
# Check the data
heart.info()
# Keep an independent snapshot of the numeric and 0/1 columns for the disease and BMI
# analysis in parts 2 and 3. A plain `heart_orig = heart` would only be a second name for
# the same object, so its content would depend on what later cells do to `heart` in place.
heart_orig = heart.select_dtypes(include='number').copy()

In [None]:
# We can find 4 more categorical variables which are not yes/no. Their dtype is object. Convert them into dummy variables.
categoricals = heart.select_dtypes(include=['object'])
categoricals.head()
# drop_first=True omits one level per variable; the omitted level acts as the reference category
cat_dummies = pd.get_dummies(categoricals, drop_first=True)
cat_dummies.head()
# Drop the original object columns now that they are encoded. NOTE: the drop is in place, so any
# other variable that still refers to this same DataFrame object loses these columns as well.
heart.drop(list(categoricals.columns), axis=1, inplace=True)
# Concatenate the remaining columns with the dummies; this binds `heart` to a new DataFrame.
heart = pd.concat([heart, cat_dummies], axis=1)
heart.head()

In [None]:
# Here we are interested in the chance of having heart disease based on the other variables.
# Let y be the HeartDisease column; pop() also removes it from `heart`.
y = heart.pop('HeartDisease')
# X is the same DataFrame object as `heart` (no copy is made), now without the target column.
X = heart
X.head()

### Split the data into train and test

In [None]:
# Split the data into train (70%) and test (30%); random_state is fixed so the split is repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)
X_train.head()

In [None]:
# Number of feature columns in the training set
X_train.shape[1]

In [None]:
# Standardise the features. The scaler is fitted on the training split only and then applied
# to the test split, so both are expressed on the same scale and no test-set statistics are
# used. (Scaling must happen after the split for exactly that reason.)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Keep the column names and the original row index so rows stay matched with y_train / y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
?RFE

In [None]:
# Let us use RFE (recursive feature elimination) to narrow down the feature set before
# checking significance and multicollinearity.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Run RFE keeping the 30 top-ranked features. RFE fits its own clone of the estimator,
# so `lm` does not need to be fitted on the full feature set beforehand.
lm = LinearRegression()
rfe = RFE(lm, n_features_to_select=30)             # running RFE
rfe = rfe.fit(X_train, y_train)

In [None]:
# List every feature with its RFE selection flag (support_) and its rank (ranking_)
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Names of the features RFE kept (support_ is used as a boolean mask over the columns)
rfe_columns = X_train.columns[rfe.support_]
rfe_columns

In [None]:
# Restrict the training features to the columns RFE selected
X_train_rfe = X_train.loc[:, rfe_columns]
X_train_rfe.head()

In [None]:
# Heat map of the pairwise correlations between the RFE-selected features
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize = (20,10))
sns.heatmap(X_train_rfe.corr(),annot = True)
plt.show()

In [None]:
# Build a logistic regression (GLM with Binomial family) using statsmodels and show its summary.
# y_train is passed as a plain list and a constant (intercept) column is added to the features.

import statsmodels.api as sm
logm1 = sm.GLM(list(y_train),(sm.add_constant(X_train_rfe)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
# Remove AgeCategory_35-39 before refitting
# NOTE(review): presumably chosen from the summary above (not significant) - confirm against that output
X_train_rfe = X_train_rfe.drop(['AgeCategory_35-39'], axis=1)
X_train_rfe.columns

In [None]:
# Refit the logistic regression (GLM with Binomial family) on the reduced feature set

import statsmodels.api as sm
logm1 = sm.GLM(list(y_train),(sm.add_constant(X_train_rfe)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
# Remove Race_White as well, then refit the logistic regression (GLM with Binomial family)
# NOTE(review): presumably chosen from the previous summary - confirm against that output
X_train_rfe = X_train_rfe.drop(['Race_White'], axis=1)
X_train_rfe.columns

import statsmodels.api as sm
logm1 = sm.GLM(list(y_train),(sm.add_constant(X_train_rfe)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
# Variance inflation factor of every remaining feature, largest first.

from statsmodels.stats.outliers_influence import variance_inflation_factor

feature_matrix = X_train_rfe.values
vif = pd.DataFrame({
    'Features': X_train_rfe.columns,
    'VIF': [variance_inflation_factor(feature_matrix, col_pos) for col_pos in range(X_train_rfe.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

No VIF seems to be above 5

### Residual analysis of training data

In [None]:
lm.fit(X_train_rfe, y_train)
y_train_pred = lm.predict(X_train_rfe)
y_train_pred

In [None]:

y_train_pred_final = pd.DataFrame({'Actual':y_train, 'HeartDisease_Prob':y_train_pred})
y_train_pred_final.head()

In [None]:
# Let's create columns with different probability cutoffs
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    y_train_pred_final[i]= y_train_pred_final.HeartDisease_Prob.map(lambda x: 1 if x > i else 0)
y_train_pred_final.head()

In [None]:
# Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
from sklearn.metrics import confusion_matrix

# TP = confusion[1,1] # true positive
# TN = confusion[0,0] # true negatives
# FP = confusion[0,1] # false positives
# FN = confusion[1,0] # false negatives
from sklearn import metrics

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Actual, y_train_pred_final[i] )
    total1=sum(sum(cm1))
    accuracy = (cm1[0,0]+cm1[1,1])/total1

    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df)

In [None]:
# 0.1 seems to be a good cut off in this case.
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

### Test our model and predict y_test

In [None]:
cutoff = 0.2
y_train_pred_final['final_pred'] = y_train_pred_final.HeartDisease_Prob.apply(lambda x: 1 if x>cutoff else 0)
y_train_pred_final.head()

## Lets make prediction on test data now

In [None]:
# Adding a constant variable
X_test_new = X_test[rfe_columns]
X_test_new.drop(['AgeCategory_35-39','Race_White'], axis=1, inplace=True)
X_test_new.head()

In [None]:
# Make predictions
y_test_pred = lm.predict(X_test_new)
y_test_pred[:10]

In [None]:
# Convert the test predictions (y_test_pred) to a DataFrame
y_pred_df = pd.DataFrame(y_test_pred)
y_pred_df.head()
# Convert y_test to a DataFrame
y_test_df = pd.DataFrame(y_test)
y_test_df.head()

In [None]:
# Reset both indexes (discarding the old ones) so the two frames line up row by row when placed side by side
y_pred_df.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
# Place the actual values (y_test_df) and the predictions (y_pred_df) side by side
y_pred_final = pd.concat([y_test_df, y_pred_df],axis=1)
y_pred_final.head()

In [None]:
# Rename the prediction column (label 0) to a descriptive name.
# NOTE: the name is spelled 'HeartDidease_Prob' here and where it is read in the next cell;
# both places must change together if the spelling is corrected.
y_pred_final = y_pred_final.rename(columns={0: 'HeartDidease_Prob'})
y_pred_final.head()

In [None]:
# Set the final prediction using the `cutoff` value defined for the training predictions
y_pred_final['final_prediction'] = y_pred_final['HeartDidease_Prob'].apply(lambda x: 1 if x>cutoff else 0)
y_pred_final.head()

In [None]:
# Confusion matrix on the test set: actual HeartDisease labels vs. final predictions
confusion = metrics.confusion_matrix(y_pred_final.HeartDisease, y_pred_final.final_prediction )
confusion

In [None]:
# Unpack the 2x2 matrix row by row: [0,0] true negatives, [0,1] false positives,
# [1,0] false negatives, [1,1] true positives
TN, FP, FN, TP = confusion.ravel()

In [None]:
# Sensitivity: share of actual positives that the model flags
actual_positives = TP + FN
TP / float(actual_positives)

In [None]:
# Specificity: share of actual negatives that the model clears
actual_negatives = TN + FP
TN / float(actual_negatives)

In [None]:
# Accuracy: share of all test rows classified correctly
correct_predictions = TP + TN
correct_predictions / (TP + TN + FP + FN)

## Conclusion

### The model seems to have decent sensitivity and specificity. Hence this model seems to be reliable.

* Sensitivity is 50%
* Specificity is 90%
* Accuracy is 86%

## 2. Analyse the correlation between various diseases

In [None]:
# Correlation heat map over all columns kept in heart_orig
orig_corr = heart_orig.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(orig_corr, annot=True)
plt.show()

In [None]:
# Narrow the data down to the five disease indicator columns
disease_columns = ['HeartDisease', 'Stroke', 'Diabetic', 'KidneyDisease', 'SkinCancer']
diseases = heart_orig[disease_columns]
diseases.head()

In [None]:
# Correlation heat map of the disease indicators only
disease_corr = diseases.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(disease_corr, annot=True)
plt.show()

### Observations
* Heart disease and stroke are correlated
* Diabetes is correlated with heart disease and Kidney disease. Less with Stroke.
* Skin cancer is not correlated with other 4 diseases.

In [None]:
# Let us understand if diabetes is associated with heart disease.
# The columns are coded 0/1, so the mean of a column is the share of rows equal to 1.
diabetics = heart_orig[heart_orig.Diabetic==1]
heart_d = diabetics.HeartDisease.mean()
# The printed value is a proportion (0-1), not a head count, so the label says so.
print("Proportion of people with diabetes who also have heart disease:")
print(heart_d)

heart_patients = heart_orig[heart_orig.HeartDisease==1]
diabs = heart_patients.Diabetic.mean()
print("Proportion of people with heart disease who also have diabetes:")
print(diabs)


About one in three people (33%) with heart disease also have diabetes. This suggests that diabetes could be one of the main risk factors for heart disease, although co-occurrence alone does not prove that one causes the other.

In [None]:
# Let us understand if diabetes is associated with kidney disease.
# The columns are coded 0/1, so the mean of a column is the share of rows equal to 1.
diabeticsk = heart_orig[heart_orig.Diabetic==1]
kidney_d = diabeticsk.KidneyDisease.mean()
# The printed value is a proportion (0-1), not a head count, so the label says so.
print("Proportion of people with diabetes who also have kidney disease:")
print(kidney_d)

kidney_patients = heart_orig[heart_orig.KidneyDisease==1]
diabsk = kidney_patients.Diabetic.mean()
print("Proportion of people with kidney disease who also have diabetes:")
print(diabsk)

The association with diabetes is even stronger for kidney disease: nearly 40% of people with kidney disease also have diabetes. This indicates that diabetes likely plays a role in kidney disease.

## 3. Understand the influence of BMI on the diseases.

In [None]:
# Disease indicator columns together with BMI
bmi_analysis_columns = ['HeartDisease', 'Stroke', 'Diabetic', 'KidneyDisease', 'SkinCancer', 'BMI']
diseases_bmi = heart_orig[bmi_analysis_columns]
diseases_bmi.head(20)

In [None]:
# Correlation heat map of BMI against the disease indicators
bmi_corr = diseases_bmi.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(bmi_corr, annot=True)
plt.show()

The correlation between BMI and diabetes hints that a higher BMI may contribute to diabetes. Diabetes, in turn, is associated with heart disease and kidney disease, as we saw in the previous analysis.