In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
data = pd.read_csv("Dataset.csv")

# Check the shape of the dataset
print("Dataset Shape Before Dropping 'id':", data.shape)

# Drop the 'id' column if it exists
if 'id' in data.columns:
    data.drop('id', axis=1, inplace=True)
    print("'id' column removed.")


Dataset Shape Before Dropping 'id': (3390, 17)
'id' column removed.


In [3]:
# Check the shape of the dataset after dropping the column
print("Dataset Shape After Dropping 'id':", data.shape)

# Check the dataset info
data.info()

# Count unique values in each column
print("Unique Values in Each Column:")
print(data.nunique())

# Check for missing values
print("Missing Values:")
print(data.isnull().sum())

# Percentage of missing values
missing_percentage = (data.isnull().sum() / len(data)) * 100
print("Percentage of Missing Values:")
print(missing_percentage)


Dataset Shape After Dropping 'id': (3390, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3390 entries, 0 to 3389
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              3390 non-null   int64  
 1   education        3303 non-null   float64
 2   sex              3390 non-null   object 
 3   is_smoking       3390 non-null   object 
 4   cigsPerDay       3368 non-null   float64
 5   BPMeds           3346 non-null   float64
 6   prevalentStroke  3390 non-null   int64  
 7   prevalentHyp     3390 non-null   int64  
 8   diabetes         3390 non-null   int64  
 9   totChol          3352 non-null   float64
 10  sysBP            3390 non-null   float64
 11  diaBP            3390 non-null   float64
 12  BMI              3376 non-null   float64
 13  heartRate        3389 non-null   float64
 14  glucose          3086 non-null   float64
 15  TenYearCHD       3390 non-null   int64  
dtypes: float64(9),

In [4]:
# Fill missing values
data['BMI'].fillna(data['BMI'].mean(), inplace=True)
data['education'].fillna(data['education'].mode()[0], inplace=True)
data['glucose'].fillna(data['glucose'].mode()[0], inplace=True)
data['totChol'].fillna(data['totChol'].mode()[0], inplace=True)
data['BPMeds'].fillna(data['BPMeds'].mode()[0], inplace=True)
data['cigsPerDay'].fillna(data['cigsPerDay'].mode()[0], inplace=True)
data['heartRate'].fillna(data['heartRate'].mode()[0], inplace=True)

# Check for missing values again
print("Missing Values After Imputation:")
print(data.isnull().sum())

# Statistical summary of the data
print("Data Description:")
print(data.describe(include='all').T)

Missing Values After Imputation:
age                0
education          0
sex                0
is_smoking         0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64
Data Description:
                  count unique  top  freq       mean        std    min    25%  \
age              3390.0    NaN  NaN   NaN  49.542183   8.592878   32.0   42.0   
education        3390.0    NaN  NaN   NaN   1.946018   1.017568    1.0    1.0   
sex                3390      2    F  1923        NaN        NaN    NaN    NaN   
is_smoking         3390      2   NO  1703        NaN        NaN    NaN    NaN   
cigsPerDay       3390.0    NaN  NaN   NaN   9.010619  11.862839    0.0    0.0   
BPMeds           3390.0    NaN  NaN   NaN   0.029499   0.169224    0.0    0.0   
prevalentStroke  3390.0    NaN  N

In [5]:
# Encode categorical variables
from sklearn.preprocessing import LabelEncoder
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Impute missing values using the most frequent strategy (redundant but safe for categorical columns)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

In [6]:
# Split features (X) and target (y)
X = data.drop('TenYearCHD', axis=1)  # Replace 'TenYearCHD' with your actual target column
y = data['TenYearCHD']

# Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE for class balancing
from imblearn.over_sampling import SMOTE
from collections import Counter

print("Before SMOTE:", Counter(y_train))
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_train_resampled))

Before SMOTE: Counter({0.0: 2303, 1.0: 409})
After SMOTE: Counter({0.0: 2303, 1.0: 2303})


In [7]:
# GridSearchCV for Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

params_dt = {"criterion": ["gini", "entropy"], "max_depth": [None, 10, 20]}
dt = DecisionTreeClassifier(random_state=42)
grid_dt = GridSearchCV(dt, params_dt, cv=5, scoring="accuracy", n_jobs=-1)
grid_dt.fit(X_train_resampled, y_train_resampled)

# Print best parameters and score
print("Best Parameters (Decision Tree):", grid_dt.best_params_)
print("Best Score (Decision Tree):", grid_dt.best_score_)

Best Parameters (Decision Tree): {'criterion': 'gini', 'max_depth': None}
Best Score (Decision Tree): 0.8261330582385927


In [8]:
import pickle

# Save the trained model
with open('decision_tree_model.pkl', 'wb') as file:
    pickle.dump(grid_dt.best_estimator_, file)
print("Model saved successfully!")


Model saved successfully!


In [9]:
import pickle

# Load the model
with open('decision_tree_model.pkl', 'rb') as file:
    model = pickle.load(file)
print("Model loaded successfully!")


Model loaded successfully!


In [10]:
# Ensure your input data has the same columns as the training data, in the correct order
input_df = pd.DataFrame([{
    'age': 45,
    'education': 2,  # example encoding for 'education' if categorical
    'sex': 1,  # example encoding for 'sex' if it was categorical
    'is_smoking': 1,
    'cigsPerDay': 15,
    'BPMeds': 1,
    'prevalentStroke': 0,
    'prevalentHyp': 0,
    'diabetes': 0,
    'totChol': 200,
    'sysBP': 130,  # Ensure 'sysBP' is included
    'diaBP': 80,
    'BMI': 29.5,
    'heartRate': 72,
    'glucose': 90,
}], columns=['age', 'education', 'sex', 'is_smoking', 'cigsPerDay', 'BPMeds',
             'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
             'diaBP', 'BMI', 'heartRate', 'glucose'])  # Column order matches training order

# Make predictions using the trained model
predictions = model.predict(input_df)

# Display the prediction
print("Prediction:", predictions)


Prediction: [0.]


In [11]:
# Get the column names from your training data
feature_columns = X_train.columns
print("Feature columns used during training:", feature_columns)


Feature columns used during training: Index(['age', 'education', 'sex', 'is_smoking', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose'],
      dtype='object')
