In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Import data and process data

In [122]:
# Read the dataset into the dataframe
df = pd.read_csv('/content/diabetic_data.csv')

## **EDA**

In [None]:
df.shape # Check the size of the dataset

In [None]:
df.info() # Check the overall information

In [None]:
df.isnull().sum() # Check the missing values

In [None]:
df.duplicated().sum() # Check the duplicate values

In [None]:
for i in df.select_dtypes(include='object').columns:
  print(df[i].value_counts())
  print('---' * 10)

In [None]:
df.describe().T # Check the numerical data values

Numerical data Distribution

In [None]:
# Histogram to understand the distribution, although i don't get anything 😁
# for i in df.select_dtypes(include='number').columns:
#   sns.histplot(data = df, x = i)
#   plt.show()

# old code made chat make each 3 tables next to each other

import math

numeric_columns = df.select_dtypes(include='number').columns
n_cols = 3  # Change to 2 if you want 2 per row
n_rows = math.ceil(len(numeric_columns) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axes = axes.flatten()  # Flatten in case of multi-row layout

for i, col in enumerate(numeric_columns):
    sns.histplot(data=df, x=col, ax=axes[i])
    axes[i].set_title(col)

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()


Age and Gender Count

In [None]:
# Age and Gender Count
age_count = df["age"].value_counts()
gender_count = df["gender"].value_counts()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
age_count.plot(kind='bar', ax=ax1, color='skyblue')
gender_count.plot(kind='bar', ax=ax2, color='lightcoral')

Race and Weight Count

In [None]:
# Race and Weight Count
race_count = df["race"].value_counts()
weight_count = df["weight"].value_counts()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
race_count.plot(kind='bar', ax=ax1, color='mediumseagreen')
weight_count.plot(kind='bar', ax=ax2, color='goldenrod')

Readmission Rate Count

In [None]:
#Readmission Rate Count
target_count = df['readmitted'].value_counts()
target_count.plot(kind = 'bar', title= 'Readmission Count')

In [None]:
# correlation heatmap to visualize how strongly numerical featuresare related to each other.
corr = df.select_dtypes(include = 'number').corr()
plt.figure(figsize=(12,8))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
df.columns # To divide them into diagnosis_cols and

Diagnosis vs Readmission

In [None]:
diagnosis_cols = ['diag_1', 'diag_2', 'diag_3']

# Diagnosis vs Readmission
for diag in diagnosis_cols:
    plt.figure(figsize=(10, 5))
    top10 = df[diag].value_counts().iloc[:10].index  # Top 10 frequent diagnoses
    sns.countplot(data=df[df[diag].isin(top10)], x=diag, hue='readmitted')
    plt.title(f'{diag} vs Readmission')
    plt.xticks(rotation=45)
    plt.show()

Medications vs Readmission

In [None]:
medication_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'examide', 'citoglipton', 'insulin',
    'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone'
]

# Medications vs Readmission
for med in medication_cols:
    plt.figure(figsize=(8, 5))
    sns.countplot(data=df, x=med, hue='readmitted')
    plt.title(f'{med} Usage vs Readmission')
    plt.xticks(rotation=45)
    plt.show()


# **Data Processing**

In [62]:
missing_info = []

# Check missing values for object columns
for col in df.select_dtypes(include = 'object').columns:
        count_missing = df[col][df[col] == '?'].count()
        percent_missing = (count_missing / df.shape[0] * 100).round(2)
        missing_info.append([col, count_missing, percent_missing])

# Create DataFrame from collected missing info
missing_value = pd.DataFrame(missing_info, columns=["col", "count_missing", "percent_missing"])
missing_value = missing_value.sort_values(by="percent_missing", ascending=False)

missing_value

Unnamed: 0,col,count_missing,percent_missing
3,weight,98569,96.86
5,medical_specialty,49949,49.08
4,payer_code,40256,39.56
0,race,2273,2.23
8,diag_3,1423,1.4
7,diag_2,358,0.35
6,diag_1,21,0.02
1,gender,0,0.0
2,age,0,0.0
9,max_glu_serum,0,0.0


In [63]:
# Drop 3 columns with too many missing '?'
df.drop(['weight', 'payer_code', 'medical_specialty'], axis=1, inplace=True)

# Drop rows based on multiple cleaning conditions:
df = df.drop(df[
    # Drop if all three diagnosis columns are missing ('?')
    ((df['diag_1'] == '?') & (df['diag_2'] == '?') & (df['diag_3'] == '?')) |

    # Drop specific 'admission_type_id' values: 5 (Not Available), 6 (NULL), 8 (Not Mapped)
    (df['admission_type_id'].isin([5, 6, 8])) |

    # Drop 'discharge_disposition_id' values indicating death or irrelevant outcomes
    (df['discharge_disposition_id'].isin([11, 13, 14, 18, 19, 20, 21, 25, 26])) |

    # Drop 'admission_source_id' values that are unavailable, NULL, or not mapped
    (df['admission_source_id'].isin([9, 15, 17, 20, 21]))
].index)


In [64]:
df.shape # Check th dimension now

(84377, 47)

In [65]:
# Drop missing values in gender
df = df.drop(set(df['gender'][df['gender'] == 'Unknown/Invalid'].index))

# Drop missing values in race
df = df.drop(set(df['race'][df['race']=='?'].index))

In [66]:
df.shape # Check th dimension now

(82367, 47)

In [67]:
# Drop irrelevant variables
df = df.drop(["encounter_id","patient_nbr"],axis=1)
df = df.drop(["citoglipton","examide"],axis = 1)

In [68]:
df.shape # Check th dimension now

(82367, 43)

In [69]:
df['readmitted'].value_counts() # Just to check the difference

Unnamed: 0_level_0,count
readmitted,Unnamed: 1_level_1
NO,43322
>30,29592
<30,9453


In [70]:
df.isnull().sum()

Unnamed: 0,0
race,0
gender,0
age,0
admission_type_id,0
discharge_disposition_id,0
admission_source_id,0
time_in_hospital,0
num_lab_procedures,0
num_procedures,0
num_medications,0


In [71]:
# Drop max_glu_serum because it's almost entirely missing
df.drop('max_glu_serum', axis=1, inplace=True)

# Fill A1Cresult missing values with 'None'
df['A1Cresult'] = df['A1Cresult'].fillna('None')

In [72]:
df.shape # Check th dimension now

(82367, 42)

 Decode ID Columns Using Mapping:

In [73]:
# Decoding IDs using mapping file
admission_type_map = {
    1: 'Emergency',
    2: 'Urgent',
    3: 'Elective',
    4: 'Newborn',
    7: 'Trauma Center',
}

discharge_disposition_map = {
    1: 'Discharged to home',
    2: 'Discharged/transferred to another short term hospital',
    3: 'Discharged/transferred to SNF',
    4: 'Discharged/transferred to ICF',
    5: 'Discharged/transferred to another type of inpatient care institution',
    6: 'Discharged/transferred to home with home health service',
    7: 'Left AMA',
    8: 'Discharged/transferred to home under care of Home IV provider',
    9: 'Admitted as an inpatient to this hospital',
    10: 'Neonate discharged to another hospital for neonatal aftercare',
    12: 'Still patient or expected to return for outpatient services',
    15: 'Discharged/transferred within this institution to Medicare approved swing bed',
    16: 'Discharged/transferred/referred another institution for outpatient services',
    17: 'Discharged/transferred to a psychiatric hospital of psychiatric distinct part unit of a hospital',
    22: 'Discharged/transferred to another rehab facility including rehab units of a hospital',
    23: 'Discharged/transferred to a long term care hospital',
    24: 'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare',
}

admission_source_map = {
    1: 'Physician Referral',
    2: 'Clinic Referral',
    3: 'HMO Referral',
    4: 'Transfer from a hospital',
    5: 'Transfer from a Skilled Nursing Facility (SNF)',
    6: 'Transfer from another health care facility',
    7: 'Emergency Room',
    8: 'Court/Law Enforcement',
    10: 'Transfer from critical access hospital',
    11: 'Normal Delivery',
    12: 'Premature Delivery',
    13: 'Sick Baby',
    14: 'Extramural Birth',
}

df['admission_type_id'] = df['admission_type_id'].replace(admission_type_map)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(discharge_disposition_map)
df['admission_source_id'] = df['admission_source_id'].replace(admission_source_map)

In [74]:
# Adding some important features
df['total_visits'] = df['number_outpatient'] + df['number_emergency'] + df['number_inpatient']
df['missing_diag'] = ((df['diag_1'] == '?') | (df['diag_2'] == '?') | (df['diag_3'] == '?')).astype(int)

# List of medication columns
# Create a new variable to calculate the change of medication dose
meds_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide',
    'metformin-pioglitazone', 'metformin-rosiglitazone', 'glimepiride-pioglitazone',
    'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide'
]

# Create new binary columns: 1 if med was changed (up or down), 0 if no change or steady (Some encoding)
for col in meds_cols:
    new_col = f"{col}_new"
    df[new_col] = df[col].apply(lambda x: 0 if x in ['No', 'Steady'] else 1)

# Sum up all the med changes to create a 'med_change' feature
df['med_change'] = df[[f"{col}_new" for col in meds_cols]].sum(axis=1)

# Drop the temporary binary columns
df.drop(columns=[f"{col}_new" for col in meds_cols], inplace=True)

In [75]:
df['med_change'].value_counts()

Unnamed: 0_level_0,count
med_change,Unnamed: 1_level_1
0,59636
1,21531
2,1098
3,97
4,5


In [76]:
# Recode medication columns: 'No' → 0, all others ('Steady', 'Up', 'Down') → 1
for col in meds_cols:
    df[col] = df[col].replace({'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1})

# Create 'num_med' feature: sum across the medication columns
df['num_med'] = df[meds_cols].sum(axis=1)

  df[col] = df[col].replace({'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1})


In [77]:
df['num_med'].value_counts()

Unnamed: 0_level_0,count
num_med,Unnamed: 1_level_1
1,38061
0,18295
2,18152
3,6627
4,1175
5,55
6,2


In [78]:
# calculate the outcome variable readmission
df['readmitted'] = df['readmitted'].apply(lambda x: 1 if x == '<30' else 0)

In [79]:
# Encode Age
df['age_num'] = df['age'].str.extract('(\d+)', expand=False).astype(int) + 5

In [80]:
df['age_num'].value_counts()

Unnamed: 0_level_0,count
age_num,Unnamed: 1_level_1
75,20557
65,18467
55,14259
85,13655
45,7907
35,3163
95,2158
25,1429
15,627
5,145


In [81]:
df.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'total_visits', 'missing_diag', 'med_change', 'num_med', 'age_num'],
      dtype='object')

In [82]:
df['high_A1C_flag'] = df['A1Cresult'].isin(['>7', '>8']).astype(int)

In [83]:
from sklearn.preprocessing import LabelEncoder

# Label Encode 'race', 'A1Cresult', 'max_glu_serum'
label_cols = ['race', 'A1Cresult']

# Initialize LabelEncoder
le = LabelEncoder()

for col in label_cols:
    df[col] = le.fit_transform(df[col])

In [84]:
df['diag_1'].value_counts()

Unnamed: 0_level_0,count
diag_1,Unnamed: 1_level_1
428,5542
414,5392
786,3185
410,2773
486,2763
...,...
976,1
363,1
842,1
V25,1


In [85]:
# Safe numeric conversion first
df['diag_1_num'] = pd.to_numeric(df['diag_1'], errors='coerce')
df['diag_2_num'] = pd.to_numeric(df['diag_2'], errors='coerce')
df['diag_3_num'] = pd.to_numeric(df['diag_3'], errors='coerce')

# Initialize new columns
df['diag_1_code'] = 0
df['diag_2_code'] = 0
df['diag_3_code'] = 0

# Mapping for diag_1
df.loc[df['diag_1'].astype(str).str.startswith('V'), 'diag_1_code'] = 1
df.loc[df['diag_1'].astype(str).str.startswith('E'), 'diag_1_code'] = 2
df.loc[((df['diag_1_num'] >= 390) & (df['diag_1_num'] < 460)) | (df['diag_1_num'] == 785), 'diag_1_code'] = 3
df.loc[((df['diag_1_num'] >= 460) & (df['diag_1_num'] < 520)) | (df['diag_1_num'] == 786), 'diag_1_code'] = 4
df.loc[((df['diag_1_num'] >= 520) & (df['diag_1_num'] < 580)) | (df['diag_1_num'] == 787), 'diag_1_code'] = 5
df.loc[(df['diag_1_num'] >= 250) & (df['diag_1_num'] < 251), 'diag_1_code'] = 6
df.loc[(df['diag_1_num'] >= 800) & (df['diag_1_num'] < 1000), 'diag_1_code'] = 7
df.loc[(df['diag_1_num'] >= 710) & (df['diag_1_num'] < 740), 'diag_1_code'] = 8
df.loc[((df['diag_1_num'] >= 580) & (df['diag_1_num'] < 630)) | (df['diag_1_num'] == 788), 'diag_1_code'] = 9
df.loc[(df['diag_1_num'] >= 140) & (df['diag_1_num'] < 240), 'diag_1_code'] = 10

# Mapping for diag_2
df.loc[df['diag_2'].astype(str).str.startswith('V'), 'diag_2_code'] = 1
df.loc[df['diag_2'].astype(str).str.startswith('E'), 'diag_2_code'] = 2
df.loc[((df['diag_2_num'] >= 390) & (df['diag_2_num'] < 460)) | (df['diag_2_num'] == 785), 'diag_2_code'] = 3
df.loc[((df['diag_2_num'] >= 460) & (df['diag_2_num'] < 520)) | (df['diag_2_num'] == 786), 'diag_2_code'] = 4
df.loc[((df['diag_2_num'] >= 520) & (df['diag_2_num'] < 580)) | (df['diag_2_num'] == 787), 'diag_2_code'] = 5
df.loc[(df['diag_2_num'] >= 250) & (df['diag_2_num'] < 251), 'diag_2_code'] = 6
df.loc[(df['diag_2_num'] >= 800) & (df['diag_2_num'] < 1000), 'diag_2_code'] = 7
df.loc[(df['diag_2_num'] >= 710) & (df['diag_2_num'] < 740), 'diag_2_code'] = 8
df.loc[((df['diag_2_num'] >= 580) & (df['diag_2_num'] < 630)) | (df['diag_2_num'] == 788), 'diag_2_code'] = 9
df.loc[(df['diag_2_num'] >= 140) & (df['diag_2_num'] < 240), 'diag_2_code'] = 10

# Mapping for diag_3
df.loc[df['diag_3'].astype(str).str.startswith('V'), 'diag_3_code'] = 1
df.loc[df['diag_3'].astype(str).str.startswith('E'), 'diag_3_code'] = 2
df.loc[((df['diag_3_num'] >= 390) & (df['diag_3_num'] < 460)) | (df['diag_3_num'] == 785), 'diag_3_code'] = 3
df.loc[((df['diag_3_num'] >= 460) & (df['diag_3_num'] < 520)) | (df['diag_3_num'] == 786), 'diag_3_code'] = 4
df.loc[((df['diag_3_num'] >= 520) & (df['diag_3_num'] < 580)) | (df['diag_3_num'] == 787), 'diag_3_code'] = 5
df.loc[(df['diag_3_num'] >= 250) & (df['diag_3_num'] < 251), 'diag_3_code'] = 6
df.loc[(df['diag_3_num'] >= 800) & (df['diag_3_num'] < 1000), 'diag_3_code'] = 7
df.loc[(df['diag_3_num'] >= 710) & (df['diag_3_num'] < 740), 'diag_3_code'] = 8
df.loc[((df['diag_3_num'] >= 580) & (df['diag_3_num'] < 630)) | (df['diag_3_num'] == 788), 'diag_3_code'] = 9
df.loc[(df['diag_3_num'] >= 140) & (df['diag_3_num'] < 240), 'diag_3_code'] = 10

# Optional: Drop intermediate numeric columns if you want
df.drop(['diag_1_num', 'diag_2_num', 'diag_3_num'], axis=1, inplace=True)

In [86]:
df['diag_1_code'].value_counts()

Unnamed: 0_level_0,count
diag_1_code,Unnamed: 1_level_1
3,24550
0,13485
4,11403
5,7729
6,7307
7,5777
9,4263
8,3965
10,2613
1,1274


In [87]:
df.shape

(82367, 51)

In [88]:
df.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'total_visits', 'missing_diag', 'med_change', 'num_med', 'age_num',
       'high_A1C_flag', 'diag_1_code', 'diag_2_code', 'diag_3_code'],
      dtype='object')

In [89]:
num_col = list(set(list(df._get_numeric_data().columns))- {'readmitted'})
num_col

['metformin-rosiglitazone',
 'acarbose',
 'diag_1_code',
 'race',
 'pioglitazone',
 'glipizide',
 'repaglinide',
 'num_procedures',
 'number_emergency',
 'num_med',
 'missing_diag',
 'miglitol',
 'metformin-pioglitazone',
 'glipizide-metformin',
 'insulin',
 'glyburide-metformin',
 'time_in_hospital',
 'A1Cresult',
 'troglitazone',
 'metformin',
 'chlorpropamide',
 'num_lab_procedures',
 'nateglinide',
 'med_change',
 'tolbutamide',
 'glyburide',
 'diag_3_code',
 'number_diagnoses',
 'glimepiride',
 'rosiglitazone',
 'tolazamide',
 'diag_2_code',
 'number_outpatient',
 'high_A1C_flag',
 'age_num',
 'total_visits',
 'acetohexamide',
 'glimepiride-pioglitazone',
 'number_inpatient',
 'num_medications']

In [90]:
def standardize(data):
    return ((data - np.mean(data, axis = 0)) / np.std(data, axis = 0))
# num_col is a list of all numeric features
df[num_col] = standardize(df[num_col])

In [91]:
# Handle outliers for all numerical columns

for col in num_col:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Clip the outliers
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

In [92]:
df.shape

(82367, 51)

In [93]:
df.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted',
       'total_visits', 'missing_diag', 'med_change', 'num_med', 'age_num',
       'high_A1C_flag', 'diag_1_code', 'diag_2_code', 'diag_3_code'],
      dtype='object')

In [94]:
# Columns that are categorical (object type) and need encoding
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

# Apply LabelEncoder to each categorical column
le = LabelEncoder()

for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [95]:
df['readmitted'].value_counts()

Unnamed: 0_level_0,count
readmitted,Unnamed: 1_level_1
0,72914
1,9453


In [96]:
df.isnull().sum() # high_A1C_flag, metformin-rosiglitazone

Unnamed: 0,0
race,0
gender,0
age,0
admission_type_id,0
discharge_disposition_id,0
admission_source_id,0
time_in_hospital,0
num_lab_procedures,0
num_procedures,0
num_medications,0


In [97]:
df = df.drop(['metformin-rosiglitazone', 'high_A1C_flag'], axis = 1)

# **EDA**

In [None]:
# Histogram to understand the distribution, although i don't get anything 😁
for i in df.select_dtypes(include='number').columns:
  sns.histplot(data = df, x = i)
  plt.show()

In [None]:
# Compute correlation matrix
df_corr = df.corr()

non_nan_mask = df_corr.notna().any(axis=1)
df_coll_clean = df_corr.loc[non_nan_mask, non_nan_mask]

# Plot cleaned heatmap
plt.figure(figsize=(25, 15))
sns.heatmap(df_coll_clean, vmax=.8, square=True, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
for i in df.columns:
    train = df[i].value_counts()
    print(train)

# **Modeling**

In [None]:
# Test just see if all of the above came with profit
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Split features and target
x = df.drop('readmitted', axis=1)
y = df['readmitted']

# Stratified train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=0)

# Combine X and y back together
train_df = pd.concat([x_train, y_train], axis=1)
test_df = pd.concat([x_test, y_test], axis=1)

# Save to CSV
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

# Logistic Regression model
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(x_train, y_train)

# Predict on test set
y_pred = model.predict(x_test)

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print all results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report, roc_auc_score
)

# Step 1: Split features and target
X = df.drop('readmitted', axis=1)
y = df['readmitted']

# Step 2: Stratified Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Step 3: Train Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=0
)
rf_model.fit(X_train, y_train)

# Step 4: Predict
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]  # Needed for ROC-AUC

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Step 6: Print results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
#ROC AUC

# Predict probabilities for the positive class (class 1)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Compute False Positive Rate, True Positive Rate, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Compute ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Plot ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {roc_auc:.2f})', color='blue')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Print AUC Score
print("ROC AUC Score:", roc_auc)


**SMOTE**

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Assume X, y are your features and target (readmission: 0 or 1)

# Step 1: Split your dataset
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# Step 2: Apply SMOTE to the training set only
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

print("Original dataset shape:", y_train.value_counts())
print("Resampled dataset shape:", y_train_resampled.value_counts())


In [None]:

# Step 1: Split features and target
X = df.drop('readmitted', axis=1)
y = df['readmitted']

# Step 2: Stratified Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Step 3: Train Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=0
)
rf_model.fit(x_train_resampled, y_train_resampled)

# Step 4: Predict
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]  # Needed for ROC-AUC

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Step 6: Print results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)


True Negatives (TN): 14,577 — non-readmitted correctly predicted.

False Positives (FP): 6 — very few non-readmitted misclassified as readmitted.

False Negatives (FN): 409 — fewer missed readmitted cases.

True Positives (TP): 1,482 — strong correct predictions for readmitted.

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
#ROC AUC

# Predict probabilities for the positive class (class 1)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Compute False Positive Rate, True Positive Rate, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Compute ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Plot ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {roc_auc:.2f})', color='blue')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Print AUC Score
print("ROC AUC Score:", roc_auc)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit your model

model.fit(x_train_resampled, y_train_resampled)

# Predict on training set
y_train_pred = model.predict(x_train_resampled)

# Predict on test set
y_test_pred = model.predict(X_test)

# Define a function to print metrics
def print_metrics(name, y_true, y_pred):
    print(f"\n{name} Metrics:")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall   :", recall_score(y_true, y_pred))
    print("F1 Score :", f1_score(y_true, y_pred))

# Compare training and test metrics
print_metrics("Training", y_train_resampled, y_train_pred)
print_metrics("Test", y_test, y_test_pred)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

#  Initialize the Decision Tree (ID3-style: uses 'entropy')
dtree = DecisionTreeClassifier(
    criterion='entropy',
     max_depth=4,
    class_weight='balanced',
    random_state=42
)

#  Train on SMOTE-resampled training data
dtree.fit(x_train_resampled, y_train_resampled)

#  Predict on the original test set
y_pred = dtree.predict(X_test)

#  Evaluation
print("Accuracy     :", accuracy_score(y_test, y_pred))
print("Precision    :", precision_score(y_test, y_pred))
print("Recall       :", recall_score(y_test, y_pred))
print("F1 Score     :", f1_score(y_test, y_pred) )




In [None]:
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Decision Tree)')
plt.show()

In [None]:
feature_importance = pd.DataFrame({
    'Feature': x_train_resampled.columns,
    'Importance': dtree.feature_importances_
}).sort_values(by='Importance', ascending=False).head(10)

plt.figure(figsize=(10,6))
sns.barplot(data=feature_importance, x='Importance', y='Feature')
plt.title('Top 10 Important Features - Decision Tree')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 10))
plot_tree(dtree,
          filled=True,
          feature_names=x_train_resampled.columns,
          class_names=['Not Readmitted', 'Readmitted'],
          rounded=True,
          fontsize=10)
plt.title("Decision Tree Visualization")
plt.show()

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Initialize the SVM model (RBF kernel is default)
svm_model = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    class_weight='balanced',
    random_state=42
)

# Train the model on the SMOTE-resampled training data
svm_model.fit(x_train_resampled, y_train_resampled)

# Predict on the original test set
y_pred = svm_model.predict(X_test)

# Evaluation
print("Accuracy     :", accuracy_score(y_test, y_pred))
print("Precision    :", precision_score(y_test, y_pred))
print("Recall       :", recall_score(y_test, y_pred))
print("F1 Score     :", f1_score(y_test, y_pred))


In [None]:
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (SVM)')
plt.show()


In [None]:
from sklearn.metrics import classification_report

print("Classification Report (SVM):\n")
print(classification_report(y_test, y_pred))

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

neg, pos = (y_train_resampled == 0).sum(), (y_train_resampled == 1).sum()
scale_pos_weight = (neg / pos) *4
model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=800,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight
)

model.fit(x_train_resampled, y_train_resampled)

y_pred_proba = model.predict_proba(x_test)[:, 1]

threshold = 0.3
y_pred_thresh = (y_pred_proba >= threshold).astype(int)

print(f"XGBoost Results (Threshold = {threshold}):")
print("Accuracy:", accuracy_score(y_test, y_pred_thresh))
print("Precision:", precision_score(y_test, y_pred_thresh))
print("Recall:", recall_score(y_test, y_pred_thresh))
print("F1 Score:", f1_score(y_test, y_pred_thresh))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


metrics = {
    'Accuracy': 0.6348792035935413,
    'Precision':  0.11443530291697832,
    'Recall': 0.32363828662083555,
    'F1 Score': 0.1690841276419395
}

names = list(metrics.keys())
values = list(metrics.values())

plt.figure(figsize=(8,6))
sns.barplot(x=names, y=values, palette='Blues_d')

plt.ylim(0,1)
plt.title("XGBoost Metrics (Threshold = 0.3)")
plt.ylabel("Score")
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(y_test, y_pred_thresh)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title(f"Confusion Matrix (Threshold = {threshold})")
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

plt.plot(recall, precision, marker='.', label='XGBoost')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid()
plt.show()

In [None]:
xgb.plot_importance(model, importance_type='gain', max_num_features=10)
plt.title('Top 10 Feature Importances')
plt.show()