In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [118]:
# Read the dataset into the dataframe
df = pd.read_csv('/content/diabetic_data.csv')

## **EDA**

In [None]:
df.shape # Check the size of the dataset

In [None]:
df.info() # Check the overall information

In [None]:
df.isnull().sum() # Check the missing values

In [None]:
df.duplicated().sum() # Check the duplicate values

In [None]:
for i in df.select_dtypes(include='object').columns:
  print(df[i].value_counts())
  print('---' * 10)

In [None]:
df.describe().T # Check the numerical data values

In [None]:
# Histogram to understand the distribution, although i don't get anything 😁
for i in df.select_dtypes(include='number').columns:
  sns.histplot(data = df, x = i)
  plt.show()

In [None]:
# Age and Gender Count
age_count = df["age"].value_counts()
gender_count = df["gender"].value_counts()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
age_count.plot(kind='bar', ax=ax1, color='skyblue')
gender_count.plot(kind='bar', ax=ax2, color='lightcoral')

In [None]:
# Race and Weight Count
race_count = df["race"].value_counts()
weight_count = df["weight"].value_counts()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
race_count.plot(kind='bar', ax=ax1, color='mediumseagreen')
weight_count.plot(kind='bar', ax=ax2, color='goldenrod')

In [None]:
#Readmission Rate Count
target_count = df['readmitted'].value_counts()
target_count.plot(kind = 'bar', title= 'Readmission Count')

In [None]:
# Correlation Heatmap
corr = df.select_dtypes(include = 'number').corr()
plt.figure(figsize=(12,8))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
df.columns # To divide them into diagnosis_cols and

In [None]:
diagnosis_cols = ['diag_1', 'diag_2', 'diag_3']

# Diagnosis vs Readmission
for diag in diagnosis_cols:
    plt.figure(figsize=(10, 5))
    top10 = df[diag].value_counts().iloc[:10].index  # Top 10 frequent diagnoses
    sns.countplot(data=df[df[diag].isin(top10)], x=diag, hue='readmitted')
    plt.title(f'{diag} vs Readmission')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
medication_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'examide', 'citoglipton', 'insulin',
    'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone'
]

# Medications vs Readmission
for med in medication_cols:
    plt.figure(figsize=(8, 5))
    sns.countplot(data=df, x=med, hue='readmitted')
    plt.title(f'{med} Usage vs Readmission')
    plt.xticks(rotation=45)
    plt.show()


# **Data Processing**

In [None]:
missing_info = []

# Check missing values for object columns
for col in df.select_dtypes(include = 'object').columns:
        count_missing = df[col][df[col] == '?'].count()
        percent_missing = (count_missing / df.shape[0] * 100).round(2)
        missing_info.append([col, count_missing, percent_missing])

# Create DataFrame from collected missing info
missing_value = pd.DataFrame(missing_info, columns=["col", "count_missing", "percent_missing"])
missing_value = missing_value.sort_values(by="percent_missing", ascending=False)

missing_value

In [120]:
# Drop 3 columns with too many missing '?'
df.drop(['weight', 'payer_code', 'medical_specialty'], axis=1, inplace=True)

# Drop rows based on multiple cleaning conditions:
df = df.drop(df[
    # Drop if all three diagnosis columns are missing ('?')
    ((df['diag_1'] == '?') & (df['diag_2'] == '?') & (df['diag_3'] == '?')) |

    # Drop specific 'admission_type_id' values: 5 (Not Available), 6 (NULL), 8 (Not Mapped)
    (df['admission_type_id'].isin([5, 6, 8])) |

    # Drop 'discharge_disposition_id' values indicating death or irrelevant outcomes
    (df['discharge_disposition_id'].isin([11, 13, 14, 18, 19, 20, 21, 25, 26])) |

    # Drop 'admission_source_id' values that are unavailable, NULL, or not mapped
    (df['admission_source_id'].isin([9, 15, 17, 20, 21]))
].index)


In [None]:
df.shape # Check th dimension now

In [None]:
# Drop missing values in gender
df = df.drop(set(df['gender'][df['gender'] == 'Unknown/Invalid'].index))

# Drop missing values in race
df = df.drop(set(df['race'][df['race']=='?'].index))

In [None]:
df.shape # Check th dimension now

In [123]:
# Drop irrelevant variables
df = df.drop(["encounter_id","patient_nbr"],axis=1)
df = df.drop(["citoglipton","examide"],axis = 1)

In [None]:
df.shape # Check th dimension now

In [None]:
df['readmitted'].value_counts() # Just to check the difference

In [None]:
df.isnull().sum()

In [128]:
# Drop max_glu_serum because it's almost entirely missing
df.drop('max_glu_serum', axis=1, inplace=True)

# Fill A1Cresult missing values with 'None'
df['A1Cresult'] = df['A1Cresult'].fillna('None')

In [None]:
df.shape # Check th dimension now

In [133]:
# Decoding IDs using mapping file
admission_type_map = {
    1: 'Emergency',
    2: 'Urgent',
    3: 'Elective',
    4: 'Newborn',
    7: 'Trauma Center',
}

discharge_disposition_map = {
    1: 'Discharged to home',
    2: 'Discharged/transferred to another short term hospital',
    3: 'Discharged/transferred to SNF',
    4: 'Discharged/transferred to ICF',
    5: 'Discharged/transferred to another type of inpatient care institution',
    6: 'Discharged/transferred to home with home health service',
    7: 'Left AMA',
    8: 'Discharged/transferred to home under care of Home IV provider',
    9: 'Admitted as an inpatient to this hospital',
    10: 'Neonate discharged to another hospital for neonatal aftercare',
    12: 'Still patient or expected to return for outpatient services',
    15: 'Discharged/transferred within this institution to Medicare approved swing bed',
    16: 'Discharged/transferred/referred another institution for outpatient services',
    17: 'Discharged/transferred to a psychiatric hospital of psychiatric distinct part unit of a hospital',
    22: 'Discharged/transferred to another rehab facility including rehab units of a hospital',
    23: 'Discharged/transferred to a long term care hospital',
    24: 'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare',
}

admission_source_map = {
    1: 'Physician Referral',
    2: 'Clinic Referral',
    3: 'HMO Referral',
    4: 'Transfer from a hospital',
    5: 'Transfer from a Skilled Nursing Facility (SNF)',
    6: 'Transfer from another health care facility',
    7: 'Emergency Room',
    8: 'Court/Law Enforcement',
    10: 'Transfer from critical access hospital',
    11: 'Normal Delivery',
    12: 'Premature Delivery',
    13: 'Sick Baby',
    14: 'Extramural Birth',
}

df['admission_type_id'] = df['admission_type_id'].replace(admission_type_map)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace(discharge_disposition_map)
df['admission_source_id'] = df['admission_source_id'].replace(admission_source_map)

In [135]:
# Adding some important features
df['total_visits'] = df['number_outpatient'] + df['number_emergency'] + df['number_inpatient']
df['missing_diag'] = ((df['diag_1'] == '?') | (df['diag_2'] == '?') | (df['diag_3'] == '?')).astype(int)

# List of medication columns
meds_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide',
    'metformin-pioglitazone', 'metformin-rosiglitazone', 'glimepiride-pioglitazone',
    'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide'
]

# Create new binary columns: 1 if med was changed (up or down), 0 if no change or steady (Some encoding)
for col in meds_cols:
    new_col = f"{col}_new"
    df[new_col] = df[col].apply(lambda x: 0 if x in ['No', 'Steady'] else 1)

# Sum up all the med changes to create a 'med_change' feature
df['med_change'] = df[[f"{col}_new" for col in meds_cols]].sum(axis=1)

# Drop the temporary binary columns
df.drop(columns=[f"{col}_new" for col in meds_cols], inplace=True)

In [None]:
df['med_change'].value_counts()

In [None]:
# Recode medication columns: 'No' → 0, all others ('Steady', 'Up', 'Down') → 1
for col in meds_cols:
    df[col] = df[col].replace({'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1})

# Create 'num_med' feature: sum across the medication columns
df['num_med'] = df[meds_cols].sum(axis=1)

In [None]:
df['num_med'].value_counts()

In [140]:
# calculate the outcome variable readmission
df['readmitted'] = df['readmitted'].apply(lambda x: 1 if x == '<30' else 0)

In [143]:
# Encode Age
df['age_num'] = df['age'].str.extract('(\d+)', expand=False).astype(int) + 5

In [None]:
df['age_num'].value_counts()

In [None]:
df.columns

In [150]:
df['high_A1C_flag'] = df['A1Cresult'].isin(['>7', '>8']).astype(int)

In [152]:
from sklearn.preprocessing import LabelEncoder

# Label Encode 'race', 'A1Cresult', 'max_glu_serum'
label_cols = ['race', 'A1Cresult']

# Initialize LabelEncoder
le = LabelEncoder()

for col in label_cols:
    df[col] = le.fit_transform(df[col])

In [None]:
df['diag_1'].value_counts()

In [165]:
# Safe numeric conversion first
df['diag_1_num'] = pd.to_numeric(df['diag_1'], errors='coerce')
df['diag_2_num'] = pd.to_numeric(df['diag_2'], errors='coerce')
df['diag_3_num'] = pd.to_numeric(df['diag_3'], errors='coerce')

# Initialize new columns
df['diag_1_code'] = 0
df['diag_2_code'] = 0
df['diag_3_code'] = 0

# Mapping for diag_1
df.loc[df['diag_1'].astype(str).str.startswith('V'), 'diag_1_code'] = 1
df.loc[df['diag_1'].astype(str).str.startswith('E'), 'diag_1_code'] = 2
df.loc[((df['diag_1_num'] >= 390) & (df['diag_1_num'] < 460)) | (df['diag_1_num'] == 785), 'diag_1_code'] = 3
df.loc[((df['diag_1_num'] >= 460) & (df['diag_1_num'] < 520)) | (df['diag_1_num'] == 786), 'diag_1_code'] = 4
df.loc[((df['diag_1_num'] >= 520) & (df['diag_1_num'] < 580)) | (df['diag_1_num'] == 787), 'diag_1_code'] = 5
df.loc[(df['diag_1_num'] >= 250) & (df['diag_1_num'] < 251), 'diag_1_code'] = 6
df.loc[(df['diag_1_num'] >= 800) & (df['diag_1_num'] < 1000), 'diag_1_code'] = 7
df.loc[(df['diag_1_num'] >= 710) & (df['diag_1_num'] < 740), 'diag_1_code'] = 8
df.loc[((df['diag_1_num'] >= 580) & (df['diag_1_num'] < 630)) | (df['diag_1_num'] == 788), 'diag_1_code'] = 9
df.loc[(df['diag_1_num'] >= 140) & (df['diag_1_num'] < 240), 'diag_1_code'] = 10

# Mapping for diag_2
df.loc[df['diag_2'].astype(str).str.startswith('V'), 'diag_2_code'] = 1
df.loc[df['diag_2'].astype(str).str.startswith('E'), 'diag_2_code'] = 2
df.loc[((df['diag_2_num'] >= 390) & (df['diag_2_num'] < 460)) | (df['diag_2_num'] == 785), 'diag_2_code'] = 3
df.loc[((df['diag_2_num'] >= 460) & (df['diag_2_num'] < 520)) | (df['diag_2_num'] == 786), 'diag_2_code'] = 4
df.loc[((df['diag_2_num'] >= 520) & (df['diag_2_num'] < 580)) | (df['diag_2_num'] == 787), 'diag_2_code'] = 5
df.loc[(df['diag_2_num'] >= 250) & (df['diag_2_num'] < 251), 'diag_2_code'] = 6
df.loc[(df['diag_2_num'] >= 800) & (df['diag_2_num'] < 1000), 'diag_2_code'] = 7
df.loc[(df['diag_2_num'] >= 710) & (df['diag_2_num'] < 740), 'diag_2_code'] = 8
df.loc[((df['diag_2_num'] >= 580) & (df['diag_2_num'] < 630)) | (df['diag_2_num'] == 788), 'diag_2_code'] = 9
df.loc[(df['diag_2_num'] >= 140) & (df['diag_2_num'] < 240), 'diag_2_code'] = 10

# Mapping for diag_3
df.loc[df['diag_3'].astype(str).str.startswith('V'), 'diag_3_code'] = 1
df.loc[df['diag_3'].astype(str).str.startswith('E'), 'diag_3_code'] = 2
df.loc[((df['diag_3_num'] >= 390) & (df['diag_3_num'] < 460)) | (df['diag_3_num'] == 785), 'diag_3_code'] = 3
df.loc[((df['diag_3_num'] >= 460) & (df['diag_3_num'] < 520)) | (df['diag_3_num'] == 786), 'diag_3_code'] = 4
df.loc[((df['diag_3_num'] >= 520) & (df['diag_3_num'] < 580)) | (df['diag_3_num'] == 787), 'diag_3_code'] = 5
df.loc[(df['diag_3_num'] >= 250) & (df['diag_3_num'] < 251), 'diag_3_code'] = 6
df.loc[(df['diag_3_num'] >= 800) & (df['diag_3_num'] < 1000), 'diag_3_code'] = 7
df.loc[(df['diag_3_num'] >= 710) & (df['diag_3_num'] < 740), 'diag_3_code'] = 8
df.loc[((df['diag_3_num'] >= 580) & (df['diag_3_num'] < 630)) | (df['diag_3_num'] == 788), 'diag_3_code'] = 9
df.loc[(df['diag_3_num'] >= 140) & (df['diag_3_num'] < 240), 'diag_3_code'] = 10

# Optional: Drop intermediate numeric columns if you want
df.drop(['diag_1_num', 'diag_2_num', 'diag_3_num'], axis=1, inplace=True)

In [None]:
df['diag_1_code'].value_counts()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
num_col = list(set(list(df._get_numeric_data().columns))- {'readmitted'})
num_col

In [175]:
def standardize(data):
    return ((data - np.mean(data, axis = 0)) / np.std(data, axis = 0))
# num_col is a list of all numeric features
df[num_col] = standardize(df[num_col])

In [177]:
# Handle outliers for all numerical columns

for col in num_col:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Clip the outliers
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

In [None]:
df.shape

In [None]:
df.columns

In [187]:
# Columns that are categorical (object type) and need encoding
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

# Apply LabelEncoder to each categorical column
le = LabelEncoder()

for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

In [None]:
df['readmitted'].value_counts()

In [None]:
df.isnull().sum() # high_A1C_flag, metformin-rosiglitazone

In [192]:
df = df.drop(['metformin-rosiglitazone', 'high_A1C_flag'], axis = 1)

In [198]:
# Test just see if all of the above came with profit
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Split features and target
x = df.drop('readmitted', axis=1)
y = df['readmitted']

# Stratified train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=0)

# Logistic Regression model
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(x_train, y_train)

# Predict on test set
y_pred = model.predict(x_test)

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print all results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)

Accuracy: 0.6572174335316255
Precision: 0.17575966850828728
Recall: 0.5383395029085141
F1 Score: 0.26500065078745283
Confusion Matrix:
 [[9809 4774]
 [ 873 1018]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.67      0.78     14583
           1       0.18      0.54      0.27      1891

    accuracy                           0.66     16474
   macro avg       0.55      0.61      0.52     16474
weighted avg       0.83      0.66      0.72     16474



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
