In [2]:
import pandas as pd
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix


In [3]:
# Read the hospital readmissions CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/hospital-readmissions/hospital_readmissions.csv')
# Show the first rows as the cell output for a quick look at the columns and values.
df.head()

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(25000, 17)

In [5]:
# Column labels of the loaded frame.
df.columns

Index(['age', 'time_in_hospital', 'n_lab_procedures', 'n_procedures',
       'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency',
       'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'glucose_test',
       'A1Ctest', 'change', 'diabetes_med', 'readmitted'],
      dtype='object')

In [6]:
# Count missing (NaN/None) entries in each column.
df.isna().sum()

age                  0
time_in_hospital     0
n_lab_procedures     0
n_procedures         0
n_medications        0
n_outpatient         0
n_inpatient          0
n_emergency          0
medical_specialty    0
diag_1               0
diag_2               0
diag_3               0
glucose_test         0
A1Ctest              0
change               0
diabetes_med         0
readmitted           0
dtype: int64

In [9]:
# Define a function to clean and convert age ranges
def age_to_numeric(age_str):
    """Convert an age value or a bracketed age range to a single number.

    A string containing two numbers (for example ``'[70-80)'``) is mapped to
    the midpoint of those two numbers. A string containing exactly one number
    is mapped to that number. Anything that is not a string is handed
    directly to ``float``.

    Parameters
    ----------
    age_str : str or number
        The raw age value, e.g. ``'[70-80)'``, ``'45'`` or ``45``.

    Returns
    -------
    float
        The midpoint of the range, or the single value as a float.

    Raises
    ------
    ValueError
        If a string contains no number, or more than two numbers.
    """
    if not isinstance(age_str, str):
        return float(age_str)

    # Extract whole numeric tokens (with an optional decimal part) rather than
    # deleting every non-digit character: deleting would silently glue digits
    # together, turning '72.5' into 725 and '70 80' into 7080.
    bounds = [float(token) for token in re.findall(r'\d+(?:\.\d+)?', age_str)]

    if len(bounds) == 1:
        return bounds[0]
    if len(bounds) == 2:
        return (bounds[0] + bounds[1]) / 2
    raise ValueError(f'Cannot parse an age from {age_str!r}')

# Replace the 'age' column with the numeric value age_to_numeric returns for each entry.
df['age'] = df['age'].apply(age_to_numeric)

In [10]:
# Expand each categorical column into indicator columns; drop_first=True omits
# the first level of every column so the indicators are not redundant.
categorical_columns = [
    'medical_specialty',
    'diag_1',
    'diag_2',
    'diag_3',
    'glucose_test',
    'A1Ctest',
    'change',
    'diabetes_med',
]
df_encoded = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
)

In [11]:
# Columns that hold numeric features (as opposed to the one-hot indicators).
numerical_columns = [
    'age',
    'time_in_hospital',
    'n_lab_procedures',
    'n_procedures',
    'n_medications',
    'n_outpatient',
    'n_inpatient',
    'n_emergency',
]


In [12]:

# Standardize numerical features using statistics from the training rows only.
# Fitting the scaler on the whole frame would let the test rows' means and
# variances leak into the features the model is trained on.
scaler = StandardScaler()
# Same test_size / random_state as the train/test split applied to X and y
# further down, so these are the rows that end up in X_train. Keep the two
# calls in sync if either value is changed.
train_rows = train_test_split(df_encoded.index, test_size=0.3, random_state=42)[0]
scaler.fit(df_encoded.loc[train_rows, numerical_columns])
# Apply the training-set scaling to every row (train and test alike).
df_encoded[numerical_columns] = scaler.transform(df_encoded[numerical_columns])

In [13]:
# Target is the 'readmitted' column; every other column is a feature.
y = df_encoded['readmitted']
X = df_encoded.drop(columns='readmitted')

# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [14]:
# Create a logistic-regression classifier (max_iter=1000, fixed random_state)
# and fit it on the training split.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [15]:
# Predict a class label for every row of the held-out test split.
y_pred = model.predict(X_test)

In [18]:
# Evaluate the model: score the 'yes' class with each metric, then print the
# per-class report and the confusion matrix.
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label='yes')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
# sep='\n' puts each heading on its own line, directly above its table.
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

Precision: 0.62
Recall: 0.41
F1 Score: 0.50
Classification Report:
              precision    recall  f1-score   support

          no       0.60      0.78      0.68      4000
         yes       0.62      0.41      0.50      3500

    accuracy                           0.61      7500
   macro avg       0.61      0.60      0.59      7500
weighted avg       0.61      0.61      0.60      7500

Confusion Matrix:
[[3130  870]
 [2055 1445]]
