# Exploring the Data 

### Part 1: Building up a basic predictive model 

### Data Cleaning and Transformation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

# Load the raw dataset from 'diabetic_data.csv' (path is relative to the working directory).
df = pd.read_csv('diabetic_data.csv')

In [None]:
# Print pandas' per-column summary of the DataFrame.
df.info()

In [None]:
# Report the (rows, columns) dimensions of the freshly loaded table.
raw_shape = df.shape
print("Shape of the data:", raw_shape)

In [None]:
# Remove the two identifier columns in a single in-place call, then show
# what is left.
df.drop(columns=['encounter_id', 'patient_nbr'], inplace=True)

print(df)

In [None]:
# Preview the first 30 rows. Only the last expression of a notebook cell is
# rendered automatically, so the preview has to be printed explicitly or it
# is silently discarded.
print(df.head(30))

# Count the values pandas already recognises as missing in each column.
print(df.isnull().sum())

In [None]:
# '?' is used as a missing-value placeholder throughout the table.
df.replace('?', np.nan, inplace=True)

# In these two columns the literal string 'None' is also treated as missing.
# Assign the result back instead of calling replace(..., inplace=True) on
# df[col]: that form is chained assignment, which emits a warning on recent
# pandas releases and does not modify df at all under Copy-on-Write.
for column in ('A1Cresult', 'max_glu_serum'):
    df[column] = df[column].replace('None', np.nan)

# Store the ID code columns as object dtype so later dtype-based selections
# handle them as categorical rather than numeric.
for column in ('admission_type_id', 'discharge_disposition_id', 'admission_source_id'):
    df[column] = df[column].astype(object)

df.head()

In [None]:
# Per-column count of missing values after the placeholder clean-up.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Class balance of the raw three-level target.
print(df['readmitted'].value_counts())

# Collapse the target to binary: 1 for '<30', 0 for '>30' and 'NO'.
readmitted_codes = {'<30': 1, '>30': 0, 'NO': 0}
df['readmitted'] = df['readmitted'].map(readmitted_codes)

# Class balance after recoding.
print(df['readmitted'].value_counts())


In [None]:
# List the dtype pandas currently holds for every column.
column_types = df.dtypes
print(column_types)

In [None]:
# Share of missing entries per column, expressed as a percentage.
missing_percentages = df.isna().mean().mul(100)

# Columns that are strictly more than 90% empty are removed.
mostly_missing = missing_percentages > 90
cols_to_drop = missing_percentages.loc[mostly_missing].index
df = df.drop(columns=cols_to_drop)

df.head()

In [None]:
# Columns the author identified as having no variation at all.
df = df.drop(columns=['examide', 'citoglipton'])

# Columns the author identified as near zero-variance.
cols_to_drop = [
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'tolbutamide',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]
df = df.drop(columns=cols_to_drop)

df.head()

In [None]:
# Keep only fully populated rows: dropna's defaults (axis=0, how='any')
# remove every row that contains at least one missing value, in place.
df.dropna(inplace=True)

In [None]:
# Report how many rows remain in the dataset.
num_rows = len(df)
print("Number of rows in the dataset:", num_rows)


In [None]:
from scipy.stats import zscore

# Show which columns survived the earlier cleaning steps.
print(df.columns)

# ID code columns and the target must not take part in outlier detection.
# The list previously named 'admission_id', which is not a column of this
# table; combined with errors='ignore' the typo went unnoticed and
# 'admission_type_id' was z-scored as if it were a quantity.
columns_to_exclude = ['patient_nbr', 'admission_type_id', 'admission_source_id',
                      'discharge_disposition_id', 'readmitted']

# errors='ignore' tolerates names that were already dropped earlier.
df_numeric = df.drop(columns_to_exclude, axis=1, errors='ignore')

# Only genuinely numeric columns are scored. Coercing string columns with
# pd.to_numeric would turn them into (partly) NaN columns whose z-scores are
# NaN, so they could never flag a row anyway.
df_numeric = df_numeric.select_dtypes(include='number')

# Column-wise z-scores.
z_scores = df_numeric.apply(zscore)

# A row is an outlier when any of its numeric values lies more than
# 5 standard deviations from that column's mean.
outliers = (z_scores.abs() > 5).any(axis=1)

# Keep the inliers. The explicit copy makes df an independent frame so that
# later column assignments do not raise SettingWithCopyWarning.
df = df[~outliers].copy()

# Display summary statistics after removing outliers
print("Summary statistics after removing outliers:")
print(df.describe())


In [None]:
# Dimensions of the table once cleaning is finished.
cleaned_shape = df.shape
print(f"Shape of the cleaned data: {cleaned_shape}")

## Data Visualisation

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the binary target computed from the cleaned DataFrame `df`.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='readmitted', data=df, ax=ax)
ax.set_title('Distribution of Readmission within 30 Days')
ax.set_xlabel('Readmitted')
ax.set_ylabel('Count')
# Replace the 0/1 tick labels with readable class names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Readmitted', 'Readmitted'])
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Cast the target to strings. Note that this overwrites the column in `df`,
# so every later cell sees 'readmitted' as text.
df['readmitted'] = df['readmitted'].astype(str)

# Readmission counts per age group, one bar per target class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='age', hue='readmitted', data=df, ax=ax)
ax.set_title('Count of Readmitted Cases vs. Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.xticks(rotation=45)  # Slanted tick labels are easier to read
ax.legend(title='Readmitted', loc='upper right')
fig.tight_layout()
plt.show()


In [None]:
# Readmission counts for every distinct number of medications.
fig, ax = plt.subplots(figsize=(14, 10))
sns.countplot(x='num_medications', hue='readmitted', data=df, ax=ax)
ax.set_title('Count of Target Variable Against the Number of Medications')
ax.set_xlabel('Number of Medications')
ax.set_ylabel('Count')
ax.legend(title='Readmitted', labels=['Not Readmitted', 'Readmitted'])
plt.xticks(rotation=90)  # Vertical tick labels; adjust if there are few distinct counts
plt.show()

In [None]:
# Restrict the frame to its numeric columns.
numerical_columns = df.select_dtypes(include='number')

# Pairwise scatter plots of every numeric column against every other.
pd.plotting.scatter_matrix(frame=numerical_columns, figsize=(20, 20))
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlations are only defined for numeric columns. Calling df.corr() on the
# whole frame raises on pandas >= 2.0 as soon as a string-valued column is
# present, so restrict the frame to numeric dtypes first.
corr_matrix = df.select_dtypes(include='number').corr()

# Plotting the heatmap
plt.figure(figsize=(14, 12))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
#Boxplot of Time in Hospital by Readmission Status 

import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of length of stay, split by readmission status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='readmitted', y='time_in_hospital', data=df, ax=ax)
ax.set_title('Hospital Stay Length by Readmission Status')
ax.set_xlabel('Readmitted (0 = No, 1 = Yes)')
ax.set_ylabel('Time in Hospital (Days)')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the number of lab procedures, split by readmission status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='readmitted', y='num_lab_procedures', data=df, ax=ax)
ax.set_title('Box Plot of Number of Lab Procedures by Readmission Status')
ax.set_xlabel('Readmitted (0 = No, 1 = Yes)')
ax.set_ylabel('Number of Lab Procedures')
plt.show()

In [None]:
# Histogram of the 'age' column with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='age', bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

## Model Building

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Separate predictors (X) and target variable (y)
X = df.drop(columns=['readmitted'])
y = df['readmitted']

# Integer-encode every string-valued predictor so the scoring function can
# consume it. 'age' is presumably among the object columns (verify); it is
# label-encoded like the rest, not one-hot encoded.
# The previously created OneHotEncoder(sparse=False) was never used, and the
# `sparse` keyword no longer exists in scikit-learn >= 1.4, where it raised
# TypeError before any feature selection could run.
label_encoder = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_encoder.fit_transform(X[column])

# Score all features with the ANOVA F-test and keep the ten best.
# SelectKBest passes the complete feature matrix to score_func in one call,
# so a score function cannot switch between f_classif and chi2 per column;
# a dtype check on that matrix only ever chooses one test for all columns.
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)

# Get selected feature indices
selected_indices = selector.get_support(indices=True)

# Get names of selected features
selected_features = X.columns[selected_indices]

print("Selected predictors:")
for feature in selected_features:
    print(feature)

In [None]:
# Display the current dtype of every column in df.
df.dtypes

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Predictors used by the baseline model, in the column order X will have.
selected_predictors = [
    'discharge_disposition_id', 'time_in_hospital', 'num_lab_procedures',
    'num_medications', 'number_emergency', 'number_inpatient',
    'number_diagnoses', 'metformin', 'glipizide', 'diabetesMed',
]

# The same predictors grouped by the preprocessing they receive.
categorical_cols = ['discharge_disposition_id', 'metformin', 'glipizide', 'diabetesMed']
numerical_cols = [
    'time_in_hospital', 'num_lab_procedures', 'num_medications',
    'number_emergency', 'number_inpatient', 'number_diagnoses',
]

# Feature matrix and target.
X = df[selected_predictors]
y = df['readmitted']

# Standardise the numeric columns and one-hot encode the categorical ones;
# categories unseen during fitting are ignored at transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
column_steps = [
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Preprocessing and classifier chained into one estimator.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
]
model = Pipeline(steps=pipeline_steps)

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split.
model.fit(X_train, y_train)

# 5-fold cross-validated accuracy on the training split.
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())


In [None]:
# Names of the columns currently held in the feature matrix X.
feature_columns = X.columns
print(feature_columns)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# X and y are expected to be defined by an earlier cell.

# Dummy-encode every categorical predictor. 'discharge_disposition_id' holds
# ID codes and is listed as categorical in the pipeline-based model; leaving
# it out of this list would feed the raw codes to the classifier as if they
# were a continuous quantity.
categorical_predictors = ['discharge_disposition_id', 'metformin', 'glipizide', 'diabetesMed']
X_encoded = pd.get_dummies(X, columns=categorical_predictors, drop_first=True)

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# The metric functions below use their default positive label (1), so the
# target must be integer 0/1 rather than strings.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Training the Logistic Regression model
model = LogisticRegression(solver='liblinear', max_iter=100)  # Adjust max_iter as needed based on convergence
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)
y_probs = model.predict_proba(X_test)[:, 1]  # Probabilities of the positive class

# Model score (accuracy)
model_score = model.score(X_test, y_test)

# Compute performance metrics. zero_division=0 keeps the reported value at 0
# (the default fallback) without a warning when no positive is predicted.
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, y_probs)
pr_auc = average_precision_score(y_test, y_probs)

# Print the computed metrics
print(f"Model Accuracy: {model_score:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"PR AUC: {pr_auc:.4f}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# X and y come from an earlier cell. The target is cast to int so the metric
# functions do not fail on pos_label.
y = y.astype(int)

# Split the predictors by dtype: object/category columns are one-hot encoded,
# everything else is standardised.
categorical_dtypes = ['object', 'category']
categorical_features = X.select_dtypes(include=categorical_dtypes).columns.tolist()
numeric_features = X.select_dtypes(exclude=categorical_dtypes).columns.tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

model = GradientBoostingClassifier(random_state=42)

# imblearn pipeline: preprocess, oversample with SMOTE, then classify.
oversample_steps = [
    ('preprocessor', preprocessor),
    ('oversampler', SMOTE(random_state=42)),
    ('classifier', model),
]
oversample_pipeline = ImbPipeline(steps=oversample_steps)
oversample_pipeline.fit(X_train, y_train)

# Hard predictions (forced to int) and positive-class probabilities.
y_pred_over = oversample_pipeline.predict(X_test).astype(int)
y_probs_over = oversample_pipeline.predict_proba(X_test)[:, 1]

# Model score (accuracy)
model_score = oversample_pipeline.score(X_test, y_test)

print("\nOversampling Performance:")
precision = precision_score(y_test, y_pred_over)
recall = recall_score(y_test, y_pred_over)
f1 = f1_score(y_test, y_pred_over)
roc_auc = roc_auc_score(y_test, y_probs_over)
pr_auc = average_precision_score(y_test, y_probs_over)

# One line per metric, four decimals each.
for metric_label, metric_value in (
    ("Model Accuracy", model_score),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
    ("ROC AUC", roc_auc),
    ("PR AUC", pr_auc),
):
    print(f"{metric_label}: {metric_value:.4f}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline  # used below; import it here so the cell runs on its own
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the dataset
diabetic_df = pd.read_csv('diabetic_data.csv')

# Drop both identifier columns, matching the cleaning done for the first
# model. Leaving 'patient_nbr' in would let it be picked up as a numeric
# feature by the dtype-based selection below.
diabetic_df.drop(['encounter_id', 'patient_nbr'], axis=1, inplace=True)
# Replace '?' with np.nan
diabetic_df.replace('?', np.nan, inplace=True)
# Convert 'readmitted' into binary (1 for <30, 0 for others)
diabetic_df['readmitted'] = diabetic_df['readmitted'].map({'<30': 1, '>30': 0, 'NO': 0})

# Store the ID code columns as object dtype, as in the first cleaning pass,
# so they are one-hot encoded instead of being scaled like quantities.
for id_column in ('admission_type_id', 'discharge_disposition_id', 'admission_source_id'):
    diabetic_df[id_column] = diabetic_df[id_column].astype(object)

# Separate features and target
X = diabetic_df.drop('readmitted', axis=1)
y = diabetic_df['readmitted']

# Numeric features: fill gaps with the column mean, then standardise.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at transform time.
categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

# `preprocessor` and `X` are expected to come from the previous cell.

# Fit the preprocessing steps and transform the full feature table.
X_preprocessed = preprocessor.fit_transform(X)

# Reduce to two components for plotting. Objects exposing `toarray` are
# treated as sparse and go through TruncatedSVD; anything else goes through PCA.
is_sparse_output = hasattr(X_preprocessed, "toarray")
if is_sparse_output:
    svd = TruncatedSVD(n_components=2, random_state=12)
    X_reduced = svd.fit_transform(X_preprocessed)
else:
    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(X_preprocessed)

# Cluster in the full preprocessed feature space (not the 2-D projection).
kmeans = KMeans(n_clusters=5, random_state=12)
clusters = kmeans.fit_predict(X_preprocessed)

# One scatter series per cluster, drawn on the two reduced components.
plt.figure(figsize=(8, 6))
for cluster_number in range(5):  # Assuming 5 clusters
    in_cluster = clusters == cluster_number
    plt.scatter(
        X_reduced[in_cluster, 0],
        X_reduced[in_cluster, 1],
        label=f'Cluster {cluster_number}',
        alpha=0.5,
    )
plt.title('K-Means Clustering with Dimensionality Reduction')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.legend()
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.decomposition import TruncatedSVD
import numpy as np

# X_preprocessed and y are expected to come from the earlier cells; the
# cluster assignments are computed here.

# Train-test split
# NOTE(review): X_preprocessed was produced by fitting the preprocessor on
# the full table before this split, so imputation/scaling statistics already
# include the test rows. Only the clustering below is restricted to training data.
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Fit the clustering on the training rows only.
kmeans = KMeans(n_clusters=5, random_state=12)
clusters_train = kmeans.fit_predict(X_train)

# Plain arrays for the labels, so the boolean masks below select by position
# and never interact with the pandas index.
y_train_values = np.asarray(y_train)
y_test_values = np.asarray(y_test)

# Assign every test row to its nearest training centroid. This does not
# depend on the cluster being processed, so it is computed once instead of
# once per loop iteration.
test_cluster_labels = kmeans.predict(X_test)

cluster_models = {}
cluster_predictions = {}

# Create a model for each cluster in the training set
for cluster_label in np.unique(clusters_train):
    # Training rows that fall in the current cluster.
    indices = clusters_train == cluster_label

    # Initialize and fit the RandomForest model
    model = RandomForestClassifier(random_state=12)
    model.fit(X_train[indices], y_train_values[indices])
    cluster_models[cluster_label] = model

    # Predict only for the test rows routed to this cluster; a cluster may
    # receive no test rows at all.
    indices_test_cluster = test_cluster_labels == cluster_label
    if np.any(indices_test_cluster):
        y_pred_test = model.predict(X_test[indices_test_cluster])
        cluster_predictions[cluster_label] = (indices_test_cluster, y_pred_test)

# Evaluate each cluster's model
for cluster_label, (indices_test_cluster, y_pred_test) in cluster_predictions.items():
    # True labels of the test rows that were routed to this cluster.
    y_test_cluster = y_test_values[indices_test_cluster]

    accuracy = accuracy_score(y_test_cluster, y_pred_test)
    print(f"Cluster {cluster_label} Model Accuracy: {accuracy}")


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

# Oversample the training split with SMOTE; the test split is left untouched.
smote = SMOTE(random_state=12)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Random forest fitted on the balanced training data (fit returns the estimator).
model_balanced = RandomForestClassifier(random_state=12).fit(X_train_bal, y_train_bal)

# Score the original test split and print the per-class report.
y_pred_bal = model_balanced.predict(X_test)
balanced_report = classification_report(y_test, y_pred_bal)
print(balanced_report)
