In [2]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import classification_report

# Load data
df = pd.read_csv("synthetic_blood_donation_data_multiselect.csv")

# Convert LastDonationDate to DaysSinceLastDonation.
# NOTE(review): the reference date is "today" at run time, so this feature's values
# depend on the day the notebook is executed — confirm that inference code derives
# the feature the same way.
df['LastDonationDate'] = pd.to_datetime(df['LastDonationDate'])
today = pd.to_datetime(datetime.today().date())
df['DaysSinceLastDonation'] = (today - df['LastDonationDate']).dt.days
df = df.drop(columns=['LastDonationDate'])

# Define multiselect columns (comma-separated lists stored in a single cell)
multiselect_columns = ['ChronicIllnessDetails', 'MedicationDetails', 'VaccineDetails', 'AllergyDetails']

# Clean & split multiselect values: missing -> empty list, otherwise a list of
# stripped, non-empty items.
for col in multiselect_columns:
    df[col] = df[col].fillna("").apply(lambda x: [item.strip() for item in x.split(',') if item.strip() != ""])

# Apply one MultiLabelBinarizer per column. A single shared instance would be refit
# on every iteration and end up holding only the last column's classes, so each
# fitted instance is kept separately in `mlb_by_column`.
mlb_by_column = {}
mlb_dfs = []

for col in multiselect_columns:
    mlb = MultiLabelBinarizer()  # `mlb` stays bound to the last column's binarizer after the loop
    binarized = pd.DataFrame(
        mlb.fit_transform(df[col]),
        columns=[f"{col}_{cls}" for cls in mlb.classes_],
        index=df.index,  # share df's index so the axis=1 concat below cannot misalign rows
    )
    mlb_by_column[col] = mlb
    mlb_dfs.append(binarized)

# Drop original multiselect columns and concatenate the indicator columns
df = df.drop(columns=multiselect_columns)
df = pd.concat([df] + mlb_dfs, axis=1)

# Encode the remaining object-dtype feature columns as integer codes; the fitted
# encoders are kept per column. The target column 'Eligible' is left untouched.
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object' and column != 'Eligible':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# Split into features and target
X = df.drop(columns=['Eligible'])
y = df['Eligible']

# Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model; class_weight='balanced' reweights classes inversely to their frequency
model = RandomForestClassifier(random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Evaluate the model on the held-out split and display the per-class metrics
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)
report


{'No': {'precision': 0.9888535031847133,
  'recall': 0.9957242116515232,
  'f1-score': 0.9922769640479361,
  'support': 1871.0},
 'Yes': {'precision': 0.9310344827586207,
  'recall': 0.8372093023255814,
  'f1-score': 0.8816326530612245,
  'support': 129.0},
 'accuracy': 0.9855,
 'macro avg': {'precision': 0.959943992971667,
  'recall': 0.9164667569885523,
  'f1-score': 0.9369548085545802,
  'support': 2000.0},
 'weighted avg': {'precision': 0.9851241763672304,
  'recall': 0.9855,
  'f1-score': 0.9851404059892932,
  'support': 2000.0}}

In [3]:
import joblib

# Build one binarizer per multiselect field.
# The shared `mlb` object was refit on every field in turn, so it only holds the
# classes of the last field; storing it under every key would save the wrong
# vocabulary for all the other fields. The original list columns are no longer in
# `df`, so each field's classes are recovered from its indicator-column names,
# which were built as f"{col}_{cls}" in `classes_` order.
mlb_dict = {}
for col, bin_df in zip(multiselect_columns, mlb_dfs):
    prefix = f"{col}_"
    classes = [name[len(prefix):] for name in bin_df.columns]
    # With `classes` given, fit() adopts that exact class order, so transform()
    # reproduces the same indicator columns that the model was trained on.
    mlb_dict[col] = MultiLabelBinarizer(classes=classes).fit([classes])

# Save all artifacts
joblib.dump(model, 'BloodDonationEligibilityModel3.joblib')
joblib.dump(label_encoders, 'LabelEncoders3.joblib')
joblib.dump(mlb_dict, 'MultiLabelBinarizers3.joblib')


['MultiLabelBinarizers3.joblib']