# VCA CODE

In [95]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv("data csv with titles.csv")

## Data cleaning

In [None]:
def clean_data(df):
    """Drop unused columns, impute missing values and encode categoricals.

    The passed frame is modified in place by the first steps (column drop,
    "." replacement, dtype casts), so callers that need the raw data should
    pass a copy. The returned frame is a new object produced by the later
    ``fillna`` / ``get_dummies`` reassignments.

    Steps, in order (the order matters: BMI is back-filled from the imputed
    height/weight, and categories are added before filling):
      1. drop the columns listed in ``columns_to_drop``;
      2. turn "." placeholders into NaN and derive the ``5_year`` flag;
      3. cast numeric columns and impute them (mode / KNN / median);
      4. fill categorical gaps with fixed codes ('U', 'N', '0', ...);
      5. boolean-encode the binary columns and one-hot encode the rest.

    Returns:
        The cleaned, encoded DataFrame.
    """
    # Columns to drop
    columns_to_drop = [
        'WGT_KG_TCR', 'HGT_CM_TCR', 'BMI_TCR', 'PERM_STATE_TRR', 'PRI_PAYMENT_TRR', 'PRI_PAYMENT_CTRY_TRR',
        'PX_STAT_TRR', 'PX_STAT_DATE_TRR', 'COD', 'PX_STAT_DATE', 'HBV_CORE_DON', 'HBV_SUR_ANTIGEN_DON',
        'HCV_ANTIBODY_DON', 'HBV_NAT_DON', 'HCV_NAT_DON', 'HIV_NAT_DON', 'LIV_DON_TY', 'NON_HRT_DON', 
        'TXHRT', 'TXLNG', 'VCA_TY_MULTI', 'REGION', 'PREV_TX_ANY_N', 'PREV_TX', 'RETXDATE', 'INO_PROCURE_AGENT_3', 
        'HBSAB_DON', 'EBV_IGG_CAD_DON', 'EBV_IGM_CAD_DON', 'ECD_DONOR', 'CDC_RISK_HIV_DON', 'RECOV_COUNTRY_DON', 
        'RECOV_OUT_US_DON', 'CONTROLLED_DON', 'REFERRAL_DATE', 'RESUSCIT_DUR_DON', 'EDUCATION_DON', 
        'LT_ONE_WEEK_DON', 'ADMIT_DATE_DON', 'DONOR_ID', 'VCA_CANDIDATE_ID_CODE', 'EDUCATION', 'WORK_INCOME', 
        'WORK_YES_STATUS', 'WORK_NO_STATUS', 'GRANT_FUNDING', 'INSTITUTIONAL_FUNDING', 'SECONDARY_PAY', 
        'UPPER_LIMB_LEFT_AMP_LEVEL', 'UPPER_LIMB_RIGHT_AMP_LEVEL', 'HOSP_90_DAYS', 'VENTILATOR', 'OTH_LIFE_SUP', 
        'HBV_CORE', 'HBV_SUR_ANTIGEN', 'HCV_SEROSTATUS', 'PREV_PREG', 'MALIG', 'MALIG_TY_SKINMELANOMA', 
        'MALIG_TY_SKINNONMELANOMA', 'MALIG_TY_CNSTUMOR', 'MALIG_TY_GENITOURINARY', 'MALIG_TY_BREAST', 
        'MALIG_TY_THYROID', 'MALIG_TY_TONGUETHROATLARYNX', 'MALIG_TY_LUNG', 'MALIG_TY_LEUKEMIALYMPHOMA', 
        'MALIG_TY_LIVER', 'MALIG_TY_HEPATOCELLULARCARCINOMA', 'MALIG_TY_OTHERSPECIFY', 'COAGULOPATHIES', 
        'COGNITIVE_DEV', 'MOTOR_DEV', 'DASH_SCORE', 'CARROLL_SCORE_LEFT', 'CARROLL_SCORE_RIGHT', 
        'PHYSICAL_FUNC_SCORE', 'ROLE_PHYS_SCORE', 'BODILY_PAIN_SCORE', 'GENERAL_HEALTH_SCORE', 'VITALITY_SCORE', 
        'SOCIAL_FUNC_SCORE', 'ROLE_EMOTIONAL_SCORE', 'MENTAL_HEALTH_SCORE', 'WARM_ISCH_TM_LEFT', 
        'COLD_ISCH_TM_LEFT', 'WARM_ISCH_TM_RIGHT', 'COLD_ISCH_TM_RIGHT', 'ISCHEMIA_LEFT', 'ISCHEMIA_RIGHT', 
        'DISCH_SERUM_CREAT', 'DISCH_HEMOGLOBIN_A1C', 'CRANIOFACIAL_AMT_TISSUE_LOSS', 'WARM_ISCH_TM', 
        'COLD_ISCH_TM', 'ISCHEMIA', 'ABDOMINAL_WALL', 'LOWER_LIMB_LEFT_AMP_LEVEL', 'LOWER_LIMB_RIGHT_AMP_LEVEL', 
        'INIT_DATE', 'REM_CD', 'DEATH_DATE', 'END_DATE', 'COD_WL', 'INIT_HGT_CM', 'INIT_WGT_KG', 'INIT_AGE', 
        'DAYSWAIT_CHRON', 'INIT_REGION', 'INIT_CPRA', 'SKIN_TY', 'WLPI', 'WLPA', 'WLKP', 'WLKI', 'WLLI', 'WLIN', 
        'WLLU', 'WLHR', 'WLHL', 'LIST_YEAR', 'WL_ID_CODE', 'VCA_TY_CD', 'ETHNICITY', 'BODY_PART', 
        'TRANSPLANT_TIME', 'TRANSPLANTTIMEZONEID', 'DATA_TRANSPLANT', 'DATA_WAITLIST', 'CTR_CODE', 
        'OPO_CTR_CODE', 'INIT_OPO_CTR_CODE', 'END_OPO_CTR_CODE', 'LISTING_CTR_CODE', 'FORM_STATUS', 
        'FORM_VALID_DT', 'TX_YEAR', 'PT_CODE', 'SHARE_TY',
        # Newly added columns to drop
        'RA1', 'RA2', 'RB1', 'RB2', 'RDR1', 'RDR2', 'CPRA_TX', 'END_CPRA', 'ABO_MAT', 
        'HLAMIS', 'AMIS', 'BMIS', 'DRMIS', 'COMPOSITE_DEATH_DATE', 'HEMOGLOBIN_A1C', 
        'DONOR_CROSSMATCH', 'LIFE_SUP_TRR', 'MED_COND_TRR', 'TX_DATE', 'RECOVERY_DATE_DON', 'ADMISSION_DATE', 'DISCHARGE_DATE', 'PX_STAT'
    ]

    # In-place drop: raises KeyError if any listed column is absent from df.
    df.drop(columns=columns_to_drop, inplace=True)

    # The raw file uses "." as its missing-value marker.
    df.replace(".", np.nan, inplace=True)

    df['PTIME'] = pd.to_numeric(df['PTIME'], errors='coerce')
    # 1825 days = 5 * 365. A NaN PTIME compares as False, so rows with a missing
    # PTIME are flagged False here rather than missing; they are NOT removed by
    # this function. NOTE(review): drop or mask those rows before using '5_year'.
    df['5_year'] = df['PTIME'] >= 1825

    float32_columns = [
        'BMI_RECIP', 'HGT_CM_TRR', 'WGT_KG_TRR', 'PTIME', 'DISTANCE', 'BMI_DON_CALC', 
        'WGT_KG_DON_CALC', 'HGT_CM_DON_CALC'
    ]

    float16_columns = ['AGE', 'AGE_DON', 'SERUM_CREAT']

    # Convert column types
    df[float32_columns] = df[float32_columns].astype('float32')
    df[float16_columns] = df[float16_columns].astype('float16')

    # Coerce DA1, DA2, DB1, DB2, DDR1, DDR2 to numeric (unparseable values become NaN),
    # then fill the gaps with each column's most frequent value.
    float_columns = ['DA1', 'DA2', 'DB1', 'DB2', 'DDR1', 'DDR2']
    df[float_columns] = df[float_columns].apply(pd.to_numeric, errors='coerce', downcast='integer')
    float_imputer = SimpleImputer(strategy='most_frequent')
    df[float_columns] = float_imputer.fit_transform(df[float_columns])

    # Identify columns for KNN imputation
    continuous_columns = ['HGT_CM_TRR', 'WGT_KG_TRR', 'AGE', 'WGT_KG_DON_CALC', 'HGT_CM_DON_CALC', 'SERUM_CREAT']
    categorical_columns = ['ABO', 'ABO_DON']

    # Impute categorical variables with the most frequent value
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])

    # Apply KNN Imputation on continuous variables (neighbours are found using
    # only these six columns, on their raw, unscaled values)
    continuous_imputer = KNNImputer(n_neighbors=5)
    df[continuous_columns] = continuous_imputer.fit_transform(df[continuous_columns])

    # Back-fill missing BMI as kg / m^2 from the (now imputed) weight and height.
    df['BMI_RECIP'] = df['BMI_RECIP'].fillna(df['WGT_KG_TRR'] / ((df['HGT_CM_TRR'] / 100) ** 2))
    df['BMI_DON_CALC'] = df['BMI_DON_CALC'].fillna(df['WGT_KG_DON_CALC'] / ((df['HGT_CM_DON_CALC'] / 100) ** 2))


    df.loc[df['ETHCAT'] == 998, 'ETHCAT'] = 3  # Replaces 998 (the unknown variable) with 3 for simplicity
    df = df.fillna({'DIAG': '9'})  # Fills the diagnosis variable's missing values with "9" meaning none
    df = df.fillna({'DISTANCE': df['DISTANCE'].median()})  # Fills missing values in distance with the median
    df = df.fillna({'AGE': df['AGE'].median()})  # Median fallback; AGE was already KNN-imputed above, so this is normally a no-op
    df.loc[df['INO_PROCURE_AGENT_1'] == '999', 'INO_PROCURE_AGENT_1'] = '7'  # Replaces the 999 values with 7
    df.loc[df['INO_PROCURE_AGENT_2'] == '999', 'INO_PROCURE_AGENT_2'] = '7'  # Replaces the 999 values with 7
    # Forward-fill: each missing donor gender takes the previous row's value.
    # Missing values at the very top of the frame have no predecessor and stay NaN.
    df = df.fillna({'GENDER_DON': df['GENDER_DON'].ffill()})
    df = df.fillna({'ETHCAT_DON': df['ETHCAT_DON'].mode()[0]})  # Fills missing values with the mode
    df = df.fillna({'CMV_STATUS': '4'})  # Fills missing values in CMV_STATUS with 4
    df = df.fillna({'EBV_SEROSTATUS': '4'})  # Fills missing values in EBV_SEROSTATUS with 4
    df = df.fillna({'AGE_DON': df['AGE_DON'].median()})  # Fills missing values with the median
    
    # Missing values in these columns are filled with the code 'U'.
    fill_u_columns = [
        'ARGININE_DON', 'INSULIN_DON', 'TATTOOS_DON', 'PROTEIN_URINE_DON', 
        'INOTROP_SUPPORT_DON', 'TOLER_IND_TECH', 'PRE_TX_TXFUS', 'OTH_RISK_FACTORS'
    ]
    df.fillna({col: 'U' for col in fill_u_columns}, inplace=True)

    category_columns = [
        'ABO', 'ETHCAT', 'GENDER', 'DIAG', 
        'DON_TY', 'MULTIORG', 'TXINT', 'TXKID', 'TXLIV', 'TXPAN', 'TXVCA', 'MULTIVCA', 
        'PREV_TX_ANY', 'VCA_TY', 'PSTATUS', 'INO_PROCURE_AGENT_1', 'INO_PROCURE_AGENT_2',
        'ARGININE_DON', 'INSULIN_DON', 'TATTOOS_DON', 'PROTEIN_URINE_DON', 
        'CARDARREST_POSTNEURO_DON', 'INOTROP_SUPPORT_DON', 'ABO_DON', 'GENDER_DON', 
        'ETHCAT_DON', 'CMV_STATUS', 'EBV_SEROSTATUS', 'TOLER_IND_TECH', 'PRE_TX_TXFUS', 
        'OTH_RISK_FACTORS', 'EXTRA_ALLOGRAFT'
    ]
    df[category_columns] = df[category_columns].astype('category')

    # Ensure 'N' is a valid category before filling
    # (filling a categorical column with a value outside its categories raises)
    columns_to_fill_N = ['MULTIORG', 'TXINT', 'TXKID', 'TXLIV', 'TXPAN', 'TXVCA', 'MULTIVCA', 'CARDARREST_POSTNEURO_DON', 'EXTRA_ALLOGRAFT']
    for col in columns_to_fill_N:
        if 'N' not in df[col].cat.categories:
            df[col] = df[col].cat.add_categories(['N'])

    # Fill missing values with 'N' for the specified columns
    df.fillna({col: 'N' for col in columns_to_fill_N}, inplace=True)

    # Ensure '0' is a valid category before filling
    columns_to_fill_zero = ['INO_PROCURE_AGENT_1', 'INO_PROCURE_AGENT_2']
    for col in columns_to_fill_zero:
        if '0' not in df[col].cat.categories:
            df[col] = df[col].cat.add_categories(['0'])

    # Fill missing values with '0' for the specified columns
    df.fillna({col: '0' for col in columns_to_fill_zero}, inplace=True)

    # Binary encoding for Y/N variables: 'Y' -> True, anything else (including NaN) -> False.
    # DataFrame.map is the element-wise API introduced in pandas 2.1 (formerly applymap).
    binary_columns_yn = [
        'MULTIORG', 'TXVCA', 'MULTIVCA', 'PREV_TX_ANY', 'CARDARREST_POSTNEURO_DON', 'EXTRA_ALLOGRAFT'
    ]
    df[binary_columns_yn] = df[binary_columns_yn].map(lambda x: True if x == 'Y' else False)

    # Binary encoding for special binary variables.
    # Series.map leaves NaN for any value that is not a key of the mapping.
    binary_special_mappings = {
        'DON_TY': {'C': True, 'L': False},
        'TXINT': {'W': True, 'N': False},
        'TXKID': {'E': True, 'N': False},
        'TXLIV': {'W': True, 'N': False},
        'TXPAN': {'W': True, 'N': False},
        'PSTATUS': {1: True, 0: False},
        'GENDER_DON': {'M': True, 'F': False},
        'GENDER': {'M': True, 'F': False}
    }

    for col, mapping in binary_special_mappings.items():
        df[col] = df[col].map(mapping)

    # One-hot encoding for multi-category columns (every level kept: drop_first=False)
    one_hot_columns = [
        'ABO', 'ETHCAT', 'DIAG', 'VCA_TY', 
        'INO_PROCURE_AGENT_1', 'INO_PROCURE_AGENT_2', 'ARGININE_DON', 'INSULIN_DON', 
        'TATTOOS_DON', 'PROTEIN_URINE_DON', 'INOTROP_SUPPORT_DON', 'ABO_DON', 'ETHCAT_DON', 
        'CMV_STATUS', 'EBV_SEROSTATUS', 'TOLER_IND_TECH', 'PRE_TX_TXFUS', 'OTH_RISK_FACTORS'
    ]

    df = pd.get_dummies(df, columns=one_hot_columns, drop_first=False)


    return df

# Apply the cleaning function to a copy of the DataFrame: clean_data drops
# columns in place, so passing a copy keeps the raw `df` intact.
df_clean = clean_data(df.copy())
df_clean.head()

In [387]:
# Load the rejection labels; the path is relative to the current working directory.
rej = pd.read_csv("REJECTIONS.csv")

In [388]:
def clean_data(rej):
    """Return a copy of ``rej`` whose 'TRR ID' column is named 'TRR_ID_CODE'.

    The input frame is left untouched; a frame without a 'TRR ID' column
    comes back with its columns unchanged.
    """
    id_column_rename = {'TRR ID': 'TRR_ID_CODE'}
    renamed = rej.rename(columns=id_column_rename)
    return renamed

# Rename the ID column so it matches the merge key used with the cleaned data.
rej = clean_data(rej.copy())
rej.head()

Unnamed: 0,TRR_ID_CODE,ORGAN,DID_REJECT
0,A795333,Scalp,N
1,A795335,Upper Limb Right,Y
2,A795337,Uterus,Y
3,A795340,Abdominal Wall,N
4,A795344,Penis,Y


In [389]:
# Attach the rejection label to the cleaned features. how="right" keeps every
# row of `rej`: a rejection record whose TRR_ID_CODE has no match in df_clean
# ends up with NaN in all feature columns.
# NOTE(review): confirm all IDs match, or consider how="inner".
df_merged = df_clean.merge(rej[["TRR_ID_CODE", "DID_REJECT"]], on="TRR_ID_CODE", how="right")

In [390]:
# The merge key is an identifier, not a predictor, so remove it from the frame.
df_merged = df_merged.drop(columns=["TRR_ID_CODE"])

In [391]:
def clean_data(df_merged):
    """Recode the 'DID_REJECT' column from Y/N text to "1"/"0" text.

    The column is rewritten on the frame that was passed in, and that same
    frame is returned. The values stay strings; casting to an integer dtype
    is left to the caller. Missing values remain missing.
    """
    # Same two literal (non-regex) replacements, applied in the same order.
    for flag, digit in (("Y", "1"), ("N", "0")):
        df_merged['DID_REJECT'] = df_merged['DID_REJECT'].str.replace(flag, digit, case=False, regex=False)
    return df_merged

# Recode the target on a copy, then cast every listed feature column to float64.
df_merged = clean_data(df_merged.copy())
df_merged.head()

# astype raises KeyError if any listed column is missing from the frame.
df_merged = df_merged.astype(dict.fromkeys([
    'GENDER', 'BMI_RECIP', 'HGT_CM_TRR', 'WGT_KG_TRR', 'AGE',
    'DA1', 'DA2', 'DB1', 'DB2', 'DDR1', 'DDR2',
    'DON_TY', 'MULTIORG', 'TXINT', 'TXKID', 'TXLIV', 'TXPAN', 'TXVCA', 'MULTIVCA',
    'PREV_TX_ANY', 'PSTATUS', 'PTIME', 'DISTANCE', 'BMI_DON_CALC',
    'WGT_KG_DON_CALC', 'HGT_CM_DON_CALC', 'CARDARREST_POSTNEURO_DON', 'GENDER_DON',
    'AGE_DON', 'SERUM_CREAT', 'EXTRA_ALLOGRAFT', '5_year',
    'ABO_A', 'ABO_AB', 'ABO_B', 'ABO_O',
    'ETHCAT_1', 'ETHCAT_2', 'ETHCAT_3', 'ETHCAT_4', 'ETHCAT_5', 'ETHCAT_7', 'ETHCAT_9',
    'DIAG_1', 'DIAG_2', 'DIAG_3', 'DIAG_4', 'DIAG_5', 'DIAG_6', 'DIAG_8', 'DIAG_9',
    'VCA_TY_Abdominal Wall', 'VCA_TY_Face', 'VCA_TY_Larynx', 'VCA_TY_Penis', 'VCA_TY_Scalp',
    'VCA_TY_Trachea', 'VCA_TY_Upper Limb Bilateral', 'VCA_TY_Upper Limb Left',
    'VCA_TY_Upper Limb Right', 'VCA_TY_Uterus',
    'INO_PROCURE_AGENT_1_1', 'INO_PROCURE_AGENT_1_2', 'INO_PROCURE_AGENT_1_3',
    'INO_PROCURE_AGENT_1_4', 'INO_PROCURE_AGENT_1_5', 'INO_PROCURE_AGENT_1_7',
    'INO_PROCURE_AGENT_1_0',
    'INO_PROCURE_AGENT_2_1', 'INO_PROCURE_AGENT_2_4', 'INO_PROCURE_AGENT_2_5',
    'INO_PROCURE_AGENT_2_7', 'INO_PROCURE_AGENT_2_0',
    'ARGININE_DON_N', 'ARGININE_DON_U', 'ARGININE_DON_Y',
    'INSULIN_DON_N', 'INSULIN_DON_U', 'INSULIN_DON_Y',
    'TATTOOS_DON_N', 'TATTOOS_DON_U', 'TATTOOS_DON_Y',
    'PROTEIN_URINE_DON_N', 'PROTEIN_URINE_DON_U', 'PROTEIN_URINE_DON_Y',
    'INOTROP_SUPPORT_DON_N', 'INOTROP_SUPPORT_DON_U', 'INOTROP_SUPPORT_DON_Y',
    'ABO_DON_A', 'ABO_DON_A1', 'ABO_DON_A2', 'ABO_DON_AB', 'ABO_DON_B', 'ABO_DON_O',
    'ETHCAT_DON_1', 'ETHCAT_DON_2', 'ETHCAT_DON_4',
    'CMV_STATUS_1', 'CMV_STATUS_2', 'CMV_STATUS_4',
    'EBV_SEROSTATUS_1', 'EBV_SEROSTATUS_2', 'EBV_SEROSTATUS_4',
    'TOLER_IND_TECH_N', 'TOLER_IND_TECH_U', 'TOLER_IND_TECH_Y',
    'PRE_TX_TXFUS_N', 'PRE_TX_TXFUS_U', 'PRE_TX_TXFUS_Y',
    'OTH_RISK_FACTORS_N', 'OTH_RISK_FACTORS_U', 'OTH_RISK_FACTORS_Y',
], 'float64'))

In [392]:
# Separate the target: pop removes 'DID_REJECT' from the feature frame and the
# "1"/"0" strings are cast to integers.
result = df_merged.pop("DID_REJECT").astype("int64")

# Remove columns that are not used as predictors.
# NOTE(review): presumably PTIME / 5_year / PSTATUS are outcome-related and the
# height/weight columns are already summarised by the BMI columns; confirm.
df_merged = df_merged.drop(
    columns=[
        'PTIME',
        '5_year',
        'HGT_CM_TRR',
        'WGT_KG_TRR',
        'HGT_CM_DON_CALC',
        'WGT_KG_DON_CALC',
        'PSTATUS',
    ]
)

## Recursive feature elimination with Random Forests

In [398]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
# cross_val_score and roc_auc_score are used below; import them here so this
# cell does not depend on a later cell having been executed first.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score

# Hold out 30% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(df_merged, result, test_size=0.3, random_state=42)

# Initialize Random Forest model.
# max_leaf_nodes=2 limits every tree to a single split, so max_depth=5 is never reached.
rf = RandomForestClassifier(n_estimators=100, max_depth=5, max_leaf_nodes=2, random_state=42)

# Initialize RFE with the Random Forest as the estimator
rfe = RFE(estimator=rf, n_features_to_select=25)  # You can specify the number of features to select

# Fit RFE on the training split only
rfe.fit(X_train, y_train)

# Check which features were selected
selected_features = X_train.columns[rfe.support_]
print(f"Selected features: {selected_features}")

# Transform the dataset to the selected features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Train the Random Forest on the selected features
rf.fit(X_train_rfe, y_train)

# Predict on the test set
y_pred = rf.predict(X_test_rfe)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with selected features: {accuracy}")

# NOTE(review): the features were selected using all of X_train, so these
# cross-validation folds have already influenced the selection (optimistic estimate).
cv_scores = cross_val_score(rf, X_train_rfe, y_train, cv=5, scoring='accuracy')
print(f"5-Fold Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

print(rf.score(X_train_rfe, y_train))
print(rf.score(X_test_rfe, y_test))

preds = rf.predict(X_test_rfe)

# NOTE(review): the AUC is computed from hard 0/1 predictions, not from
# predict_proba scores, so it reflects a single operating point only.
roc_auc = roc_auc_score(y_test, preds)
print(f"ROC_AUC: {roc_auc}")


Selected features: Index(['DON_TY', 'DISTANCE', 'AGE_DON', 'DIAG_1', 'DIAG_5',
       'VCA_TY_Upper Limb Left', 'VCA_TY_Uterus', 'TATTOOS_DON_N',
       'TATTOOS_DON_U', 'PROTEIN_URINE_DON_N', 'PROTEIN_URINE_DON_U',
       'INOTROP_SUPPORT_DON_N', 'ETHCAT_DON_1', 'ETHCAT_DON_4',
       'EBV_SEROSTATUS_2', 'EBV_SEROSTATUS_4', 'TOLER_IND_TECH_N',
       'TOLER_IND_TECH_U', 'TOLER_IND_TECH_Y', 'PRE_TX_TXFUS_N',
       'PRE_TX_TXFUS_U', 'PRE_TX_TXFUS_Y', 'OTH_RISK_FACTORS_N',
       'OTH_RISK_FACTORS_U', 'OTH_RISK_FACTORS_Y'],
      dtype='object')
Accuracy with selected features: 0.6428571428571429
5-Fold Cross-Validation Accuracy: 0.69 ± 0.10
0.7230769230769231
0.6428571428571429
ROC_AUC: 0.65625


## Random Forests on its own (no feature selection)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import LeaveOneOut, cross_val_score, train_test_split

# Same 70/30 split and seed as the other modelling cells.
X_train, X_test, y_train, y_test = train_test_split(
    df_merged, result, test_size=0.3, random_state=42
)

# max_leaf_nodes=2 limits every tree to a single split, so max_depth=6 is never reached.
rsf = RandomForestClassifier(
    n_estimators=500,
    max_depth=6,
    max_leaf_nodes=2,
    random_state=42,
)
rsf.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out split.
print(rsf.score(X_train, y_train))
print(rsf.score(X_test, y_test))

# AUC is computed from the hard class predictions.
preds = rsf.predict(X_test)
roc_auc = roc_auc_score(y_test, preds)
print(f"ROC_AUC: {roc_auc}")

cv_scores = cross_val_score(rsf, X_train, y_train, cv=5, scoring='accuracy')
print(f"5-Fold Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")



## Boruta feature selection (did not work)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.model_selection import train_test_split

# Same 70/30 split and seed as the other modelling cells.
X_train, X_test, y_train, y_test = train_test_split(df_merged, result, test_size=0.3, random_state=42)

# Initialize Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500, random_state=42)

# Initialize Boruta feature selection method.
# NOTE(review): with n_estimators='auto' BorutaPy appears to reset n_estimators
# on the estimator it is given, so `rf` may no longer have 500 trees afterwards; verify.
boruta_selector = BorutaPy(rf, n_estimators='auto', random_state=42)

# Fit Boruta on the training data (plain numpy arrays for both X and y)
boruta_selector.fit(X_train.values, y_train.values)

# Check selected features
selected_features = X_train.columns[boruta_selector.support_].tolist()
print(f"Selected Features: {selected_features}")

# ranking_ holds one rank per feature (1 = confirmed), so it must not be used
# as a positional index into the columns. Order the feature names by rank instead;
# the stable sort keeps the original column order among equally ranked features.
ranking_features = X_train.columns[np.argsort(boruta_selector.ranking_, kind="stable")].tolist()
print(f"Ranking of Features: {ranking_features}")

# Transform training data to only include selected features
X_train_boruta = boruta_selector.transform(X_train.values)
X_test_boruta = boruta_selector.transform(X_test.values)

if selected_features:
    # Fit Random Forest on the selected features
    rf.fit(X_train_boruta, y_train)

    # Evaluate the model
    accuracy = rf.score(X_test_boruta, y_test)
    print(f"Accuracy after Boruta Feature Selection: {accuracy}")
else:
    # A forest cannot be fitted on zero columns; report instead of crashing.
    print("Boruta confirmed no features; skipping the Random Forest fit.")


## Feature importance for random forests

In [None]:
# Importances come from `rsf`, the forest fitted on the full feature set, so
# they line up one-to-one with the columns of df_merged.
feature_names = df_merged.columns
importances = rsf.feature_importances_

# One row per feature, most important first
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Relative Variable Importance (RVI): each importance as a share of the total
feature_importance_df['RVI'] = feature_importance_df['Importance'].div(
    feature_importance_df['Importance'].sum()
)

# Running total of RVI down the sorted table
feature_importance_df['Cumulative RVI'] = feature_importance_df['RVI'].cumsum()

# Re-sort on RVI, highest first
feature_importance_df = feature_importance_df.sort_values(by='RVI', ascending=False)

# Show the 30 top contributing variables
print(feature_importance_df.head(30))


## Optuna for random forests

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated ROC AUC of a Random Forest.

    Hyperparameters are sampled from ``trial``; the model is evaluated on the
    notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The mean ROC AUC across the 5 folds (higher is better).
    """
    # Dict entries are evaluated top to bottom, so the suggest_* call order is fixed.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 20),
    }

    # Seed the forest so that differences between trials come from the
    # hyperparameters rather than from run-to-run randomness.
    clf = RandomForestClassifier(random_state=42, **params)

    score = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=5, scoring='roc_auc').mean()
    return score

# Search for the hyperparameters that maximise the objective's return value
# (mean 5-fold cross-validated ROC AUC) over 200 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# [I 2024-10-07 14:31:09,420] Trial 56 finished with value: 0.6923076923076923 and parameters: {'n_estimators': 262, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 15, 'max_features': 'log2', 'bootstrap': False, 'max_leaf_nodes': 6}. Best is trial 56 with value: 0.6923076923076923.

# [I 2024-10-07 14:43:13,706] Trial 77 finished with value: 0.7523809523809524 and parameters: {'n_estimators': 450, 'max_depth': 27, 'min_samples_split': 15, 'min_samples_leaf': 16, 'max_features': 'sqrt', 'bootstrap': False, 'max_leaf_nodes': 5}. Best is trial 77 with value: 0.7523809523809524.


## Optuna for XGB

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of an XGBoost classifier.

    Hyperparameters are sampled from ``trial``; the model is evaluated on the
    notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The mean accuracy across the 5 folds (higher is better).
    """
    # Hyperparameter suggestions
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 30)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    gamma = trial.suggest_float('gamma', 0, 5)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    scale_pos_weight = trial.suggest_float('scale_pos_weight', 1.0, 10.0)

    # Define the classifier with the hyperparameters
    clf = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        gamma=gamma,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        scale_pos_weight=scale_pos_weight,
        # Row/column subsampling is random: seed it so that differences between
        # trials come from the hyperparameters, not from run-to-run noise.
        random_state=42,
        use_label_encoder=False,  # To suppress warnings from XGB
        eval_metric='logloss'     # To suppress eval_metric warning
    )

    # The score being maximised is accuracy, averaged over 5-fold cross-validation
    score = cross_val_score(clf, X_train, y_train, scoring='accuracy', n_jobs=-1, cv=5).mean()
    return score

# Run the search: each trial's value is whatever objective() returns
# (mean 5-fold cross-validated accuracy), and the study maximises it.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

# Report the best trial found
print("Best trial:")
trial = study.best_trial

print(f"Value: {trial.value}")
print(f"Best hyperparameters: {trial.params}")

# Best hyperparameters: {'n_estimators': 666, 'max_depth': 7, 'learning_rate': 0.20371850795302843, 'gamma': 3.821025018002871, 'min_child_weight': 5, 'subsample': 0.7209854517618058, 'colsample_bytree': 0.5587085908542856, 'scale_pos_weight': 1.0009047312384416}

## XGBoost (not working)

In [386]:
# Hyperparameters reported by the Optuna search, kept here for reference.
# NOTE(review): this dict is never passed to the model below, which uses
# hand-picked values instead.
Best_hyperparameters =  {'n_estimators': 666, 'max_depth': 7, 'learning_rate': 0.20371850795302843, 'gamma': 3.821025018002871, 
                         'min_child_weight': 5, 'subsample': 0.7209854517618058, 'colsample_bytree': 0.5587085908542856, 
                         'scale_pos_weight': 1.0009047312384416}


xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=7,
  # NOTE(review): an L2 regularisation term was planned here but none is passed
)


# Train and evaluate on the RFE-selected features (X_train_rfe / X_test_rfe
# come from the recursive-feature-elimination cell).
xgb_model.fit(X_train_rfe, y_train)

# Bare expression: the value is computed but neither stored nor printed here
xgb_model.score(X_test_rfe, y_test)

# Mean 5-fold cross-validated accuracy on the training split
score = cross_val_score(xgb_model, X_train_rfe, y_train, scoring='accuracy', n_jobs=-1, cv=5).mean()

print(score)

# Accuracy on the training split, then on the held-out split
print(xgb_model.score(X_train_rfe, y_train))
print(xgb_model.score(X_test_rfe, y_test))

preds = xgb_model.predict(X_test_rfe)

# AUC is computed from the hard class predictions
roc_auc = roc_auc_score(y_test, preds)
print(f"ROC_AUC: {roc_auc}")


0.6307692307692307
0.8461538461538461
0.6428571428571429
ROC_AUC: 0.6458333333333333


## Logistic regression time !!!

In [411]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the RFE-selected features
LR = LogisticRegression(max_iter=1000, random_state=42)
LR.fit(X_train_rfe, y_train)

# Accuracy on the training split, then on the held-out split
print(LR.score(X_train_rfe, y_train))
print(LR.score(X_test_rfe, y_test))

# AUC is computed from the hard class predictions
preds = LR.predict(X_test_rfe)
roc_auc = roc_auc_score(y_test, preds)
print(f"ROC_AUC: {roc_auc}")

cv_scores = cross_val_score(
    LR, X_train_rfe, y_train, cv=5, scoring='accuracy'
)
print(f"5-Fold Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

0.7846153846153846
0.7142857142857143
ROC_AUC: 0.7291666666666667
5-Fold Cross-Validation Accuracy: 0.68 ± 0.09


## Stacking time !!!

In [420]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import SVC
# classification_report is used below, so import it in this cell as well
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, precision_score, recall_score, classification_report
from sklearn.model_selection import cross_validate, StratifiedKFold



# Initialize base models (same settings as the individual model cells)
rf = RandomForestClassifier(n_estimators=100, max_depth=5, max_leaf_nodes=2, random_state=42)
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.01, max_depth=7, random_state=42)
LR = LogisticRegression(max_iter=1000, random_state=42)
Support = SVC(kernel='linear', probability=True, random_state=42)

# Initialize StackingClassifier with the base models; a logistic regression
# combines their outputs
stacked_model = StackingClassifier(
    estimators=[
        ('SVC', Support),
        ('rf', rf),
        ('xgb', xgb_model),
        ('lr', LR)
    ],
    final_estimator=LogisticRegression(max_iter=1000)
)

# Fit the Stacking Classifier on the RFE-selected features
stacked_model.fit(X_train_rfe, y_train)

# Predictions on the test set
y_pred = stacked_model.predict(X_test_rfe)

# Accuracy
stacked_accuracy = accuracy_score(y_test, y_pred)
print(f"Stacked Model Accuracy: {stacked_accuracy:.2f}")

# ROC AUC Score (computed from the hard class predictions)
stacked_roc_auc = roc_auc_score(y_test, y_pred)
print(f"Stacked Model ROC_AUC: {stacked_roc_auc:.2f}")

# 3-Fold Cross-Validation Accuracy (the label now matches cv=3)
cv_scores = cross_val_score(stacked_model, X_train_rfe, y_train, cv=3, scoring='accuracy')
print(f"3-Fold Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

# Additional evaluation metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix; labels are fixed so the matrix is always 2x2, even if one
# class is absent from both y_test and y_pred
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Confusion Matrix:")
print(cm)

# Extracting values from the confusion matrix (row = true class, column = predicted)
TN, FP, FN, TP = cm.ravel()

# Sensitivity (Recall)
sensitivity = TP / (TP + FN)
print(f"Sensitivity (Recall): {sensitivity:.2f}")

# Specificity
specificity = TN / (TN + FP)
print(f"Specificity: {specificity:.2f}")

Stacked Model Accuracy: 0.71
Stacked Model ROC_AUC: 0.72
5-Fold Cross-Validation Accuracy: 0.69 ± 0.04
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.69      0.73        16
           1       0.64      0.75      0.69        12

    accuracy                           0.71        28
   macro avg       0.71      0.72      0.71        28
weighted avg       0.72      0.71      0.72        28

Confusion Matrix:
[[11  5]
 [ 3  9]]
Sensitivity (Recall): 0.75
Specificity: 0.69


## Models for imputing the missing PTIME values

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np


def evaluate_imputation_models(df, target_column='PTIME', top_features=20,
                               exclude_columns=('5_year', 'TRR_ID_CODE')):
    """Compare three regressors as candidates for imputing ``target_column``.

    Only rows with a known target are used. Features are min-max scaled, the
    ``top_features`` most important ones (per a Random Forest) are kept, and a
    Random Forest, an XGBoost regressor and an MLP are each trained on 80% of
    those rows and scored by MAPE on the remaining 20%.

    This function only evaluates: it prints the scores and the best model's
    name, and does not fill in any missing value.

    Args:
        df: cleaned DataFrame containing ``target_column``.
        target_column: column whose imputation is being evaluated.
        top_features: number of highest-importance features to keep.
        exclude_columns: columns removed from the predictors when present.
            '5_year' is computed from PTIME, so keeping it would leak the
            target; 'TRR_ID_CODE' is a record identifier.

    Returns:
        ``df``, unchanged.

    Raises:
        ValueError: if no row has a known ``target_column`` value.
    """
    # Step 1: keep only the rows whose target is known
    df_not_missing = df[df[target_column].notna()]
    if df_not_missing.empty:
        raise ValueError(f"No rows with a known '{target_column}' value to train on")

    # Step 2: separate features and target, leaving out leaky / identifier columns
    excluded = [col for col in exclude_columns if col in df_not_missing.columns]
    X_train = df_not_missing.drop(columns=[target_column] + excluded)
    y_train = df_not_missing[target_column]

    # Step 3: feature scaling (MinMaxScaler)
    # NOTE(review): the scaler needs every remaining column to be numeric; verify
    # for the category-typed columns produced by the cleaning step.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Step 4: train a Random Forest to determine feature importance
    rf_selector_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_selector_model.fit(X_train_scaled, y_train)

    # threshold=-np.inf disables the default mean-importance cut-off, so the
    # selection is driven by max_features alone (i.e. exactly the top N).
    selector = SelectFromModel(rf_selector_model, threshold=-np.inf,
                               max_features=top_features, prefit=True)

    # Names of the selected features
    selected_features = X_train.columns[selector.get_support()]

    print(f"Top {len(selected_features)} selected features:")
    print(selected_features)

    # Keep only the selected columns of the scaled training set
    X_train_scaled_selected = selector.transform(X_train_scaled)

    # Split train data for validation purposes
    # NOTE(review): scaling and feature selection were fitted on all rows,
    # including the validation rows, so the validation scores are optimistic.
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train_scaled_selected, y_train, test_size=0.2, random_state=42
    )

    # Step 5: train and validate each candidate on the selected features.
    # Keys are the names reported as "best model"; labels are used in the printout.
    candidate_models = {
        'RandomForest': ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
        'XGBoost': ('XGBoost', XGBRegressor(n_estimators=100, random_state=42)),
        'MLP': ('MLP', MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42)),
    }

    validation_mape = {}
    for model_name, (label, model) in candidate_models.items():
        model.fit(X_train_split, y_train_split)
        val_predictions = model.predict(X_val_split)
        # NOTE(review): MAPE becomes extremely large when a true value is 0
        validation_mape[model_name] = mean_absolute_percentage_error(y_val_split, val_predictions)
        print("{} Validation MAPE: {:.2f}%".format(label, validation_mape[model_name] * 100))

    # Step 6: choose the model with the lowest validation MAPE
    best_model_name = min(validation_mape, key=validation_mape.get)

    print(f"The best model based on validation MAPE is: {best_model_name}")

    return df

# Apply the function to the cleaned DataFrame, keeping the top 20 features
evaluate_imputation_models(df_clean, top_features=20)