In [19]:
# First model SVM Classifier   
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the cleaned dataset.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists. Consider a path relative to the repository.
cleaned_dt = pd.read_excel('C:\\Users\\pc\\Documents\\GitHub\\pediatric-appendicitis-diagnosis\\data\\cleaned_data.xlsx', engine='openpyxl')

# Replace every missing cell with 0. A missing 'Diagnosis' therefore ends up
# as class 0 after the encoding below.
cleaned_dt = cleaned_dt.fillna(0)

# Encode the target as integers. The result is assigned back to the column:
# calling .replace(..., inplace=True) on the column selection acts on an
# intermediate object and stops modifying the frame in pandas 3.0.
# The explicit cast makes any unexpected label fail here, not later in fit().
diagnosis_codes = {'appendicitis' : 1, 'no appendicitis' : 0}
cleaned_dt['Diagnosis'] = cleaned_dt['Diagnosis'].replace(diagnosis_codes).astype('int64')

# Normalize column names: whitespace runs become "_", then every character
# outside [a-zA-Z0-9_] is removed.
cleaned_dt.columns = cleaned_dt.columns.str.replace(r'\s+', '_', regex=True)
cleaned_dt.columns = cleaned_dt.columns.str.replace(r'[^a-zA-Z0-9_]', '', regex=True)

# Separate the features from the target.
X = cleaned_dt.drop(columns=['Diagnosis'])
y = cleaned_dt['Diagnosis']

# One-hot encode the categorical columns; drop_first removes one level per
# column to avoid redundant indicator columns.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for evaluation, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only and
# then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_dt['Diagnosis'].replace({'appendicitis' : 1, 'no appendicitis' : 0}, inplace=True)
  cleaned_dt['Diagnosis'].replace({'appendicitis' : 1, 'no appendicitis' : 0}, inplace=True)


In [20]:
# Display the one-hot encoded feature matrix built in the previous cell
# (pandas abbreviates the rendering of large frames).
X

Unnamed: 0,Age,BMI,Height,Weight,Alvarado_Score,Paedriatic_Appendicitis_Score,Appendix_Diameter,Body_Temperature,WBC_Count,Neutrophil_Percentage,...,Loss_of_Appetite_yes,Dysuria_yes,Peritonitis_local,Peritonitis_no,Psoas_Sign_yes,Ipsilateral_Rebound_Tenderness_yes,US_Performed_yes,category_age_Entre 15 et 20 ans,category_age_Entre 5 et 10 ans,category_age_Moins de 5 ans
0,12.680000,16.900000,148.0,37.0,4.0,3.0,7.1,37.0,7.7,68.2,...,True,False,False,True,True,False,True,False,False,False
1,14.100000,31.900000,147.0,69.5,5.0,4.0,5.0,36.9,8.1,64.8,...,True,True,False,True,True,False,True,False,False,False
2,14.140000,23.300000,163.0,62.0,5.0,3.0,5.0,36.6,13.2,74.8,...,False,False,False,True,True,False,True,False,False,False
3,16.370000,20.600000,165.0,56.0,7.0,6.0,5.0,36.0,11.4,63.0,...,True,True,False,True,True,False,True,True,False,False
4,11.080000,16.900000,163.0,45.0,5.0,6.0,7.0,36.9,8.1,44.0,...,True,False,False,True,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776,12.413415,25.250476,166.5,70.0,8.0,7.0,7.5,39.4,11.4,76.6,...,True,False,True,False,True,False,True,False,False,False
777,17.092402,20.429418,158.0,51.0,5.0,3.0,8.8,37.8,17.4,89.2,...,True,False,True,False,False,False,True,True,False,False
778,14.992471,19.909972,152.0,46.0,5.0,3.0,8.2,37.3,14.6,68.5,...,False,True,False,True,False,False,True,False,False,False
779,7.195072,14.295549,129.3,23.9,9.0,8.0,14.0,37.5,17.8,77.0,...,False,False,True,False,False,False,True,False,True,False


In [21]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

# Train a support vector classifier on the standardized training features.
# probability=True is required so that predict_proba is available below.
svm_model = SVC(probability=True, random_state=42).fit(X_train_scaled, y_train)

# Predicted labels and the second column of predict_proba (the score used
# for ROC-AUC) on the standardized test split.
y_pred_svm = svm_model.predict(X_test_scaled)
svm_test_probabilities = svm_model.predict_proba(X_test_scaled)
y_pred_proba_svm = svm_test_probabilities[:, 1]

# Report per-class metrics followed by the ROC-AUC.
print("SVM Classification Report:")
svm_report = classification_report(y_test, y_pred_svm)
print(svm_report)
svm_auc = roc_auc_score(y_test, y_pred_proba_svm)
print("SVM ROC-AUC:", svm_auc)


SVM Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.92      0.91        65
           1       0.94      0.92      0.93        92

    accuracy                           0.92       157
   macro avg       0.92      0.92      0.92       157
weighted avg       0.92      0.92      0.92       157

SVM ROC-AUC: 0.9827759197324414


In [22]:
# Second model Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest on the unscaled training features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted labels and the second column of predict_proba (the score used
# for ROC-AUC) on the test split.
y_pred_rf = rf_model.predict(X_test)
rf_test_probabilities = rf_model.predict_proba(X_test)
y_pred_proba_rf = rf_test_probabilities[:, 1]

# Report per-class metrics followed by the ROC-AUC.
print("Random Forest Classification Report:")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)
rf_auc = roc_auc_score(y_test, y_pred_proba_rf)
print("Random Forest ROC-AUC:", rf_auc)



Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.92      0.94        65
           1       0.95      0.97      0.96        92

    accuracy                           0.95       157
   macro avg       0.95      0.95      0.95       157
weighted avg       0.95      0.95      0.95       157

Random Forest ROC-AUC: 0.9939799331103679


In [23]:
# Normalize the column labels in two passes: whitespace runs become "_",
# then every character outside [a-zA-Z0-9_] is dropped.
cleaned_dt.columns = (
    cleaned_dt.columns
    .str.replace(r'\s+', '_', regex=True)
    .str.replace(r'[^a-zA-Z0-9_]', '', regex=True)
)
print("Columns after cleaning:", cleaned_dt.columns)

Columns after cleaning: Index(['Age', 'BMI', 'Sex', 'Height', 'Weight', 'Diagnosis_Presumptive',
       'Diagnosis', 'Alvarado_Score', 'Paedriatic_Appendicitis_Score',
       'Appendix_on_US', 'Appendix_Diameter', 'Migratory_Pain',
       'Lower_Right_Abd_Pain', 'Contralateral_Rebound_Tenderness',
       'Coughing_Pain', 'Nausea', 'Loss_of_Appetite', 'Body_Temperature',
       'WBC_Count', 'Neutrophil_Percentage', 'RBC_Count', 'Hemoglobin', 'RDW',
       'Thrombocyte_Count', 'CRP', 'Dysuria', 'Peritonitis', 'Psoas_Sign',
       'Ipsilateral_Rebound_Tenderness', 'US_Performed', 'US_Number',
       'category_age'],
      dtype='object')


In [None]:
# Third model : LightGBM Classifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# NOTE: this cell label-encodes cleaned_dt in place and rebinds X, y and the
# train/test split, so those names no longer hold the one-hot encoded data
# used by the earlier models once this cell has run.

# Columns still stored as Python objects (text) need a numeric encoding.
categorical_columns = cleaned_dt.select_dtypes(include=['object']).columns

# Keep one fitted encoder per column so the integer codes can be mapped back.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    # Cast to str before encoding: a column that mixes text with numeric fill
    # values (e.g. the 0 written by an earlier fillna(0)) would otherwise make
    # LabelEncoder raise on mixed types.
    column_values = cleaned_dt[column].fillna('Unknown').astype(str)
    cleaned_dt[column] = encoder.fit_transform(column_values)
    label_encoders[column] = encoder

# Separate the features from the target.
X = cleaned_dt.drop(columns=['Diagnosis'])
y = cleaned_dt['Diagnosis']

# Hold out 20% of the rows for evaluation, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a gradient-boosted tree classifier with 100 boosting rounds.
lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
lgb_model.fit(X_train, y_train)

# Predicted labels and the second column of predict_proba (the score used
# for ROC-AUC) on the test split.
y_pred_lgb = lgb_model.predict(X_test)
y_pred_proba_lgb = lgb_model.predict_proba(X_test)[:, 1]

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred_lgb)
roc_auc = roc_auc_score(y_test, y_pred_proba_lgb)
precision = precision_score(y_test, y_pred_lgb)
recall = recall_score(y_test, y_pred_lgb)
f1 = f1_score(y_test, y_pred_lgb)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


[LightGBM] [Info] Number of positive: 372, number of negative: 252
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1776
[LightGBM] [Info] Number of data points in the train set: 624, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.596154 -> initscore=0.389465
[LightGBM] [Info] Start training from score 0.389465
Accuracy: 0.9618
ROC-AUC: 0.9957
Precision: 0.9674
Recall: 0.9674
F1-Score: 0.9674
