In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

In [2]:
# Read the appointment records from the CSV file into a DataFrame.
df = pd.read_csv("datasets/KaggleV2-May-2016.csv")

# Show a five-row preview; the extra "\n" argument leaves a blank line after it.
head_preview = df.head()
print("First 5 rows of the dataset:")
print(head_preview, "\n")

First 5 rows of the dataset:
      PatientId  AppointmentID Gender          ScheduledDay  \
0  2.987250e+13        5642903      F  2016-04-29T18:38:08Z   
1  5.589978e+14        5642503      M  2016-04-29T16:08:27Z   
2  4.262962e+12        5642549      F  2016-04-29T16:19:04Z   
3  8.679512e+11        5642828      F  2016-04-29T17:29:31Z   
4  8.841186e+12        5642494      F  2016-04-29T16:07:23Z   

         AppointmentDay  Age      Neighbourhood  Scholarship  Hipertension  \
0  2016-04-29T00:00:00Z   62    JARDIM DA PENHA            0             1   
1  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             0   
2  2016-04-29T00:00:00Z   62      MATA DA PRAIA            0             0   
3  2016-04-29T00:00:00Z    8  PONTAL DE CAMBURI            0             0   
4  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             1   

   Diabetes  Alcoholism  Handcap  SMS_received No-show  
0         0           0        0             0      No  
1        

In [3]:
# Tally the null entries of every column before any rows are removed.
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column, "\n")

# Discard every row that has at least one missing value (dropna's defaults,
# spelled out explicitly here).
df = df.dropna(axis=0, how="any")
print("Shape after dropping missing values:", df.shape, "\n")

Missing values per column:
PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64 

Shape after dropping missing values: (110527, 14) 



In [4]:
# Columns used as model inputs.
feature_cols = ["Gender", "Age", "Scholarship", "Hipertension", "Diabetes", "Alcoholism", "Handcap", "SMS_received"]

# Encode the target as 0/1 only while the column still holds the raw "No"/"Yes"
# labels. Series.map returns NaN for values missing from the mapping, so
# re-running this cell on an already-encoded column would otherwise wipe out
# the target.
if pd.api.types.is_numeric_dtype(df["No-show"]):
    target_encoded = df["No-show"]
else:
    target_encoded = df["No-show"].map({"No": 0, "Yes": 1})

# Fail loudly instead of carrying NaN targets into the split and the models
# (covers both unexpected labels and a column corrupted by an earlier re-run).
if target_encoded.isna().any():
    raise ValueError(
        "'No-show' contains values other than 'No'/'Yes' (or was already "
        "mapped to NaN); reload the dataset before running this cell."
    )
df["No-show"] = target_encoded

# Independent copies so later changes to X / y do not touch df.
X = df[feature_cols].copy()
y = df["No-show"].copy()
print("Feature sample:")
print(X.head(), "\n")
print("Target sample:")
print(y.head(), "\n")

Feature sample:
  Gender  Age  Scholarship  Hipertension  Diabetes  Alcoholism  Handcap  \
0      F   62            0             1         0           0        0   
1      M   56            0             0         0           0        0   
2      F   62            0             0         0           0        0   
3      F    8            0             0         0           0        0   
4      F   56            0             1         1           0        0   

   SMS_received  
0             0  
1             0  
2             0  
3             0  
4             0   

Target sample:
0    0
1    0
2    0
3    0
4    0
Name: No-show, dtype: int64 



In [5]:
# Column groups handled by the preprocessing step.
numeric_features = ["Age", "Scholarship", "Hipertension", "Diabetes", "Alcoholism", "Handcap", "SMS_received"]
categorical_features = ["Gender"]

# Standardise the numeric columns and one-hot encode Gender, dropping the
# first category of the encoded feature.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features),
    ]
)

In [6]:
# First split: hold out 20% of the rows, stratified on the target.
X_train_full, X_temp, y_train_full, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Second split: divide the held-out rows evenly into validation and test,
# again stratified on the target.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

print("Train shape:", X_train_full.shape)
print("Validation shape:", X_val.shape)
print("Test shape:", X_test.shape, "\n")

Train shape: (88421, 8)
Validation shape: (11053, 8)
Test shape: (11053, 8) 



In [None]:
# Validation sweep over SVC kernels; C and the seed are held fixed so only the
# kernel differs between runs.
kernels = ["linear", "rbf", "poly", "sigmoid"]
svm_val_results = {}
best_kernel = None
best_val_accuracy_svm = 0.0

for k in kernels:
    # Pipeline for this kernel: the `preprocessor` object followed by the SVC.
    svm_clf = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("model", SVC(kernel=k, C=1.0, random_state=42)),
        ]
    )
    svm_clf.fit(X_train_full, y_train_full)

    # Score on the validation split and record the result per kernel.
    y_val_pred_svm = svm_clf.predict(X_val)
    val_acc_svm = accuracy_score(y_val, y_val_pred_svm)
    svm_val_results[k] = val_acc_svm
    print(f"SVM - Kernel = {k}, Validation Accuracy = {val_acc_svm:.4f}")

    # Only a strictly better score replaces the current best, so on a tie the
    # kernel listed earlier stays selected.
    if val_acc_svm <= best_val_accuracy_svm:
        continue
    best_val_accuracy_svm = val_acc_svm
    best_kernel = k

print("\nBest SVM kernel based on validation:", best_kernel)
print("Best SVM validation accuracy:", best_val_accuracy_svm, "\n")

In [None]:
# Final SVM: refit on train + validation, then score once on the held-out
# test set.
X_train_final = pd.concat([X_train_full, X_val])
y_train_final = pd.concat([y_train_full, y_val])

# Use the kernel selected by the validation sweep instead of a hard-coded one,
# matching how the other final models use their selected hyper-parameter.
# best_kernel is still None only if no kernel scored above 0.0 in the sweep;
# in that case fall back to "linear", the value this cell used previously.
final_kernel = best_kernel if best_kernel is not None else "linear"

svm_final = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", SVC(kernel=final_kernel, C=1.0, random_state=42)),
    ]
)
svm_final.fit(X_train_final, y_train_final)

# Evaluate on the test split, which was not used for fitting or selection.
y_test_pred_svm = svm_final.predict(X_test)
test_acc_svm = accuracy_score(y_test, y_test_pred_svm)
cm_svm = confusion_matrix(y_test, y_test_pred_svm)

print("=== SVM Final Model (Test Set) ===")
print(f"Test Accuracy: {test_acc_svm:.4f}")
print("Confusion Matrix (rows = true, cols = predicted):")
print(cm_svm, "\n")

In [7]:
def build_dt_pipeline(criterion):
    """Return an unfitted pipeline: `preprocessor` followed by a decision tree.

    The tree uses the given split `criterion` and a fixed random_state of 42.
    """
    tree = DecisionTreeClassifier(criterion=criterion, random_state=42)
    return Pipeline(steps=[("preprocess", preprocessor), ("model", tree)])


# --- Validation sweep over the split criterion ---
criteria = ["gini", "entropy", "log_loss"]
results_dt = {}
best_criterion = None
best_val_accuracy = 0.0

for crit in criteria:
    # Fit on the training split only.
    dt_clf = build_dt_pipeline(crit)
    dt_clf.fit(X_train_full, y_train_full)

    # Score on the validation split and record the accuracy per criterion.
    y_val_pred = dt_clf.predict(X_val)
    val_acc = accuracy_score(y_val, y_val_pred)
    results_dt[crit] = val_acc

    print(f"Decision Tree - Criterion = {crit}, Validation Accuracy = {val_acc:.4f}")

    # Only a strictly better score replaces the current best, so on a tie the
    # criterion listed earlier stays selected.
    if val_acc <= best_val_accuracy:
        continue
    best_val_accuracy = val_acc
    best_criterion = crit

print("\nBest decision tree criterion based on validation:", best_criterion)
print("Best validation accuracy:", best_val_accuracy, "\n")

# --- Final model: refit on train + validation with the selected criterion ---
X_train_final = pd.concat([X_train_full, X_val])
y_train_final = pd.concat([y_train_full, y_val])

dt_final = build_dt_pipeline(best_criterion)
dt_final.fit(X_train_final, y_train_final)

# Score once on the held-out test split.
y_test_pred_dt = dt_final.predict(X_test)
test_acc_dt = accuracy_score(y_test, y_test_pred_dt)
cm_dt = confusion_matrix(y_test, y_test_pred_dt)

print("=== Decision Tree Final Model (Test Set) ===")
print(f"Test Accuracy: {test_acc_dt:.4f}")
print("Confusion Matrix (rows = true, cols = predicted):")
print(cm_dt, "\n")

Decision Tree - Criterion = gini, Validation Accuracy = 0.7946
Decision Tree - Criterion = entropy, Validation Accuracy = 0.7949
Decision Tree - Criterion = log_loss, Validation Accuracy = 0.7949

Best decision tree criterion based on validation: entropy
Best validation accuracy: 0.7948973129467113 

=== Decision Tree Final Model (Test Set) ===
Test Accuracy: 0.7953
Confusion Matrix (rows = true, cols = predicted):
[[8765   56]
 [2206   26]] 



In [8]:
# --- Validation sweep over the number of trees in the forest ---
n_estimators_list = [50, 100, 200, 500]
rf_val_results = {}

best_rf_n = None
best_rf_val_accuracy = 0.0

for n in n_estimators_list:
    # Pipeline for this forest size: `preprocessor` followed by the classifier.
    rf_clf = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(
            n_estimators=n,
            random_state=42,
            n_jobs=-1
        ))
    ])

    # Fit on the training split only.
    rf_clf.fit(X_train_full, y_train_full)

    # Score on the validation split and record the accuracy per forest size.
    y_val_pred_rf = rf_clf.predict(X_val)
    val_acc_rf = accuracy_score(y_val, y_val_pred_rf)
    rf_val_results[n] = val_acc_rf

    print(f"Random Forest - n_estimators = {n}, Validation Accuracy = {val_acc_rf:.4f}")

    # Only a strictly better score replaces the current best, so on a tie the
    # smaller forest listed earlier stays selected.
    if val_acc_rf > best_rf_val_accuracy:
        best_rf_val_accuracy = val_acc_rf
        best_rf_n = n

print("\nBest Random Forest n_estimators based on validation:", best_rf_n)
print("Best validation accuracy:", best_rf_val_accuracy, "\n")

# --- Final model: refit on train + validation with the selected forest size ---
# Build the combined training set here rather than relying on another model's
# cell having defined it, so this cell only needs the split cell to have run.
X_train_final = pd.concat([X_train_full, X_val])
y_train_final = pd.concat([y_train_full, y_val])

rf_final = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=best_rf_n,
        random_state=42,
        n_jobs=-1
    ))
])

rf_final.fit(X_train_final, y_train_final)

# Score once on the held-out test split.
y_test_pred_rf = rf_final.predict(X_test)
test_acc_rf = accuracy_score(y_test, y_test_pred_rf)
cm_rf = confusion_matrix(y_test, y_test_pred_rf)

print("=== Random Forest Final Model (Test Set) ===")
print(f"Test Accuracy: {test_acc_rf:.4f}")
print("Confusion Matrix (rows = true, cols = predicted):")
print(cm_rf, "\n")

Random Forest - n_estimators = 50, Validation Accuracy = 0.7934
Random Forest - n_estimators = 100, Validation Accuracy = 0.7940
Random Forest - n_estimators = 200, Validation Accuracy = 0.7940
Random Forest - n_estimators = 500, Validation Accuracy = 0.7943

Best Random Forest n_estimators based on validation: 500
Best validation accuracy: 0.7942640007237854 

=== Random Forest Final Model (Test Set) ===
Test Accuracy: 0.7952
Confusion Matrix (rows = true, cols = predicted):
[[8758   63]
 [2201   31]] 

