In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/noshowappointments/KaggleV2-May-2016.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the appointments CSV from the Kaggle input directory
df = pd.read_csv("/kaggle/input/noshowappointments/KaggleV2-May-2016.csv")

# Turn the 'Yes'/'No' target strings into 1/0
df['No-show'] = df['No-show'].map({'Yes': 1, 'No': 0})

# Remove the identifier and date columns, which are not used as features
df = df.drop(columns=['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay'])

# Integer-encode the string-valued columns; the single encoder is refit per column,
# so after the loop it holds the classes of the last column only
label_enc = LabelEncoder()
for column in ('Gender', 'Neighbourhood'):
    df[column] = label_enc.fit_transform(df[column])

# Separate the target from the feature matrix
y = df['No-show']
X = df.drop(columns='No-show')

# 70/30 split, stratified on the target so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


In [3]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from sklearn.utils.class_weight import compute_class_weight

# Resamplers under comparison; the random ones are seeded for repeatability.
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
tl = TomekLinks()
smote = SMOTE(random_state=42)

# Label -> sampler, in the order the techniques are fitted and later evaluated.
samplers = {
    'Random Oversampling': ros,
    'Random Undersampling': rus,
    'Tomek Links': tl,
    'SMOTE': smote,
}

# Resample the training split only; each value is the (X, y) pair from fit_resample.
sampling_methods = {
    label: sampler.fit_resample(X_train, y_train)
    for label, sampler in samplers.items()
}

# Keep the individual resampled sets available under their own names.
X_ros, y_ros = sampling_methods['Random Oversampling']
X_rus, y_rus = sampling_methods['Random Undersampling']
X_tl, y_tl = sampling_methods['Tomek Links']
X_smote, y_smote = sampling_methods['SMOTE']

# Class weights: an alternative that leaves the training data untouched.
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train,
)
class_weight_dict = {label: class_weights[label] for label in (0, 1)}


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def evaluate_tree(name, X_fit, y_fit, X_eval, y_eval, class_weight=None):
    """Fit a decision tree and score it on a held-out set.

    Parameters
    ----------
    name : str
        Label placed in the first position of the returned row.
    X_fit, y_fit
        Features and labels passed to ``fit``.
    X_eval, y_eval
        Features and true labels used for scoring.
    class_weight : dict or None, default None
        Forwarded to ``DecisionTreeClassifier``; ``None`` is the estimator's
        own default, so omitting it matches a plain ``DecisionTreeClassifier``.

    Returns
    -------
    list
        ``[name, accuracy, f1, roc_auc]``. Accuracy and F1 use the hard
        predictions; ROC AUC uses the second column of ``predict_proba``.
    """
    model = DecisionTreeClassifier(random_state=42, class_weight=class_weight)
    model.fit(X_fit, y_fit)
    predicted = model.predict(X_eval)
    scores = model.predict_proba(X_eval)[:, 1]
    return [
        name,
        accuracy_score(y_eval, predicted),
        f1_score(y_eval, predicted),
        roc_auc_score(y_eval, scores),
    ]


# Baseline: the untouched training split
results = [evaluate_tree('Original', X_train, y_train, X_test, y_test)]

# One model per resampled training set; all are scored on the same test split
for name, (X_samp, y_samp) in sampling_methods.items():
    results.append(evaluate_tree(name, X_samp, y_samp, X_test, y_test))

# Original training split again, with class weights instead of resampling
results.append(
    evaluate_tree('Class Weights', X_train, y_train, X_test, y_test,
                  class_weight=class_weight_dict)
)


In [6]:
# Put the collected metric rows into a table and print it, highest F1 score first.
metric_columns = ['Technique', 'Accuracy', 'F1 Score', 'AUC']
results_df = pd.DataFrame(data=results, columns=metric_columns)
ranked_by_f1 = results_df.sort_values('F1 Score', ascending=False)
print(ranked_by_f1)


              Technique  Accuracy  F1 Score       AUC
5         Class Weights  0.620887  0.330225  0.567960
2  Random Undersampling  0.583763  0.325547  0.561667
1   Random Oversampling  0.629362  0.319038  0.562439
4                 SMOTE  0.619440  0.308358  0.554837
3           Tomek Links  0.759160  0.197871  0.568272
0              Original  0.759221  0.197749  0.567772


## 📊 Performance Comparison of Sampling Techniques

### 🔍 Key Observations:

- **Class Weights** achieved the **best F1 Score (0.3302)** and a competitive AUC (0.5680), making it the most balanced approach for handling class imbalance.
- **Random Undersampling (0.3255 F1 Score)** performed slightly worse, and it discards majority-class samples, which may not be ideal.
- **Random Oversampling (0.3190 F1 Score)** and **SMOTE (0.3084 F1 Score)** improved class balance but didn’t outperform Class Weights.
- **Tomek Links** and the **Original dataset** had the highest accuracy (0.7592), but their **F1 Scores were the lowest (~0.198)**, showing they struggled with the minority class.
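- **Tomek Links had the highest AUC (0.5683)**, but only by less than 0.001 over Class Weights (0.5680) and the Original dataset (0.5678), so this does not indicate meaningfully better class separation, and its F1 Score remains weak.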

### 🏆 Best Sampling Technique:
✅ **Class Weights** is the best approach as it **maximizes F1 Score while matching the highest AUC values observed (≈0.568)**, making it the most effective for handling imbalance in this dataset. Note that every AUC here is only modestly above the 0.5 chance level.

---

### 📌 Recommendation:
- If you prioritize **balanced performance** → Use **Class Weights**.
- If you want the **highest AUC** and don’t mind a lower F1 Score → Use **Tomek Links** (its AUC advantage over the other methods is less than 0.001).
- If you prefer **more training data** rather than undersampling → Use **SMOTE or Random Oversampling**.

---
