<a href="https://colab.research.google.com/github/MokidiSrinidhi/XAI/blob/main/XAI_LAB_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install dice-ml

Collecting dice-ml
  Downloading dice_ml-0.12-py3-none-any.whl.metadata (20 kB)
Collecting raiutils>=0.4.0 (from dice-ml)
  Downloading raiutils-0.4.2-py3-none-any.whl.metadata (1.4 kB)
Downloading dice_ml-0.12-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading raiutils-0.4.2-py3-none-any.whl (17 kB)
Installing collected packages: raiutils, dice-ml
Successfully installed dice-ml-0.12 raiutils-0.4.2


In [3]:
# Assignment 07 - Counterfactual Explanations
# Dataset: Titanic Dataset (using KaggleHub)
# ============================

# Step 1: Install dependencies (Run this once in Colab or terminal)
# !pip install kagglehub dice-ml scikit-learn pandas numpy matplotlib

# Step 2: Import Libraries
import kagglehub
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import dice_ml

# =========================================
# PART 1: Load Dataset using KaggleHub
# =========================================
print("Downloading dataset from Kaggle...")
path = kagglehub.dataset_download("yasserh/titanic-dataset")
print("Path to dataset files:", path)

# List all files to identify the CSV file
files = os.listdir(path)
print("Files in dataset folder:", files)

# Dynamically pick the first CSV file
csv_files = [f for f in files if f.endswith('.csv')]
if len(csv_files) == 0:
    raise FileNotFoundError("No CSV files found in the dataset folder.")

file_path = os.path.join(path, csv_files[0])
print("Using dataset file:", file_path)

# Load the Titanic dataset
df = pd.read_csv(file_path)
print("\nDataset Loaded Successfully!")
print(df.head())

# =========================================
# PART 2: Preprocessing
# =========================================
print("\nMissing Values before handling:")
print(df.isnull().sum())

# Fill missing numerical columns with median
df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill missing categorical columns with mode
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Drop irrelevant columns
df.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# Encode categorical variables
encoder = LabelEncoder()
df['Sex'] = encoder.fit_transform(df['Sex'])
df['Embarked'] = encoder.fit_transform(df['Embarked'])

# Define features and target
X = df.drop('Survived', axis=1)
y = df['Survived']

# Scale features and keep dataframe format
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# =========================================
# PART 3: Train Models
# =========================================
lr_model = LogisticRegression()
rf_model = RandomForestClassifier(random_state=42)

lr_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Evaluate Models
print("\nModel Evaluation Results:")
for model, name in zip([lr_model, rf_model], ['Logistic Regression', 'Random Forest']):
    y_pred = model.predict(X_test)
    print(f"\n{name} Evaluation Metrics:")
    print(classification_report(y_test, y_pred))

# =========================================
# PART 4: Counterfactual Generation using DiCE
# =========================================
# Prepare dataframe for DiCE
df_for_dice = pd.concat([X_scaled, y.reset_index(drop=True)], axis=1)

continuous_features = ['Age', 'Fare']

# Create data object for DiCE
data_dice = dice_ml.Data(
    dataframe=df_for_dice,
    continuous_features=continuous_features,
    outcome_name='Survived'
)

# Wrap the trained Random Forest model for DiCE
model_dice = dice_ml.Model(model=rf_model, backend='sklearn')

# Initialize DiCE explainer
exp = dice_ml.Dice(data_dice, model_dice)

# Select a negative instance (predicted as 0 = Did Not Survive)
preds = rf_model.predict(X_test)
negative_indices = np.where(preds == 0)[0]

if len(negative_indices) == 0:
    raise ValueError("No negative predictions found in the test set.")

# Pick first negative instance
test_instance = X_test.iloc[[negative_indices[0]]]

print("\nSelected Test Instance for Counterfactuals:\n")
print(test_instance)

# Generate 3 counterfactuals
counterfactuals = exp.generate_counterfactuals(
    test_instance,
    total_CFs=3,
    desired_class="opposite"
)

# Extract counterfactual examples directly (avoid visualize_as_dataframe)
cf_df = counterfactuals.cf_examples_list[0].final_cfs_df
print("\nCounterfactual Explanations:")
print(cf_df)

# =========================================
# PART 5: Present Results
# =========================================
# Combine original instance and counterfactuals
original_instance = test_instance.reset_index(drop=True)
combined_df = pd.concat(
    [original_instance, cf_df.reset_index(drop=True)],
    keys=['Original', 'Counterfactuals']
)

print("\nOriginal Instance vs Counterfactuals:\n")
print(combined_df)

# Identify which features changed
print("\nFeatures Changed for Counterfactuals:")
for col in X.columns:
    original_val = original_instance.iloc[0][col]
    cf_vals = cf_df[col].unique()
    if not all(val == original_val for val in cf_vals):
        print(f"- {col}: Original = {original_val}, Counterfactual(s) = {cf_vals}")


Downloading dataset from Kaggle...
Using Colab cache for faster access to the 'titanic-dataset' dataset.
Path to dataset files: /kaggle/input/titanic-dataset
Files in dataset folder: ['Titanic-Dataset.csv']
Using dataset file: /kaggle/input/titanic-dataset/Titanic-Dataset.csv

Dataset Loaded Successfully!
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch     

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)



Model Evaluation Results:

Logistic Regression Evaluation Metrics:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179


Random Forest Evaluation Metrics:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       105
           1       0.82      0.74      0.78        74

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179


Selected Test Instance for Counterfactuals:

     PassengerId    Pclass       Sex       Age     SibSp    Parch      Fare  \
709     1.026401  0.827377  0.737695 -0.104637  0.432793  0.76763 -0.341452   

     Embarked  
709 -1.942303  


  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
100%|██████████| 1/1 [00:00<00:00,  5.38it/s]


Counterfactual Explanations:
   PassengerId    Pclass       Sex       Age     SibSp     Parch      Fare  \
0     1.026401 -1.566107  0.737695 -1.693266  0.432793  0.767630 -0.341452   
1     1.026401  0.827377 -1.355574 -0.104637  0.432793  2.008933 -0.341452   
2     1.026401  0.827377 -1.355574 -0.104637  0.432793  0.767630  7.495231   

   Embarked  Survived  
0 -1.942303         1  
1 -1.942303         1  
2 -1.942303         1  

Original Instance vs Counterfactuals:

                   PassengerId    Pclass       Sex       Age     SibSp  \
Original        0     1.026401  0.827377  0.737695 -0.104637  0.432793   
Counterfactuals 0     1.026401 -1.566107  0.737695 -1.693266  0.432793   
                1     1.026401  0.827377 -1.355574 -0.104637  0.432793   
                2     1.026401  0.827377 -1.355574 -0.104637  0.432793   

                      Parch      Fare  Embarked  Survived  
Original        0  0.767630 -0.341452 -1.942303       NaN  
Counterfactuals 0  0.767630 -0


