<a href="https://colab.research.google.com/github/Mithix67/AIES-Practical/blob/main/Exp-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Name: Mithul CE
Class: B.Tech A DIV
PRN No.: 22SC114501026
Title: Impact of Data Quality on AI Fairness
     

In [1]:
!pip install fairlearn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from fairlearn.metrics import MetricFrame, true_positive_rate, false_positive_rate, false_negative_rate, selection_rate

# === Step 1: Create synthetic dataset ===
# Seed NumPy's global RNG so every draw below is repeatable. The columns are
# sampled one after another in the order they appear in the frame; changing
# that order would change the generated values.
np.random.seed(42)
n = 500

synthetic_columns = {}
synthetic_columns['closing_rank'] = np.random.randint(1, 5000, size=n)  # integers in [1, 5000)
synthetic_columns['category'] = np.random.choice(
    ['GEN', 'OBC-NCL', 'SC', 'ST'],
    size=n,
    p=[0.4, 0.3, 0.2, 0.1],  # sampling probability of each category
)
synthetic_columns['program_duration'] = np.random.choice(['3 Years', '4 Years'], size=n)
synthetic_columns['degree_short'] = np.random.choice(['BTech', 'BSc', 'BA'], size=n)
synthetic_columns['institute_short'] = np.random.choice(['IIT', 'NIT', 'IIIT'], size=n)
synthetic_columns['round_no'] = np.random.randint(1, 7, size=n)  # integers 1..6
synthetic_columns['opening_rank'] = np.random.randint(1, 5000, size=n)

df = pd.DataFrame(synthetic_columns)

# Target: Good Rank -> 1 when the closing rank is below 2000, otherwise 0
df['GoodRank'] = df['closing_rank'].lt(2000).astype(int)

# Sensitive attribute: the admission category. It is not placed in the model
# inputs built below.
sensitive_feature = df['category']

# Column groups that make up the model inputs
categorical_cols = ['program_duration', 'degree_short', 'institute_short']
numeric_cols = ['round_no', 'opening_rank']

# One-hot encode categorical features; drop_first=True omits the first level
# of each column so the dummies are not perfectly collinear.
df_encoded = pd.get_dummies(df[categorical_cols], drop_first=True)
df_numeric = df.loc[:, numeric_cols].copy()

# Design matrix: numeric columns first, then the dummy columns.
# 'closing_rank' is not part of X; the GoodRank label is computed from it.
X = pd.concat([df_numeric, df_encoded], axis='columns')
y = df['GoodRank']

# Train/Test Split
# 70/30 split stratified on the sensitive attribute. The attribute is split
# together with X and y so that s_test stays aligned row-for-row with y_test.
split_parts = train_test_split(
    X,
    y,
    sensitive_feature,
    test_size=0.3,
    random_state=42,
    stratify=sensitive_feature,
)
X_train, X_test, y_train, y_test, s_train, s_test = split_parts

# Train Model
# fit() returns the estimator itself, so construction and fitting are chained.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Fairness Metrics
# Each entry becomes one column of metric_frame.by_group, evaluated separately
# for every value of the sensitive attribute in the test split.
fairness_metrics = {
    'TPR': true_positive_rate,
    'FPR': false_positive_rate,
    'FNR': false_negative_rate,
    'Selection Rate': selection_rate,
}

metric_frame = MetricFrame(
    metrics=fairness_metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=s_test,
)

print("🎯 Fairness Metrics by Category:\n")
print(metric_frame.by_group)

# Highlight lower TPR groups
# 'GEN' is the reference group. Every other category whose true-positive rate
# falls short of GEN's by more than TPR_GAP_THRESHOLD is listed; if none does,
# that is stated explicitly instead of leaving an empty list under the header.
TPR_GAP_THRESHOLD = 0.05  # minimum TPR shortfall (absolute) worth reporting

tpr_by_group = metric_frame.by_group['TPR']

if 'GEN' not in tpr_by_group.index:
    print("\n⚠ No 'GEN' category found.")
else:
    gen_tpr = tpr_by_group.loc['GEN']
    print("\n🔍 Categories with significantly lower TPR than GEN:")
    flagged_count = 0
    for cat, cat_tpr in tpr_by_group.items():
        if cat == 'GEN':
            continue  # the reference group is not compared with itself
        gap = gen_tpr - cat_tpr
        if gap > TPR_GAP_THRESHOLD:
            print(f" - {cat}: TPR = {cat_tpr:.3f} (↓{gap:.3f})")
            flagged_count += 1
    if flagged_count == 0:
        print(f" - none (no category trails GEN by more than {TPR_GAP_THRESHOLD})")


Collecting fairlearn
  Downloading fairlearn-0.12.0-py3-none-any.whl.metadata (7.0 kB)
Downloading fairlearn-0.12.0-py3-none-any.whl (240 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fairlearn
Successfully installed fairlearn-0.12.0
🎯 Fairness Metrics by Category:

               TPR       FPR       FNR  Selection Rate
category                                              
GEN       0.200000  0.166667  0.800000        0.178571
OBC-NCL   0.000000  0.222222  1.000000        0.130435
SC        0.000000  0.157895  1.000000        0.088235
ST        0.166667  0.125000  0.833333        0.142857

🔍 Categories with significantly lower TPR than GEN:
 - OBC-NCL: TPR = 0.000 (↓0.200)
 - SC: TPR = 0.000 (↓0.200)
