In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score

# 1. Extended and varied dataset
data = {
    'ExperienceYears': [1, 3, 5, 2, 7, 6, 8, 2, 4, 9, 3, 5, 6, 1, 7, 3, 8, 5, 4, 9],
    'EducationLevel':  [2, 2, 3, 1, 4, 3, 3, 1, 2, 4, 2, 3, 4, 1, 3, 2, 4, 3, 2, 4],
    'PerformanceScore':[2, 3, 4, 2, 5, 4, 5, 2, 3, 5, 3, 4, 5, 2, 4, 3, 5, 4, 3, 5],
    'Gender': [
        'Female', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male',
        'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male'
    ],
    'HispanicLatino': [
        'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No',
        'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No'
    ],
    'Termd': [
        1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 1, 0, 1, 0, 1, 0, 0
    ]
}

df = pd.DataFrame(data)

# 2. Encode categorical variables
df_encoded = df.copy()
df_encoded['Gender'] = LabelEncoder().fit_transform(df_encoded['Gender'])  # Male=1, Female=0
df_encoded['HispanicLatino'] = LabelEncoder().fit_transform(df_encoded['HispanicLatino'])  # No=0, Yes=1

# 3. Define features and target
X = df_encoded.drop(columns='Termd')
y = df_encoded['Termd']

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# 6. Predict
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# 7. Evaluate
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, y_prob))


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.75      1.00      0.86         3

    accuracy                           0.83         6
   macro avg       0.88      0.83      0.83         6
weighted avg       0.88      0.83      0.83         6

AUC Score: 0.888888888888889


In [4]:
# 8. Create results DataFrame for bias analysis
results_df = X_test.copy()
results_df['true_label'] = y_test.values
results_df['predicted_prob'] = y_prob

In [5]:
# Add original labels for readability
results_df['Gender'] = df.loc[X_test.index, 'Gender'].values
results_df['HispanicLatino'] = df.loc[X_test.index, 'HispanicLatino'].values

In [6]:
# 9. Bias Analysis by Gender
print("\n--- Bias by Gender ---")
print(results_df.groupby('Gender')['predicted_prob'].mean())
for gender in results_df['Gender'].unique():
    subset = results_df[results_df['Gender'] == gender]
    auc = roc_auc_score(subset['true_label'], subset['predicted_prob'])
    print(f"AUC for {gender}: {auc:.3f}")


--- Bias by Gender ---
Gender
Female    0.767676
Male      0.240601
Name: predicted_prob, dtype: float64
AUC for Female: 0.667
AUC for Male: nan




In [7]:

# 10. Bias Analysis by HispanicLatino
print("\n--- Bias by HispanicLatino ---")
print(results_df.groupby('HispanicLatino')['predicted_prob'].mean())
for group in results_df['HispanicLatino'].unique():
    subset = results_df[results_df['HispanicLatino'] == group]
    auc = roc_auc_score(subset['true_label'], subset['predicted_prob'])
    print(f"AUC for {group}: {auc:.3f}")


--- Bias by HispanicLatino ---
HispanicLatino
No     0.406687
Yes    0.777282
Name: predicted_prob, dtype: float64
AUC for Yes: nan
AUC for No: nan


