# TrAdaBoost Transfer Learning ULSA -> UENG
# Model 3

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.stats import kstest
from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

In [2]:
from transformers.features_missing_vals import imputation_pipeline
from transformers.ethnic_groups import CombineEthnicGroups

In [3]:
from transfer_learning.TrAdaBoost import TrAdaBoost

In [38]:
from model_evaluation.cross_val import direct_transfer_cv, TrAdaBoost_transfer_cv

# Params

In [5]:
# School codes used to select the source and target rows by 'PRMRY_CRER_CD'
source_school, target_school = 'ULSA', 'UENG'

In [6]:
# Model features: names of the columns that are passed to the classifier
cols = [
    'STDNT_FEMALE',
    'STDNT_AGE_BIN',
    'HS_GPA_Imp_Interp',
    'CURR_GPA',
    'HS_CALC_IND',
    'GROSS_FAM_INC',
    'SNGL_PRNT_IND',
    'STDNT_ETHNC_GRP_CD',
    'SAT_ACT_TOTAL_BIN',
    'PRNT_ED_LVL',
    'No_grades_at_all',
    'Grade_Overall_I_for_1_and_more_courses',
    'Grade_W_for_1_course',
    'Grade_W_for_2_courses',
    'Grade_W_for_3_and_more_courses',
    'Grade_NR_for_1_and_more_courses',
]

In [7]:
# Seed shared by the logistic regression and the stratified cross-validation splitter
random_state = 1

In [8]:
# Base classifier: logistic regression without a penalty term (penalty=None).
# Later cells work on clones of this estimator (source model, TrAdaBoost).
clf = LogisticRegression(
    penalty=None,
    max_iter=10000,
    random_state=random_state,
)

In [9]:
# Parameters for fairness metrics

# Privileged value of each attribute used in the fairness analysis.
# NOTE(review): the names `attribites_dic` and `treshold` are misspelled, but they are kept
# as they are because later cells reference them.
attribites_dic = dict(
    STDNT_ETHNC_grouped=0,  # white
    STDNT_FEMALE=0,  # male
)

# The threshold for intervention is based on the percentile of predicted dropout probabilities,
# specifically targeting the top 10% of students with the highest dropout probabilities
treshold = [10]

# Percentile values of predicted dropout probabilities that determine the risk ranking of a student
rank_thresholds = [90, 95, 97, 99, 99.5]

# Data

In [10]:
# Load the feature table and keep one independent frame per school, selected by 'PRMRY_CRER_CD'
features_df = pd.read_csv('features_df.csv', low_memory=False)
source_df = features_df.loc[features_df['PRMRY_CRER_CD'].eq(source_school)].copy()

# The target frame gets a fresh 0..n-1 index to ensure that cross-validation indices
# will be accurately applied
target_df = (
    features_df.loc[features_df['PRMRY_CRER_CD'].eq(target_school)]
    .copy()
    .reset_index(drop=True)
)

In [None]:
# Group together all ethnic groups, excluding white and asian populations, as they lack a sufficient
# number of dropouts for a meaningful fairness analysis

# Add new column 'STDNT_ETHNC_grouped' with the following values: 0 for white, 1 for asian, and 2 for other
combine_transformer = CombineEthnicGroups()
target_df = combine_transformer.fit_transform(target_df)

# Per ethnic group: number of rows ('count') and sum of the label column ('sum')
(
    target_df
    .groupby('STDNT_ETHNC_grouped')[['y_ENROLLED_1_YEAR_LATER']]
    .agg(['count', 'sum'])
)

In [12]:
# Impute missing values and apply WoE transformation for the direct transfer
# NOTE(review): combine_transformer was already fitted on target_df; fit_transform refits it on
# the source here - confirm that CombineEthnicGroups keeps no fitted state that matters.
source_df = combine_transformer.fit_transform(source_df)
# The pipeline is fitted on the source data and its labels only ...
source_imp = imputation_pipeline.fit_transform(source_df, source_df['y_ENROLLED_1_YEAR_LATER'])
# ... and then applied, without refitting, to the target data
target_imp = imputation_pipeline.transform(target_df)

# Train and test data sets in cross-validation
The target dataset will be partitioned into a training set comprising 33.3% of the data and a testing set comprising 66.6%. This is obtained from a stratified 3-fold split with the usual roles of the folds reversed: in each split, the single held-out fold is used as the training set and the remaining two folds are used as the testing set.
For TrAdaBoost, the training set will encompass the source dataset and 33.3% of the target dataset, while the testing set for TrAdaBoost will consist of the remaining 66.6% of the target dataset. This specific partition is chosen because the testing set needs to contain a sufficient number of dropouts for the evaluation of AUC and other metrics. When the dropout count is low, the confidence interval of the AUC tends to be broad, making the AUC estimate less reliable. Specifically, 66.6% of the target sample includes 84-85 instances of student dropouts.

In [13]:
# Stratified 3-fold splitter over the target data, stratified on the label column
stratified_split = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

# `split()` returns a one-shot generator. Materialize it so that the splits can be iterated
# more than once (e.g. when the cell that consumes them is re-run); otherwise a second pass
# would silently see no splits at all.
splits = list(stratified_split.split(target_df, target_df['y_ENROLLED_1_YEAR_LATER']))

In [14]:
# Generate indices for the training and testing datasets for every split in the cross-validation.
#
# StratifiedKFold yields (train, test) index pairs, where "test" is the single held-out fold
# (about one third of the rows with n_splits=3) and "train" is the rest (about two thirds).
# The roles are deliberately reversed here: the held-out third becomes the target training
# set and the remaining two thirds become the target testing set.
train_target_indx = []
test_target_indx = []

for two_thirds_index, one_third_index in splits:
    train_target_indx.append(one_third_index)
    test_target_indx.append(two_thirds_index)

In [None]:
# Check the cross-validation splits: set sizes, label means (reported as dropout rates)
# and the label sum of the testing set (reported as number of dropouts)
for i in range(stratified_split.get_n_splits()):
    train_idx, test_idx = train_target_indx[i], test_target_indx[i]
    train_labels = target_df.loc[train_idx, 'y_ENROLLED_1_YEAR_LATER']
    test_labels = target_df.loc[test_idx, 'y_ENROLLED_1_YEAR_LATER']

    print(f"Split {i}: Training set size={len(train_idx)}, testing set size={len(test_idx)}")
    print(
        f"Split {i}: Training dropout rate={train_labels.mean():.2%}, "
        f"testing dropout rate={test_labels.mean():.2%}, "
        f"testing number of dropouts={test_labels.sum():.0f}"
    )

# Direct transfer
To ensure comparability between direct transfer and TrAdaBoost, the evaluation of direct transfer will be conducted using identical cross-validation test sets as for TrAdaBoost. Using different samples to compare these methods may result in variations in model evaluation metrics, which could be attributed to sample differences rather than indicating the superiority of one model over the other.

## Source model

In [16]:
# Fit the source model: a fresh clone of the base classifier trained on the imputed source data
clf_source = clone(clf)
clf_source.fit(
    X=source_imp[cols],
    y=source_imp['y_ENROLLED_1_YEAR_LATER'],
)

## Cross-validating the direct transfer of the source model
The performance of the source model 'clf_source', trained on the source dataset (refer to the cell above), is evaluated on the three target test sets specified by the indices 'test_target_indx'. The evaluation includes classification accuracy and fairness metrics.

In [17]:
# Evaluate the already fitted source model on every target test split
cv_res_direct = direct_transfer_cv(
    target_imp, # Target data set with imputed missing values
    test_target_indx, # Indices of the testing sets for each cross-validation split
    clf_source, cols, # Fitted source model and its features
    'Direct transfer', # Row label that appears in the result tables
    attribites_dic, treshold, rank_thresholds) # Fairness metrics' parameters 

## Classification accuracy

### AUC

In [18]:
# Direct-transfer AUC for each target test split, with the mean and standard error ('se') across splits
cv_res_direct['AUC']

Unnamed: 0,split_0,split_1,split_2,mean,se
Direct transfer,0.798675,0.811847,0.777766,0.796096,0.009923


### Pietra index and Kolmogorov-Smirnov test

In [19]:
# Pietra index of the direct transfer for each split, with mean and standard error across splits
cv_res_direct['pietra_index']

Unnamed: 0,split_0,split_1,split_2,mean,se
Direct transfer,0.48257,0.536203,0.465117,0.49463,0.021388


In [20]:
# KS p-value of the direct transfer; `.iloc[:, :3]` keeps only the first three columns (one per split)
cv_res_direct['KS_pvalue'].iloc[:, :3]

Unnamed: 0,split_0,split_1,split_2
Direct transfer,1.719591e-18,8.510172e-23,3.8736e-17


## Fairness metrics

### Sliced AUCs

In [21]:
# Direct-transfer AUC reported separately for each value of the fairness attributes
cv_res_direct['sliced_AUCs']

Unnamed: 0,Attribute,Value,split_0,split_1,split_2,mean,se
0,STDNT_ETHNC_grouped,1,0.732277,0.744254,0.742032,0.739521,0.003678
1,STDNT_ETHNC_grouped,0,0.873095,0.851929,0.83124,0.852088,0.012083
2,STDNT_ETHNC_grouped,2,0.763595,0.821409,0.757019,0.780674,0.020456
3,STDNT_FEMALE,0,0.797804,0.794522,0.785411,0.792579,0.003707
4,STDNT_FEMALE,1,0.795982,0.871113,0.754783,0.807293,0.034054


### Equal opportunity difference

In [22]:
# Equal opportunity difference of the direct transfer, one row per attribute value
# (the rows of the privileged values, 0, are 0.0 in every split)
cv_res_direct['equal_opportunity_diff']

Unnamed: 0,Attribute,Value,split_0,split_1,split_2,mean,se
0,STDNT_ETHNC_grouped,1,-0.324675,-0.332927,-0.298701,-0.318768,0.010312
1,STDNT_ETHNC_grouped,0,0.0,0.0,0.0,0.0,0.0
2,STDNT_ETHNC_grouped,2,0.02381,0.012725,0.071429,0.035988,0.018007
3,STDNT_FEMALE,0,0.0,0.0,0.0,0.0,0.0
4,STDNT_FEMALE,1,0.111538,0.123793,0.008185,0.081172,0.036665


### Generalized entropy index

In [23]:
# Generalized entropy results of the direct transfer (rows 'generalized_entropy' and 'between_groups')
cv_res_direct['gen_entropy']

Unnamed: 0,split_0,split_1,split_2,mean,se
generalized_entropy,0.385417,0.323529,0.465909,0.391618,0.041218
between_groups,0.033422,0.028066,0.040829,0.034106,0.0037


### Generalized entropy index that takes into account the risk ranks of students

In [24]:
# Rank-based generalized entropy results of the direct transfer
# (rows 'generalized_entropy_ranks' and 'between_groups_ranks')
cv_res_direct['gen_entropy_ranks']

Unnamed: 0,split_0,split_1,split_2,mean,se
generalized_entropy_ranks,0.547744,0.457076,0.623618,0.542813,0.04814
between_groups_ranks,0.031853,0.023615,0.055756,0.037075,0.009639


# TrAdaBoost

In [25]:
# TrAdaBoost built around a fresh clone of the base classifier.
# NOTE(review): max_num_iterations=10 presumably caps the number of boosting rounds - confirm
# against the TrAdaBoost implementation.
TrAdaBoost_model = TrAdaBoost(
    estimator=clone(clf),
    max_num_iterations=10,
)

## Cross-validating the TrAdaBoost
In every cross-validation split, the TrAdaBoost model is trained using the source dataset combined with 33.3% of the target dataset. Subsequently, the model is evaluated on the remaining 66.6% of the target dataset. The evaluation includes classification accuracy and fairness metrics.

In [26]:
# Cross-validate TrAdaBoost on the source and target data sets that have not been imputed yet
cv_res_TrAdaBoost = TrAdaBoost_transfer_cv(
    source_df,  # Source data set
    target_df,  # Target data set
    train_target_indx,  # Indices of the training sets for cross-validation splits
    test_target_indx,  # Indices of the testing sets for cross-validation splits
    imputation_pipeline,  # Pipeline to impute missing values and apply WoE transformation
    TrAdaBoost_model,
    cols,  # Model features
    'TrAdaBoost',
    attribites_dic,  # Fairness metrics' parameter: privileged attribute values
    treshold,  # Fairness metrics' parameter: intervention threshold
    rank_thresholds,  # Fairness metrics' parameter: risk-rank percentiles
)

## Classification accuracy

### AUC

In [27]:
# AUC of TrAdaBoost stacked on top of the direct-transfer AUC for comparison
pd.concat([cv_res_TrAdaBoost['AUC'], cv_res_direct['AUC']])

Unnamed: 0,split_0,split_1,split_2,mean,se
TrAdaBoost,0.80235,0.820896,0.775099,0.799448,0.0133
Direct transfer,0.798675,0.811847,0.777766,0.796096,0.009923


### Pietra index and Kolmogorov-Smirnov test

In [28]:
# Pietra index of TrAdaBoost stacked on top of the direct-transfer Pietra index
pd.concat([cv_res_TrAdaBoost['pietra_index'], cv_res_direct['pietra_index']])

Unnamed: 0,split_0,split_1,split_2,mean,se
TrAdaBoost,0.507105,0.549266,0.472291,0.509554,0.022254
Direct transfer,0.48257,0.536203,0.465117,0.49463,0.021388


In [29]:
# KS p-value of both methods; `.iloc[:,:3]` keeps only the first three columns (one per split)
pd.concat([cv_res_TrAdaBoost['KS_pvalue'].iloc[:,:3], 
           cv_res_direct['KS_pvalue'].iloc[:,:3]])

Unnamed: 0,split_0,split_1,split_2
TrAdaBoost,1.6808999999999998e-20,5.40635e-24,1.0957840000000001e-17
Direct transfer,1.719591e-18,8.510172e-23,3.8736e-17


## Fairness metrics

### Sliced AUCs

In [30]:
# TrAdaBoost AUC reported separately for each value of the fairness attributes
cv_res_TrAdaBoost['sliced_AUCs']

Unnamed: 0,Attribute,Value,split_0,split_1,split_2,mean,se
0,STDNT_ETHNC_grouped,1,0.71717,0.791138,0.722858,0.743722,0.023765
1,STDNT_ETHNC_grouped,0,0.880152,0.854177,0.830707,0.855012,0.014279
2,STDNT_ETHNC_grouped,2,0.769356,0.807347,0.754226,0.776976,0.015801
3,STDNT_FEMALE,0,0.796315,0.795524,0.781653,0.791164,0.004761
4,STDNT_FEMALE,1,0.796569,0.880099,0.753136,0.809935,0.037255


In [31]:
# Mean and standard error of the sliced AUCs, TrAdaBoost next to direct transfer,
# matched on attribute and attribute value
pd.merge(
    cv_res_TrAdaBoost['sliced_AUCs'][['Attribute', 'Value', 'mean', 'se']],
    cv_res_direct['sliced_AUCs'][['Attribute', 'Value', 'mean', 'se']],
    on=['Attribute', 'Value'],
    suffixes=('_TrAdaBoost', '_direct'),
)[['Attribute', 'Value', 'mean_TrAdaBoost', 'mean_direct', 'se_TrAdaBoost', 'se_direct']]

Unnamed: 0,Attribute,Value,mean_TrAdaBoost,mean_direct,se_TrAdaBoost,se_direct
0,STDNT_ETHNC_grouped,1,0.743722,0.739521,0.023765,0.003678
1,STDNT_ETHNC_grouped,0,0.855012,0.852088,0.014279,0.012083
2,STDNT_ETHNC_grouped,2,0.776976,0.780674,0.015801,0.020456
3,STDNT_FEMALE,0,0.791164,0.792579,0.004761,0.003707
4,STDNT_FEMALE,1,0.809935,0.807293,0.037255,0.034054


### Equal opportunity difference

In [32]:
# Equal opportunity difference of TrAdaBoost, one row per attribute value
# (the rows of the privileged values, 0, are 0.0 in every split)
cv_res_TrAdaBoost['equal_opportunity_diff']

Unnamed: 0,Attribute,Value,split_0,split_1,split_2,mean,se
0,STDNT_ETHNC_grouped,1,-0.279221,-0.158537,-0.088312,-0.175356,0.055749
1,STDNT_ETHNC_grouped,0,0.0,0.0,0.0,0.0,0.0
2,STDNT_ETHNC_grouped,2,0.02381,0.037116,0.064286,0.041737,0.011911
3,STDNT_FEMALE,0,0.0,0.0,0.0,0.0,0.0
4,STDNT_FEMALE,1,0.030769,0.020193,-0.08631,-0.011782,0.037388


In [33]:
# Mean and standard error of the equal opportunity difference, TrAdaBoost next to direct
# transfer, matched on attribute and attribute value
pd.merge(
    cv_res_TrAdaBoost['equal_opportunity_diff'][['Attribute', 'Value', 'mean', 'se']],
    cv_res_direct['equal_opportunity_diff'][['Attribute', 'Value', 'mean', 'se']],
    on=['Attribute', 'Value'],
    suffixes=('_TrAdaBoost', '_direct'),
)[['Attribute', 'Value', 'mean_TrAdaBoost', 'mean_direct', 'se_TrAdaBoost', 'se_direct']]

Unnamed: 0,Attribute,Value,mean_TrAdaBoost,mean_direct,se_TrAdaBoost,se_direct
0,STDNT_ETHNC_grouped,1,-0.175356,-0.318768,0.055749,0.010312
1,STDNT_ETHNC_grouped,0,0.0,0.0,0.0,0.0
2,STDNT_ETHNC_grouped,2,0.041737,0.035988,0.011911,0.018007
3,STDNT_FEMALE,0,0.0,0.0,0.0,0.0
4,STDNT_FEMALE,1,-0.011782,0.081172,0.037388,0.036665


### Generalized entropy index

In [34]:
# Generalized entropy results of TrAdaBoost (rows 'generalized_entropy' and 'between_groups')
cv_res_TrAdaBoost['gen_entropy']

Unnamed: 0,split_0,split_1,split_2,mean,se
generalized_entropy,0.367347,0.292453,0.423913,0.361238,0.038072
between_groups,0.023941,0.007034,0.005765,0.012247,0.005858


In [35]:
# Mean and standard error of the generalized entropy results, TrAdaBoost next to direct
# transfer, matched on the row labels (index) of both tables
pd.merge(
    cv_res_TrAdaBoost['gen_entropy'][['mean', 'se']],
    cv_res_direct['gen_entropy'][['mean', 'se']],
    left_index=True,
    right_index=True,
    suffixes=('_TrAdaBoost', '_direct'),
)[['mean_TrAdaBoost', 'mean_direct', 'se_TrAdaBoost', 'se_direct']]

Unnamed: 0,mean_TrAdaBoost,mean_direct,se_TrAdaBoost,se_direct
generalized_entropy,0.361238,0.391618,0.038072,0.041218
between_groups,0.012247,0.034106,0.005858,0.0037


### Generalized entropy index that takes into account the risk ranks of students

In [36]:
# Rank-based generalized entropy results of TrAdaBoost
# (rows 'generalized_entropy_ranks' and 'between_groups_ranks')
cv_res_TrAdaBoost['gen_entropy_ranks']

Unnamed: 0,split_0,split_1,split_2,mean,se
generalized_entropy_ranks,0.542604,0.434615,0.594399,0.523873,0.047067
between_groups_ranks,0.027645,0.010681,0.040143,0.026156,0.008537


In [37]:
# Mean and standard error of the rank-based generalized entropy results, TrAdaBoost next to
# direct transfer, matched on the row labels (index) of both tables
pd.merge(
    cv_res_TrAdaBoost['gen_entropy_ranks'][['mean', 'se']],
    cv_res_direct['gen_entropy_ranks'][['mean', 'se']],
    left_index=True,
    right_index=True,
    suffixes=('_TrAdaBoost', '_direct'),
)[['mean_TrAdaBoost', 'mean_direct', 'se_TrAdaBoost', 'se_direct']]

Unnamed: 0,mean_TrAdaBoost,mean_direct,se_TrAdaBoost,se_direct
generalized_entropy_ranks,0.523873,0.542813,0.047067,0.04814
between_groups_ranks,0.026156,0.037075,0.008537,0.009639
