In [1]:
DATA_DIR = '/Users/malithidesilva/fyp'

import pandas as pd
import os
features_list = pd.read_csv(os.path.join(DATA_DIR, 'features_REDISCOVER.csv'))
raw_data = pd.read_csv(os.path.join(DATA_DIR, 'REDISCOVER_Rearranged_Cleaned_withNA_withFRS.csv'))

In [2]:
raw_data.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'patient_ID', 'recruitment_year', 'area',
       'gender', 'age', 'age_grp', 'age_dist', 'menopause', 'smoking',
       'weight', 'height', 'bmi', 'bps', 'bpd', 'pulse', 'hip', 'waist', 'whr',
       'diabetes_b', 'diabetes_med', 'highchol_b', 'chol_med', 'hpt_b',
       'heartdisease_b', 'hepatitis_b', 'cancer_b', 'copd_b', 'asthma_b',
       'tuberculosis_b', 'malaria_b', 'stroke_b', 'hpt_med', 'stroke_med',
       'asthma_med', 'glucose_baseline', 'tc_baseline', 'tg_baseline',
       'hdl_baseline', 'ldl_baseline', 'hpt_f', 'diabetes_f', 'heartdisease_f',
       'stroke_f', 'cancer_f', 'handgrip_dominant', 'handgrip_nondominant',
       'rightarm', 'rightcalf', 'head', 'FEV1', 'FVC', 'PERF', 'FRS_score',
       'FRS_outcome', 'outcome', 'follow_up_date', 'year_diff',
       'FRS_score_coeff3'],
      dtype='object')

In [3]:
raw_data.shape

(11807, 60)

In [4]:
import math
def convert_10year_risk_to_3year(risk_10year):
    # Calculate the hazard rate from the 10-year risk
    hazard_rate = -math.log(1 - risk_10year) / 10
    
    # Calculate the 3-year risk using the hazard rate
    risk_3year = 1 - math.exp(-3 * hazard_rate)
    
    return risk_3year

In [5]:
len(raw_data['FRS_score'].to_list())

11807

In [6]:
total_prob_3years = []

for score in raw_data['FRS_score'].to_list():
    prob_3years = convert_10year_risk_to_3year(score)
    total_prob_3years.append(prob_3years)

frs_prob_3years = pd.DataFrame(total_prob_3years, columns=['FRS_score_coeff3'])

In [7]:
concatenated = pd.concat([raw_data, frs_prob_3years], axis=1)

In [8]:
concatenated.shape

(11807, 61)

In [9]:
concatenated.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,patient_ID,recruitment_year,area,gender,age,age_grp,age_dist,menopause,...,FEV1,FVC,PERF,FRS_score,FRS_outcome,outcome,follow_up_date,year_diff,FRS_score_coeff3,FRS_score_coeff3.1
0,0,0,tr1_004,2007,2,1,45,2,3,3,...,3.093333,3.183333,611.666667,0.092722,0,0,Not Provided,0,0.02877,0.02877
1,1,1,tr1_005,2007,2,1,50,2,4,3,...,3.296667,3.736667,322.666667,0.091211,0,0,Not Provided,0,0.028285,0.028285
2,2,2,tr1_006,2007,2,1,65,2,5,3,...,2.443333,3.403333,400.333333,0.345015,0,0,Not Provided,0,0.119216,0.119216
3,3,3,tr1_007,2007,2,1,42,2,3,3,...,2.603333,3.07,411.333333,0.14309,0,0,Not Provided,0,0.04527,0.04527
4,4,4,tr1_008,2007,2,1,49,2,3,3,...,3.083333,3.75,612.0,0.105351,0,0,Not Provided,0,0.032846,0.032846


## Compute FRS Performance

In [1]:
import os
import time
import pandas as pd
import joblib

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

def compute_cm(cm):
  ppv = {}
  npv = {}
  sensitivity = {}
  specificity = {}

  # Calculate metrics for each class
  for i in range(len(cm)):
      TP = cm[i, i]
      FP = cm[:, i].sum() - TP
      FN = cm[i, :].sum() - TP
      TN = cm.sum() - (TP + FP + FN)

      ppv[i] = TP / (TP + FP) if (TP + FP) > 0 else 0  # Precision or Positive Predictive Value
      npv[i] = TN / (TN + FN) if (TN + FN) > 0 else 0  # Negative Predictive Value
      sensitivity[i] = TP / (TP + FN) if (TP + FN) > 0 else 0  # Recall or Sensitivity
      specificity[i] = TN / (TN + FP) if (TN + FP) > 0 else 0  # Specificity

  # Convert to DataFrame for better readability
  metrics_df = pd.DataFrame({
      'PPV': ppv,
      'NPV': npv,
      'Sensitivity': sensitivity,
      'Specificity': specificity
  })
  return metrics_df

def calculate_metrics(y_test, y_pred, y_prob, average='weighted'):
    accuracy = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_prob, multi_class='ovr', average=average) if y_prob is not None else None
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    report_df = pd.DataFrame(report).transpose()
    cm_df = pd.DataFrame(cm)
    metrics_df = compute_cm(cm)
    return accuracy, auc_score, cm_df, report_df, metrics_df

def evaluate(pipeline_file, X_test, y_test, results_dir, results_file_prefix):
    accuracy, auc_score, y_pred, y_prob = None, None, None, None
    
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    if not os.path.exists(pipeline_file):
        print('Presaved pipeline was not found')
        return
    else:
        pipeline = joblib.load(pipeline_file)

        start_time = time.time()
        y_pred = pipeline.predict(X_test)
        y_prob = pipeline.predict_proba(X_test) if hasattr(pipeline, "predict_proba") else None

        # Calculate metrics
        accuracy, auc_score, cm, report, metrics_df = calculate_metrics(y_test, y_pred, y_prob)

        print(f'Accuracy: {accuracy}')
        print(f'AUC: {auc_score}')
        print(f'Time used for prediction(s): {time.time() - start_time}')

        # Save cm image & CSV
        cm.to_csv(os.path.join(results_dir, f'{results_file_prefix}_cm.csv'), index=True)
        # Save classification report
        report.to_csv(os.path.join(results_dir, f'{results_file_prefix}_class_report.csv'), index=True)


        pd.DataFrame([accuracy, auc_score, time.time() - start_time], 
            columns=['results'], index=['Accuracy', 'AUC', 'Prediction Time']).to_csv(os.path.join(results_dir, f'{results_file_prefix}_acc_auc.csv'), index=True)
        
        metrics_df.to_csv(os.path.join(results_dir, f'{results_file_prefix}_metrics_summary.csv'), index=True)

    return  accuracy, auc_score, cm, report, metrics_df, time.time()-start_time, y_pred, y_prob

In [2]:
DATA_DIR = '/Users/malithidesilva/fyp'
RESULT_DIR = '/Users/malithidesilva/fyp/model1'

FRS_RESULTS_DIR = os.path.join(RESULT_DIR, 'frs_related')
if not os.path.exists(FRS_RESULTS_DIR):
    os.makedirs(FRS_RESULTS_DIR)

In [3]:
frs_data = joblib.load(os.path.join(RESULT_DIR, 'frs_data.pkl'))

frs_prob=frs_data['X_test_frs']['FRS_score_coeff3'] # coefficient adjusted
# frs_prob=frs_data['X_test_frs']['FRS_score'] # 10 years
frs_pred=frs_data['X_test_frs']['FRS_outcome']

data = joblib.load(os.path.join(RESULT_DIR, 'train_test_data.pkl'))

y_test = pd.DataFrame(data['y_test'])

y_test.loc[y_test['outcome'] > 0, 'outcome'] = 1

print(y_test.shape, frs_pred.shape, frs_prob.shape)

(1727, 1) (1727,) (1727,)


In [4]:
accuracy, auc_score, cm, report, metrics_df = calculate_metrics(y_test, frs_pred, frs_prob)

print(f'Accuracy: {accuracy}')
print(f'AUC: {auc_score}')

# Save cm image & CSV
cm.to_csv(os.path.join(FRS_RESULTS_DIR, f'FRS_cm.csv'), index=True)
# Save classification report
report.to_csv(os.path.join(FRS_RESULTS_DIR, f'FRS_class_report.csv'), index=True)


pd.DataFrame([accuracy, auc_score], 
    columns=['results'], index=['Accuracy', 'AUC']).to_csv(os.path.join(FRS_RESULTS_DIR, f'FRS_acc_auc.csv'), index=True)

metrics_df.to_csv(os.path.join(FRS_RESULTS_DIR, f'FRS_metrics_summary.csv'), index=True)


Accuracy: 0.986682107701216
AUC: 0.8343940155194551
