<a href="https://colab.research.google.com/github/Jdasanja/masters_thesis_final/blob/main/ACSDV_Calculation_4_va_longbeach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import pandas for reading and displaying local file
import pandas as pd

In [None]:
# Fetch the processed VA heart-disease file from the raw GitHub URL (no header row in the file)
url = 'https://raw.githubusercontent.com/Jdasanja/masers_thesis/main/processed.va.data'
df = pd.read_csv(url, header=None)

# Attach the 14 attribute names (the same names conventionally used for the Cleveland dataset)
df.columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
]

# Number the patients 1..N and promote that identifier to the row index
df = df.assign(patient_id=range(1, len(df) + 1)).set_index('patient_id')

# Preview the labelled data
print("First few rows of the uploaded file with column headers:")
print(df.head())


First few rows of the uploaded file with column headers:
            age  sex  cp trestbps chol fbs  restecg thalach exang oldpeak  \
patient_id                                                                  
1            63    1   4      140  260   0        1     112     1       3   
2            44    1   4      130  209   0        1     127     0       0   
3            60    1   4      132  218   0        1     140     1     1.5   
4            55    1   4      142  228   0        1     149     1     2.5   
5            66    1   3      110  213   1        2      99     1     1.3   

           slope ca thal  num  
patient_id                     
1              2  ?    ?    2  
2              ?  ?    ?    0  
3              3  ?    ?    2  
4              1  ?    ?    1  
5              2  ?    ?    0  


In [None]:
# Build a working copy whose target is binary: 'num' values 1-4 map to 1, anything else to 0.
# The multi-class 'num' column is removed once 'num_binary' has been derived from it.
df_binary_col = (
    df.assign(num_binary=df['num'].isin([1, 2, 3, 4]).astype('int64'))
      .drop(columns='num')
)

In [None]:
import numpy as np

## Treat the '?' placeholders as missing values (NaN)
df_binary_col = df_binary_col.replace('?', np.nan)

## Per-column count of missing values
df_binary_col.isna().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,56
chol,7
fbs,7
restecg,0
thalach,53
exang,53
oldpeak,56


In [None]:

# Inspect dtypes / non-null counts, then summary statistics of the working frame
print("DataFrame.info():")
df_binary_col.info()

print("\nDataFrame.describe():", df_binary_col.describe(), sep="\n")


DataFrame.info():
<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   age         200 non-null    int64 
 1   sex         200 non-null    int64 
 2   cp          200 non-null    int64 
 3   trestbps    144 non-null    object
 4   chol        193 non-null    object
 5   fbs         193 non-null    object
 6   restecg     200 non-null    int64 
 7   thalach     147 non-null    object
 8   exang       147 non-null    object
 9   oldpeak     144 non-null    object
 10  slope       98 non-null     object
 11  ca          2 non-null      object
 12  thal        34 non-null     object
 13  num_binary  200 non-null    int64 
dtypes: int64(5), object(9)
memory usage: 23.4+ KB

DataFrame.describe():
              age         sex          cp     restecg  num_binary
count  200.000000  200.000000  200.000000  200.000000  200.000000
mean    59.350000    0.970000    3.5

In [None]:
# Coerce every column to a numeric dtype; entries that cannot be parsed become NaN
df_binary_col = df_binary_col.apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Confirm the dtypes after the conversion
print("DataFrame info after conversion to numeric:")
df_binary_col.info()

# Summary statistics of the converted frame
print("\nDataFrame.describe():", df_binary_col.describe(), sep="\n")


DataFrame info after conversion to numeric:
<class 'pandas.core.frame.DataFrame'>
Index: 200 entries, 1 to 200
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         200 non-null    int64  
 1   sex         200 non-null    int64  
 2   cp          200 non-null    int64  
 3   trestbps    144 non-null    float64
 4   chol        193 non-null    float64
 5   fbs         193 non-null    float64
 6   restecg     200 non-null    int64  
 7   thalach     147 non-null    float64
 8   exang       147 non-null    float64
 9   oldpeak     144 non-null    float64
 10  slope       98 non-null     float64
 11  ca          2 non-null      float64
 12  thal        34 non-null     float64
 13  num_binary  200 non-null    int64  
dtypes: float64(9), int64(5)
memory usage: 23.4 KB

DataFrame.describe():
              age         sex          cp    trestbps        chol         fbs  \
count  200.000000  200.000000  200.000000  14

In [None]:
import pandas as pd
import numpy as np

# Remove the columns that are almost entirely missing (ignored if already dropped)
df_binary_col = df_binary_col.drop(columns=['ca', 'thal'], errors='ignore')

# Columns grouped by the statistic used to fill their gaps
mean_cols = ['trestbps', 'thalach']      # treated as normally distributed -> mean
median_cols = ['chol', 'oldpeak']        # treated as skewed -> median
mode_cols = ['fbs', 'exang', 'slope']    # binary/categorical -> most frequent value

# One pass per group: compute the fill value from the column itself, then fill its NaNs
imputation_plan = [
    (mean_cols, lambda series: series.mean()),
    (median_cols, lambda series: series.median()),
    (mode_cols, lambda series: series.mode()[0]),
]
for cols, fill_statistic in imputation_plan:
    for col in cols:
        df_binary_col[col] = df_binary_col[col].fillna(fill_statistic(df_binary_col[col]))

# Outlier treatment: clip to the 1.5 * IQR fences
def handle_outliers_with_IQR(data, column):
    """Clip ``data[column]`` to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    The quartiles are computed from the column itself. The clipped values
    are written back into ``data`` (the frame is modified); nothing is returned.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread
    data[column] = values.clip(lower=fence_low, upper=fence_high)

# Continuous measurements that receive IQR clipping
continuous_cols = ['trestbps', 'chol', 'thalach', 'oldpeak']

# Clip each continuous column in turn (the helper updates the frame directly)
for col in continuous_cols:
    handle_outliers_with_IQR(df_binary_col, col)

# Discard any row that still contains a missing value
df_binary_col = df_binary_col.dropna(axis=0, how='any')

# Sanity check: per-column missing-value counts
print("Missing values in df_binary_col after all modifications:", df_binary_col.isna().sum(), sep="\n")

# Sanity check: ranges of the clipped columns
print("After outlier handling - data overview:", df_binary_col[continuous_cols].describe(), sep="\n")


Missing values in df_binary_col after all modifications:
age           0
sex           0
cp            0
trestbps      0
chol          0
fbs           0
restecg       0
thalach       0
exang         0
oldpeak       0
slope         0
num_binary    0
dtype: int64
After outlier handling - data overview:
         trestbps        chol     thalach     oldpeak
count  200.000000  200.000000  200.000000  200.000000
mean   133.727014  179.971875  122.575918    1.361000
std     14.004410  112.032535   17.357928    0.915406
min    104.375000    0.000000   85.000000   -0.500000
25%    125.750000  129.250000  112.000000    1.000000
50%    133.763889  216.000000  122.795918    1.500000
75%    140.000000  254.500000  130.000000    2.000000
max    161.375000  442.375000  157.000000    3.500000


In [None]:
# Flag the rows whose cholesterol or resting blood pressure is recorded as 0
chol_is_zero = df_binary_col['chol'] == 0
trestbps_is_zero = df_binary_col['trestbps'] == 0

# Report how many rows carry each zero value
chol_zero_count = int(chol_is_zero.sum())
print(f"Number of instances where 'chol' is 0: {chol_zero_count}")

trestbps_count = int(trestbps_is_zero.sum())
print(f"Number of instances where 'trestbps' is 0: {trestbps_count}")

# Keep only the rows where neither measurement is 0
df_binary_col = df_binary_col[~(chol_is_zero | trestbps_is_zero)]

# Confirm that no zero values remain
print("After removal:")
print(f"Number of instances where 'chol' is 0: {int((df_binary_col['chol'] == 0).sum())}")
print(f"Number of instances where 'trestbps' is 0: {int((df_binary_col['trestbps'] == 0).sum())}")


Number of instances where 'chol' is 0: 49
Number of instances where 'trestbps' is 0: 0
After removal:
Number of instances where 'chol' is 0: 0
Number of instances where 'trestbps' is 0: 0


In [None]:
#pip install ascvd

In [None]:
import math
import pandas as pd

# 10-year risk score calculation (sex- and race-specific coefficient sets)
def compute_ten_year_score(
    isMale,
    isBlack,
    smoker,
    hypertensive,
    diabetic,
    age,
    systolicBloodPressure,
    totalCholesterol,
    hdl,
):
    """
    Compute the 10-year cardiovascular risk score, in percent.

    Parameters
    ----------
    isMale, isBlack : truthy/falsy
        Select one of four coefficient sets (female/male x Black/non-Black).
    smoker, diabetic : truthy/falsy
        Add the corresponding coefficient terms when truthy.
    hypertensive : truthy/falsy
        When truthy, log(systolicBloodPressure) enters through the "treated"
        SBP coefficients, otherwise through the "untreated" ones.
        NOTE(review): presumably this denotes being on blood-pressure
        treatment -- confirm against how callers derive the flag.
    age : number
        Must lie in [40, 79]; outside that range no score is computed.
    systolicBloodPressure, totalCholesterol, hdl : positive numbers
        Their natural logarithms are taken.

    Returns
    -------
    float or None
        Risk percentage rounded to one decimal place, or None when ``age``
        is below 40 or above 79.

    Raises
    ------
    ValueError
        From ``math.log`` when age, cholesterol, HDL or blood pressure is
        not positive.
    """
    if age > 79 or age < 40:
        return None

    log_age = math.log(age)
    log_chol = math.log(totalCholesterol)
    log_hdl = math.log(hdl)
    log_sbp = math.log(systolicBloodPressure)

    # Exactly one of the two SBP terms is active, depending on the flag.
    if hypertensive:
        treated_sbp, untreated_sbp = log_sbp, 0
    else:
        treated_sbp, untreated_sbp = 0, log_sbp

    # Interaction terms with log(age).
    age_x_chol = log_age * log_chol
    age_x_hdl = log_age * log_hdl
    age_x_treated = log_age * treated_sbp
    age_x_untreated = log_age * untreated_sbp
    age_x_smoker = log_age if smoker else 0

    smoker_flag = 1 if smoker else 0
    diabetic_flag = 1 if diabetic else 0

    # Term order inside each sum mirrors the reference implementation so the
    # floating-point result is unchanged.
    if isMale:
        if isBlack:
            baseline_survival = 0.89536
            mean_score = 19.5425
            linear_score = (
                2.469 * log_age
                + 0.302 * log_chol
                - 0.307 * log_hdl
                + 1.916 * treated_sbp
                + 1.809 * untreated_sbp
                + 0.549 * smoker_flag
                + 0.645 * diabetic_flag
            )
        else:
            baseline_survival = 0.91436
            mean_score = 61.1816
            linear_score = (
                12.344 * log_age
                + 11.853 * log_chol
                - 2.664 * age_x_chol
                - 7.99 * log_hdl
                + 1.769 * age_x_hdl
                + 1.797 * treated_sbp
                + 1.764 * untreated_sbp
                + 7.837 * smoker_flag
                - 1.795 * age_x_smoker
                + 0.658 * diabetic_flag
            )
    else:
        if isBlack:
            baseline_survival = 0.95334
            mean_score = 86.6081
            linear_score = (
                17.1141 * log_age
                + 0.9396 * log_chol
                - 18.9196 * log_hdl
                + 4.4748 * age_x_hdl
                + 29.2907 * treated_sbp
                - 6.4321 * age_x_treated
                + 27.8197 * untreated_sbp
                - 6.0873 * age_x_untreated
                + 0.6908 * smoker_flag
                + 0.8738 * diabetic_flag
            )
        else:
            baseline_survival = 0.96652
            mean_score = -29.1817
            linear_score = (
                -29.799 * log_age
                + 4.884 * log_age ** 2
                + 13.54 * log_chol
                - 3.114 * age_x_chol
                - 13.578 * log_hdl
                + 3.149 * age_x_hdl
                + 2.019 * treated_sbp
                + 1.957 * untreated_sbp
                + 7.574 * smoker_flag
                - 1.665 * age_x_smoker
                + 0.661 * diabetic_flag
            )

    # Risk = 1 - S0 ** exp(individual score - mean score), reported to 0.1 %.
    risk = 1 - baseline_survival ** math.exp(linear_score - mean_score)
    return round(risk * 100 * 10) / 10




# Inputs the risk function needs but that are filled in here by assumption
df_binary_col['hdl'] = 50  # Placeholder HDL value, identical for every patient
df_binary_col['isBlack'] = False  # Assume False for all; adjust if known

# Hypertension flag: 1 when resting blood pressure exceeds 130, otherwise 0
df_binary_col['hypertension'] = (df_binary_col['trestbps'] > 130).astype('int64')

# Spot-check the derived flag against the pressure it came from
print(df_binary_col[['trestbps', 'hypertension']].head())


            trestbps  hypertension
patient_id                        
1              140.0             1
2              130.0             0
3              132.0             1
4              142.0             1
5              110.0             0


In [None]:
# Re-check the per-column missing-value counts on the current frame
print("Missing values in df_binary_col after all modifications:", df_binary_col.isna().sum(), sep="\n")

# Re-check the ranges of the continuous columns on the current frame
print("After outlier handling - data overview:", df_binary_col[continuous_cols].describe(), sep="\n")


Missing values in df_binary_col after all modifications:
age             0
sex             0
cp              0
trestbps        0
chol            0
fbs             0
restecg         0
thalach         0
exang           0
oldpeak         0
slope           0
num_binary      0
hdl             0
isBlack         0
hypertension    0
dtype: int64
After outlier handling - data overview:
         trestbps        chol     thalach     oldpeak
count  151.000000  151.000000  151.000000  151.000000
mean   134.912252  238.373344  122.390864    1.368874
std     13.890791   51.353811   17.491866    0.918418
min    104.375000  100.000000   85.000000    0.000000
25%    130.000000  209.500000  112.000000    1.000000
50%    133.763889  225.000000  122.795918    1.500000
75%    142.000000  270.000000  130.000000    1.850000
max    161.375000  442.375000  157.000000    3.500000


In [None]:
# Score every patient row with the 10-year risk function.
# NOTE(review): 'exang' is passed as the smoker flag and 'fbs' as the diabetic
# flag -- confirm these stand-ins are intended.
# Numeric inputs are truncated to int before being passed on.
df_binary_col['ten_year_risk_score'] = df_binary_col.apply(
    lambda row: compute_ten_year_score(
        isMale=bool(row['sex']),
        isBlack=row['isBlack'],
        smoker=bool(row['exang']),
        hypertensive=bool(row['hypertension']),
        diabetic=bool(row['fbs']),
        age=int(row['age']),
        systolicBloodPressure=int(row['trestbps']),
        totalCholesterol=int(row['chol']),
        hdl=int(row['hdl']),
    ),
    axis=1,
)

# Show the resulting scores next to age and sex
df_binary_col.loc[:, ['age', 'sex', 'ten_year_risk_score']]

Unnamed: 0_level_0,age,sex,ten_year_risk_score
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,63,1,25.5
2,44,1,2.0
3,60,1,17.9
4,55,1,16.5
5,66,1,26.8
...,...,...,...
196,54,0,6.5
197,62,1,14.0
198,55,1,11.0
199,58,1,46.5


In [None]:
# Summary statistics of the three inputs whose logarithms the risk function takes
print(df_binary_col.loc[:, ['chol', 'hdl', 'trestbps']].describe())


             chol    hdl    trestbps
count  151.000000  151.0  151.000000
mean   238.373344   50.0  134.912252
std     51.353811    0.0   13.890791
min    100.000000   50.0  104.375000
25%    209.500000   50.0  130.000000
50%    225.000000   50.0  133.763889
75%    270.000000   50.0  142.000000
max    442.375000   50.0  161.375000


In [None]:
# Rows where any log-transformed input is zero or negative (math.log would fail on these)
invalid_rows = df_binary_col[
    df_binary_col[['chol', 'hdl', 'trestbps']].le(0).any(axis=1)
]
print("Invalid rows causing math domain error:")
print(invalid_rows)


Invalid rows causing math domain error:
Empty DataFrame
Columns: [age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, num_binary, hdl, isBlack, hypertension, ten_year_risk_score]
Index: []


In [None]:
# Define a binary categorization function
def binary_risk_category(score, threshold=7.5):
    """Label a 10-year risk score as elevated (1) or not (0).

    Parameters
    ----------
    score : float
        Risk score in percent.
    threshold : float, default 7.5
        Cut-off in percent; scores greater than or equal to it are labelled 1.
        The default keeps the notebook's original 7.5 cut-off.

    Returns
    -------
    int
        1 if ``score >= threshold``, otherwise 0. A NaN score compares False
        and therefore yields 0, so missing scores are labelled 0 rather than
        being reported as missing.
    """
    return 1 if score >= threshold else 0

# Label every patient by mapping the categorization function over the risk scores
df_binary_col['binary_risk_category'] = df_binary_col['ten_year_risk_score'].map(binary_risk_category)

# Show the scores alongside their binary label
df_binary_col.loc[:, ['age', 'sex', 'ten_year_risk_score', 'binary_risk_category']]

Unnamed: 0_level_0,age,sex,ten_year_risk_score,binary_risk_category
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,63,1,25.5,1
2,44,1,2.0,0
3,60,1,17.9,1
4,55,1,16.5,1
5,66,1,26.8,1
...,...,...,...,...
196,54,0,6.5,0
197,62,1,14.0,1
198,55,1,11.0,1
199,58,1,46.5,1


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Ground truth: the binary disease label derived from 'num'
y_true = df_binary_col['num_binary']

# Thresholded predictions drive the label-based metrics
y_pred = df_binary_col['binary_risk_category']

# Calculate metrics for the PCE model
brc_accuracy = accuracy_score(y_true, y_pred)
brc_precision = precision_score(y_true, y_pred)
brc_recall = recall_score(y_true, y_pred)
brc_f1 = f1_score(y_true, y_pred)

# AUC-ROC measures ranking quality, so it needs the continuous risk score.
# Feeding it the 0/1 labels collapses the ROC curve to a single operating
# point and understates discrimination. Rows without a score (the risk
# function returns None outside its supported age range) cannot be ranked
# and are left out of the AUC only.
risk_scores = df_binary_col['ten_year_risk_score']
has_score = risk_scores.notna()
brc_auc = roc_auc_score(y_true[has_score], risk_scores[has_score].astype(float))

In [None]:
# Report the evaluation metrics, one per line
print("BRC Model Performance:")
for label, value in (
    ("Accuracy", brc_accuracy),
    ("Precision", brc_precision),
    ("Recall", brc_recall),
    ("F1 Score", brc_f1),
    ("AUC-ROC", brc_auc),
):
    print(f"{label}: {value}")

BRC Model Performance:
Accuracy: 0.7682119205298014
Precision: 0.7894736842105263
Recall: 0.9375
F1 Score: 0.8571428571428571
AUC-ROC: 0.609775641025641
