<a href="https://colab.research.google.com/github/Jdasanja/masters_thesis_final/blob/main/ACSVD_calculation_of_Cleveland.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Import pandas for loading and displaying the dataset
import pandas as pd


In [None]:

# Load the dataset from the raw GitHub URL.
# The processed Cleveland file has no header row and uses '?' as its
# missing-value marker; declaring the marker lets pandas parse the affected
# columns (e.g. 'ca', 'thal') as numbers with NaN instead of as strings.
url = 'https://raw.githubusercontent.com/Jdasanja/masers_thesis/main/processed.cleveland.data'
df = pd.read_csv(url, header=None, na_values='?')

# Add column names (based on standard attribute names for the Cleveland dataset)
df.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
              'oldpeak', 'slope', 'ca', 'thal', 'num']

# Use a 1-based unique patient identifier as the index (no in-place mutation,
# so re-running the cell always gives the same frame)
df.index = pd.RangeIndex(start=1, stop=len(df) + 1, name='patient_id')


# Display first few rows of the uploaded file with headers
print("First few rows of the uploaded file with column headers:")
print(df.head())


First few rows of the uploaded file with column headers:
             age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  \
patient_id                                                                  
1           63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0   
2           67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0   
3           67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0   
4           37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0   
5           41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0   

            oldpeak  slope   ca thal  num  
patient_id                                 
1               2.3    3.0  0.0  6.0    0  
2               1.5    2.0  3.0  3.0    2  
3               2.6    2.0  2.0  7.0    1  
4               3.5    3.0  0.0  3.0    0  
5               1.4    1.0  0.0  3.0    0  


In [None]:
# Build a working copy whose target is binary: 'num' values 1-4 become 1,
# anything else becomes 0. The original multi-class 'num' column is dropped
# from the copy; `df` itself is left untouched.
df_binary_col = (
    df.assign(num_binary=df['num'].isin([1, 2, 3, 4]).astype(int))
      .drop('num', axis=1)
)

In [None]:
pip install ascvd

Collecting ascvd
  Downloading ascvd-0.5-py3-none-any.whl.metadata (1.6 kB)
Downloading ascvd-0.5-py3-none-any.whl (4.0 kB)
Installing collected packages: ascvd
Successfully installed ascvd-0.5


In [None]:
import math
import pandas as pd

# Provided function for the 10-year risk score calculation
def compute_ten_year_score(
    isMale,
    isBlack,
    smoker,
    hypertensive,
    diabetic,
    age,
    systolicBloodPressure,
    totalCholesterol,
    hdl,
):
    """
    Estimate a 10-year cardiovascular risk percentage for one person.

    One of four coefficient sets is chosen from the (isMale, isBlack)
    combination and applied to the natural logs of age, total cholesterol,
    HDL and systolic blood pressure, together with smoking and diabetes
    terms. The resulting linear score is turned into a percentage through a
    group-specific baseline value and mean score.

    Parameters:
        isMale: truthy selects the male coefficient sets.
        isBlack: truthy selects the Black coefficient sets.
        smoker: truthy adds the smoking term(s).
        hypertensive: truthy routes the blood-pressure log through the first
            of the two pressure coefficients, falsy through the second.
        diabetic: truthy adds the diabetes term.
        age: age in years; must lie in 40..79 for a score to be produced.
        systolicBloodPressure, totalCholesterol, hdl: positive numbers
            (their logarithm is taken).

    Returns:
        The risk as a percentage rounded to one decimal place, or None when
        age is below 40 or above 79.
    """
    # Ages outside 40-79 get no score.
    if age > 79 or age < 40:
        return None

    log_age = math.log(age)
    log_chol = math.log(totalCholesterol)
    log_hdl = math.log(hdl)
    log_sbp = math.log(systolicBloodPressure)

    # Exactly one of the two pressure terms carries the log; the other is 0.
    if hypertensive:
        sbp_on, sbp_off = log_sbp, 0
    else:
        sbp_on, sbp_off = 0, log_sbp

    # Interaction terms with log(age).
    age_x_chol = log_age * log_chol
    age_x_hdl = log_age * log_hdl
    age_x_sbp_on = log_age * sbp_on
    age_x_sbp_off = log_age * sbp_off
    age_if_smoker = log_age if smoker else 0

    if isMale:
        if isBlack:
            baseline, mean_score = 0.89536, 19.5425
            linear_score = (
                2.469 * log_age
                + 0.302 * log_chol
                - 0.307 * log_hdl
                + 1.916 * sbp_on
                + 1.809 * sbp_off
                + (0.549 if smoker else 0)
                + (0.645 if diabetic else 0)
            )
        else:
            baseline, mean_score = 0.91436, 61.1816
            linear_score = (
                12.344 * log_age
                + 11.853 * log_chol
                - 2.664 * age_x_chol
                - 7.99 * log_hdl
                + 1.769 * age_x_hdl
                + 1.797 * sbp_on
                + 1.764 * sbp_off
                + (7.837 if smoker else 0)
                - 1.795 * age_if_smoker
                + (0.658 if diabetic else 0)
            )
    else:
        if isBlack:
            baseline, mean_score = 0.95334, 86.6081
            linear_score = (
                17.1141 * log_age
                + 0.9396 * log_chol
                - 18.9196 * log_hdl
                + 4.4748 * age_x_hdl
                + 29.2907 * sbp_on
                - 6.4321 * age_x_sbp_on
                + 27.8197 * sbp_off
                - 6.0873 * age_x_sbp_off
                + (0.6908 if smoker else 0)
                + (0.8738 if diabetic else 0)
            )
        else:
            baseline, mean_score = 0.96652, -29.1817
            linear_score = (
                -29.799 * log_age
                + 4.884 * log_age ** 2
                + 13.54 * log_chol
                - 3.114 * age_x_chol
                - 13.578 * log_hdl
                + 3.149 * age_x_hdl
                + 2.019 * sbp_on
                + 1.957 * sbp_off
                + (7.574 if smoker else 0)
                - 1.665 * age_if_smoker
                + (0.661 if diabetic else 0)
            )

    # Convert the centred linear score into a risk fraction, then report it
    # as a percentage with one decimal place.
    risk_fraction = 1 - baseline ** math.exp(linear_score - mean_score)
    return round(risk_fraction * 100 * 10) / 10




# Inputs the risk function needs that are filled in by assumption here:
# a constant HDL placeholder and isBlack=False for every patient (adjust if known).
df_binary_col['hdl'] = 50
df_binary_col['isBlack'] = False
# Hypertension flag: 1 when resting blood pressure ('trestbps') is >= 130, else 0.
# NOTE(review): this flag is derived from measured pressure — confirm that is
# the definition the risk function expects for `hypertensive`.
df_binary_col['hypertension'] = (df_binary_col['trestbps'] >= 130).astype(int)

In [None]:
# Calculate risk scores
def _score_patient(row):
    """Map one patient row onto the arguments of compute_ten_year_score."""
    return compute_ten_year_score(
        isMale=bool(row['sex']),
        isBlack=row['isBlack'],
        # NOTE(review): 'exang' is used as the smoker input. In the UCI
        # attribute list 'exang' is exercise-induced angina, not smoking
        # status — confirm this proxy is intended.
        smoker=bool(row['exang']),
        hypertensive=bool(row['hypertension']),
        # 'fbs' is used as the diabetes input.
        diabetic=bool(row['fbs']),
        age=int(row['age']),
        systolicBloodPressure=int(row['trestbps']),
        totalCholesterol=int(row['chol']),
        hdl=int(row['hdl']),
    )


# One score per patient; rows for which the function returns None have no score.
df_binary_col['ten_year_risk_score'] = df_binary_col.apply(_score_patient, axis=1)

# Display the DataFrame with calculated risk scores
df_binary_col[['age', 'sex', 'ten_year_risk_score']]

Unnamed: 0_level_0,age,sex,ten_year_risk_score
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,63.0,1.0,31.1
2,67.0,1.0,37.5
3,67.0,1.0,18.7
4,37.0,1.0,
5,41.0,0.0,1.1
...,...,...,...
299,45.0,1.0,2.5
300,68.0,1.0,37.5
301,57.0,1.0,9.0
302,57.0,0.0,4.3


In [None]:
# Define a binary categorization function
def binary_risk_category(score, threshold=7.5):
    """
    Convert a risk score into a binary category.

    Parameters:
        score: numeric risk score; may be None or NaN when no score exists.
        threshold: cut-off at or above which the score counts as high risk
            (defaults to 7.5, the value this notebook uses).

    Returns:
        1 if score >= threshold, otherwise 0. A missing score (None or NaN)
        also yields 0, i.e. it is categorised the same as a low score.
    """
    # NaN already compares False against any threshold; treat None the same
    # way instead of raising TypeError on the comparison.
    if score is None:
        return 0
    return 1 if score >= threshold else 0

# Categorise every patient's ten-year score and store the 0/1 result
risk_flags = df_binary_col['ten_year_risk_score'].map(binary_risk_category)
df_binary_col['binary_risk_category'] = risk_flags

# Show the score next to its binary category
df_binary_col[['age', 'sex', 'ten_year_risk_score', 'binary_risk_category']]

Unnamed: 0_level_0,age,sex,ten_year_risk_score,binary_risk_category
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,63.0,1.0,31.1,1
2,67.0,1.0,37.5,1
3,67.0,1.0,18.7,1
4,37.0,1.0,,0
5,41.0,0.0,1.1,0
...,...,...,...,...
299,45.0,1.0,2.5,0
300,68.0,1.0,37.5,1
301,57.0,1.0,9.0,1
302,57.0,0.0,4.3,0


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Ground truth labels for comparison: 'num_binary' (the binarised 'num' column)
y_true = df_binary_col['num_binary']
y_pred = df_binary_col['binary_risk_category']

# Threshold-based metrics for the PCE model, computed over every patient.
# Patients without a risk score are included as predicted 0, because that is
# the category they were assigned.
brc_accuracy = accuracy_score(y_true, y_pred)
brc_precision = precision_score(y_true, y_pred)
brc_recall = recall_score(y_true, y_pred)
brc_f1 = f1_score(y_true, y_pred)

# AUC-ROC measures ranking quality, so it needs the continuous risk score;
# feeding it the thresholded 0/1 labels collapses the ROC curve to one point.
# Patients without a score cannot be ranked and are left out of this metric only.
has_score = df_binary_col['ten_year_risk_score'].notna()
brc_auc = roc_auc_score(y_true[has_score], df_binary_col.loc[has_score, 'ten_year_risk_score'])


In [None]:

# Assemble the report once and emit it with a single print
report_lines = [
    "BRC Model Performance:",
    f"Accuracy: {brc_accuracy}",
    f"Precision: {brc_precision}",
    f"Recall: {brc_recall}",
    f"F1 Score: {brc_f1}",
    f"AUC-ROC: {brc_auc}",
]
print("\n".join(report_lines))

BRC Model Performance:
Accuracy: 0.6963696369636964
Precision: 0.6358381502890174
Recall: 0.7913669064748201
F1 Score: 0.7051282051282052
AUC-ROC: 0.7036102825057028


<h1> Male Vs. Female </h1>

In [None]:
# Partition patients by the 'sex' column (1 -> male, 0 -> female); each subset
# gets a fresh 0-based index, so 'patient_id' is not carried over.
sex_codes = df_binary_col['sex']
df_male = df_binary_col.loc[sex_codes == 1].reset_index(drop=True)
df_female = df_binary_col.loc[sex_codes == 0].reset_index(drop=True)

<h1> Female </h1>

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Ground truth labels for comparison: 'num_binary' (the binarised 'num' column)
y_true = df_female['num_binary']
y_pred = df_female['binary_risk_category']

# Threshold-based metrics for the PCE model on the female subset.
# Patients without a risk score are included as predicted 0, because that is
# the category they were assigned.
brc_accuracy = accuracy_score(y_true, y_pred)
brc_precision = precision_score(y_true, y_pred)
brc_recall = recall_score(y_true, y_pred)
brc_f1 = f1_score(y_true, y_pred)

# AUC-ROC measures ranking quality, so it needs the continuous risk score;
# feeding it the thresholded 0/1 labels collapses the ROC curve to one point.
# Patients without a score cannot be ranked and are left out of this metric only.
has_score = df_female['ten_year_risk_score'].notna()
brc_auc = roc_auc_score(y_true[has_score], df_female.loc[has_score, 'ten_year_risk_score'])


# Display comparison results
print("BRC Model Performance:")
print(f"Accuracy: {brc_accuracy}")
print(f"Precision: {brc_precision}")
print(f"Recall: {brc_recall}")
print(f"F1 Score: {brc_f1}")
print(f"AUC-ROC: {brc_auc}")

BRC Model Performance:
Accuracy: 0.7319587628865979
Precision: 0.4888888888888889
Recall: 0.88
F1 Score: 0.6285714285714286
AUC-ROC: 0.7802777777777777


<h1> Male </h1>

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Ground truth labels for comparison: 'num_binary' (the binarised 'num' column)
y_true = df_male['num_binary']
y_pred = df_male['binary_risk_category']

# Threshold-based metrics for the PCE model on the male subset.
# Patients without a risk score are included as predicted 0, because that is
# the category they were assigned.
brc_accuracy = accuracy_score(y_true, y_pred)
brc_precision = precision_score(y_true, y_pred)
brc_recall = recall_score(y_true, y_pred)
brc_f1 = f1_score(y_true, y_pred)

# AUC-ROC measures ranking quality, so it needs the continuous risk score;
# feeding it the thresholded 0/1 labels collapses the ROC curve to one point.
# Patients without a score cannot be ranked and are left out of this metric only.
has_score = df_male['ten_year_risk_score'].notna()
brc_auc = roc_auc_score(y_true[has_score], df_male.loc[has_score, 'ten_year_risk_score'])


# Display comparison results
print("BRC Model Performance:")
print(f"Accuracy: {brc_accuracy}")
print(f"Precision: {brc_precision}")
print(f"Recall: {brc_recall}")
print(f"F1 Score: {brc_f1}")
print(f"AUC-ROC: {brc_auc}")

BRC Model Performance:
Accuracy: 0.6796116504854369
Precision: 0.6875
Recall: 0.7719298245614035
F1 Score: 0.7272727272727273
AUC-ROC: 0.6685736079328757
