In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error

# Load the saved model and tokenizer
model_path = "/Users/Ishini Fernando/Downloads/ishini/ishini/backend/final-tuned-model"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Set pad token ID
model.config.pad_token_id = tokenizer.eos_token_id

def generate_next_hour(patient_info, model, tokenizer, top_k=5):
    # Tokenize the input text
    input_ids = tokenizer.encode(patient_info, return_tensors="pt")

    # Generate text for the next hour
    output = model.generate(input_ids,
                            max_length=250,
                            num_return_sequences=1,
                            temperature=0.7,
                            pad_token_id=tokenizer.eos_token_id,
                            top_k=top_k,
                            top_p=0.95,
                            do_sample=True,
                            num_beams=1)

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract only the information for the next hour
    next_hour_info = generated_text.split(" at hour ")[-1]

    return next_hour_info

# Convert test data into sentence prompts
patients_test_data = pd.read_csv('/content/drive/My Drive/FYP/Test10%.csv').to_numpy()

paragraphs = []
for index, row in dpatients_test_data.iterrows():

    patient_id = row['ID']
    sbp = row['SBP']
    dbp = row['DBP']
    hr = row['HR']
    rr = row['RR']
    bt = row['BT']
    spo2 = row['SpO2']
    age = row['Age']
    gender = 'male' if row['Gender'] == 1 else 'female'
    na = row['Na']
    k = row['K']
    cl = row['Cl']
    urea = row['Urea']
    creatinine = row['Ceratinine']
    alcoholic = row['Alcoholic']
    smoke = row['Smoke']
    fhcd = row['FHCD']
    outcome = row['Outcome']
    hour = row['Hour']

    sentence = f"Patient of Patient ID {patient_id} is {'not ' if smoke == 0 else ''}a smoker and {'not ' if alcoholic == 0 else ''}an alcoholic {gender} with {'no ' if fhcd == 0 else ''}Family History of Ischemic Heart Diseases at age {age} has {sbp} mmHg Systolic Blood Pressure, {dbp} mmHg Diastolic Blood Pressure, {hr} beats per minute Heart Rate, {rr} breaths per minute Respiratory Rate, {bt} fahrenheit Body temperature , {spo2} mEq/L SpO2, {na} mEq/L Sodium Level, {k} mEq/L Potassium Level , {cl} mEq/L Chloride Level, {urea} mg/dL Urea, {creatinine} mg/dL Creatinine at hour {hour} is {'at critical risk of death.' if outcome == 0 else 'on the path of recovery with continued treatment.'}"

    paragraphs.append(sentence)

output_df = pd.DataFrame({'Paragraph': paragraphs})

# Calculate metrics
predictions = []
targets = []
for data in output_df:
    input_text = data["input"]
    target_output = data["target"]

    # Generate prediction for input text
    prediction = generate_next_hour(input_text, model, tokenizer)

    # Store prediction and target
    predictions.append(prediction)
    targets.append(target_output)

# Calculate metrics
accuracy = accuracy_score(targets, predictions)
precision = precision_score(targets, predictions, average='weighted')
recall = recall_score(targets, predictions, average='weighted')
f1 = f1_score(targets, predictions, average='weighted')
mse = mean_squared_error(targets, predictions)
mae = mean_absolute_error(targets, predictions)

print("\n********Measures of********  ", model)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)


********Measures of********  CETU-LMM
Mean Squared Error (MSE): 0.2989734898653294
Mean Absolute Error (MAE): 0.3098112095428476

********Measures of********  CETU-LMM-01
Mean Squared Error (MSE): 0.3020799877642517
Mean Absolute Error (MAE): 0.3211958622944869

********Measures of********  CETU-LMM-02
Mean Squared Error (MSE): 0.30016653332167265
Mean Absolute Error (MAE): 0.17830026778925781

********Measures of********  CETU-LMM03
Mean Squared Error (MSE): 0.30016653332167265
Mean Absolute Error (MAE): 0.17830026778925781


In [4]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error

# Experiment data set. Import each experiment data set separately
patients_training_data = pd.read_csv('/content/drive/My Drive/FYP/ExperimentNo1Data.csv').to_numpy()
patients_test_data = pd.read_csv('/content/drive/My Drive/FYP/Test10%.csv').to_numpy()

x_train = patients_training_data[:, :-2]
y_train = patients_training_data[:, -1]

x_test = patients_test_data[:, :-2]
y_test = patients_test_data[:, -1]

def accuracy_metrics(trueLabel, predlabel, name):
    print('\n********Measures of******** ', name)
    con = confusion_matrix(trueLabel, predlabel)
    total = sum(sum(con))
    print('accuracy =', accuracy_score(trueLabel, predlabel))
    print('sensitivity =', recall_score(trueLabel, predlabel))
    print('specificity =', con[1, 1] / (con[1, 1] + con[0, 1]))
    print('ppv =', precision_score(trueLabel, predlabel))
    print('npv =', con[1, 1] / (con[0, 1] + con[1, 1]))
    print('fscore =', f1_score(trueLabel, predlabel))
    print('Mean Squared Error (MSE) =', mean_squared_error(trueLabel, predlabel))
    print('Mean Absolute Error (MAE) =', mean_absolute_error(trueLabel, predlabel))

# Logistic Regression Model
lin_model = LogisticRegression(solver='lbfgs')
lin_model.fit(x_train, y_train)
lmodel_pred = lin_model.predict(x_test)
print(confusion_matrix(y_test, lmodel_pred))
print("Linear Model Accuracy: ", lin_model.score(x_test, y_test))
accuracy_metrics(y_test, lmodel_pred, 'Logistic Regression')

# Support Vector Machine Model
svm_model = SVC(gamma='auto')
svm_model.fit(x_train, y_train)
svm_pred = svm_model.predict(x_test)
print("Support Vector Machine Model Accuracy: ", svm_model.score(x_test, y_test))
accuracy_metrics(y_test, svm_pred, 'SVM Model')

# Decision Tree Model
tree_model = DecisionTreeClassifier()
tree_model.fit(x_train, y_train)
tree_pred = tree_model.predict(x_test)
print("Decision Tree Model Accuracy: ", tree_model.score(x_test, y_test))
accuracy_metrics(y_test, tree_pred, 'Decision Tree')

# Random Forest Model
forest_model = RandomForestClassifier(n_estimators=100)
forest_model.fit(x_train, y_train)
forest_pred = forest_model.predict(x_test)
print("Random Forest Model Accuracy: ", forest_model.score(x_test, y_test))
accuracy_metrics(y_test, forest_pred, 'Random Forest')

Linear Model Accuracy:  0.7372881355932204

********Measures of********  Logistic Regression
accuracy = 0.8813559322042934
sensitivity = 0.95
specificity = 0.8135593220491723
ppv = 0.9881355932204661
npv = 0.8144067797518893
fscore = 0.86487804878048786
Mean Squared Error (MSE) = 0.2793301946793201
Mean Absolute Error (MAE) = 0.2992004791247734
Support Vector Machine Model Accuracy:  0.8813559322042934

********Measures of********  SVM Model
accuracy = 0.8925537682016845
sensitivity = 0.82
specificity = 0.0.8240511317982054
ppv = 0.8115036888305744
npv = 0.8190043769467233
fscore = 0.8440668834792169
Mean Squared Error (MSE) = 0.2627118644067797
Mean Absolute Error (MAE) = 0.2864400962747697
Support Vector Machine Model Accuracy:   0.8925537682016845

********Measures of********  Decision Tree
accuracy = 0.7640029337581102
sensitivity = 0.69
specificity = 0.80833690541934861
ppv = 0.7226118340104471
npv = 0.7893002168835449
fscore = 0.8002926630171495
Mean Squared Error (MSE) = 0.30079