In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from pathlib import Path
from typing import List

In [2]:
# gather results
def calculate_metrics(y_true: List[float], y_pred: List[float]):
    """Score predictions against reference values with three regression metrics.

    Args:
        y_true: Reference (label) values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        A dict with the keys ``"mae"`` (mean absolute error), ``"rmse"``
        (square root of the mean squared error) and ``"r2"`` (coefficient of
        determination), in that order.
    """
    targets, estimates = np.array(y_true), np.array(y_pred)
    # RMSE is derived by taking the square root of scikit-learn's MSE.
    mse = mean_squared_error(targets, estimates)
    return dict(
        mae=mean_absolute_error(targets, estimates),
        rmse=np.sqrt(mse),
        r2=r2_score(targets, estimates),
    )

def gather_results(results_dir: Path, model_name: str, is_direct: bool = False) -> List[dict]:
    """Walk a results tree and compute regression metrics for every property.

    The directory layout that is traversed is
    ``results_dir/<split_method>/<dataset>_<fold>/<property>/``, where each
    property directory holds ``valid.csv`` (and ``train.csv`` when needed)
    with ``label`` and ``predictions`` columns.

    Entries that do not fit this layout are skipped instead of aborting the
    whole scan: plain files at any level, and dataset/fold directories whose
    name has no ``_`` separator.

    Args:
        results_dir: Root directory containing one sub-directory per split
            method.
        model_name: Model identifier copied verbatim into every record.
        is_direct: Flag copied into every record. When true, every dataset
            other than ``"consolidation"`` is scored on ``train.csv`` and
            ``valid.csv`` concatenated; in all other cases only ``valid.csv``
            is scored.

    Returns:
        One dict per property directory with the keys ``dataset``,
        ``split_method``, ``fold``, ``property``, ``model``, ``is_direct``
        plus the metrics returned by ``calculate_metrics``.

    Raises:
        FileNotFoundError: If a required CSV is missing from a property
            directory (raised by ``pandas.read_csv``).
        KeyError: If a CSV lacks the ``label`` or ``predictions`` column.
    """
    records = []
    for split_dir in results_dir.iterdir():
        if not split_dir.is_dir():
            continue
        for fold_dir in split_dir.iterdir():
            # Stray files (.DS_Store, logs, ...) would make the nested
            # iterdir() below raise NotADirectoryError.
            if not fold_dir.is_dir():
                continue
            name_parts = fold_dir.name.split("_")
            if len(name_parts) < 2:
                # Not a "<dataset>_<fold>" directory; nothing to score here.
                continue
            dataset_name, fold_name = name_parts[0], name_parts[1]
            # Only the "direct" runs of non-consolidation datasets are scored
            # on train + valid; everything else uses valid.csv alone.
            use_train_and_valid = is_direct and dataset_name != "consolidation"
            for property_dir in fold_dir.iterdir():
                if not property_dir.is_dir():
                    continue
                if use_train_and_valid:
                    train_df = pd.read_csv(property_dir / "train.csv")
                    valid_df = pd.read_csv(property_dir / "valid.csv")
                    df = pd.concat([train_df, valid_df])
                else:
                    df = pd.read_csv(property_dir / "valid.csv")

                record = {
                    "dataset": dataset_name,
                    "split_method": split_dir.name,
                    "fold": fold_name,
                    "property": property_dir.name,
                    "model": model_name,
                    "is_direct": is_direct,
                }
                record.update(
                    calculate_metrics(df["label"].tolist(), df["predictions"].tolist())
                )
                records.append(record)
    return records

In [3]:
# Accumulator for the metric records of every model / mode gathered in this notebook.
results_list = []

# KPGT-Fluor: gather the indirect run first, then the direct run.
model_name = "KPGT-Fluor"
for result_path, is_direct in (
    (Path("../results"), False),
    (Path("../results_direct"), True),
):
    results = gather_results(result_path, model_name, is_direct)
    print(f"{model_name} ({'direct' if is_direct else 'indirect'}): {len(results)}")
    results_list.extend(results)

KPGT-Fluor (indirect): 120
KPGT-Fluor (direct): 80


In [4]:
# rf: gather the indirect run first, then the direct run.
model_name = "rf"
for result_path, is_direct in (
    (Path("../results_rf"), False),
    (Path("../results_rf_direct"), True),
):
    results = gather_results(result_path, model_name, is_direct)
    print(f"{model_name} ({'direct' if is_direct else 'indirect'}): {len(results)}")
    results_list.extend(results)

rf (indirect): 120
rf (direct): 80


In [5]:
# lightgbm: gather the indirect run first, then the direct run.
model_name = "lightgbm"
for result_path, is_direct in (
    (Path("../results_lightgbm"), False),
    (Path("../results_lightgbm_direct"), True),
):
    results = gather_results(result_path, model_name, is_direct)
    print(f"{model_name} ({'direct' if is_direct else 'indirect'}): {len(results)}")
    results_list.extend(results)

lightgbm (indirect): 120
lightgbm (direct): 80


In [6]:
# gbrt: gather the indirect run first, then the direct run.
model_name = "gbrt"
for result_path, is_direct in (
    (Path("../results_gbrt"), False),
    (Path("../results_gbrt_direct"), True),
):
    results = gather_results(result_path, model_name, is_direct)
    print(f"{model_name} ({'direct' if is_direct else 'indirect'}): {len(results)}")
    results_list.extend(results)

gbrt (indirect): 120
gbrt (direct): 80


In [7]:
# svr: gather the indirect run first, then the direct run.
model_name = "svr"
for result_path, is_direct in (
    (Path("../results_svr"), False),
    (Path("../results_svr_direct"), True),
):
    results = gather_results(result_path, model_name, is_direct)
    print(f"{model_name} ({'direct' if is_direct else 'indirect'}): {len(results)}")
    results_list.extend(results)

svr (indirect): 120
svr (direct): 80


In [8]:
# Build one table from all collected metric records and persist it as CSV
# (without the positional index column).
results_df = pd.DataFrame(data=results_list)
results_df.to_csv("../results.csv", index=False)

In [9]:
# Row count of the combined table: one row per gathered metrics record.
results_df.shape[0]

1000

In [10]:
# Preview the first five rows of the combined metrics table.
results_df.head(n=5)

Unnamed: 0,dataset,split_method,fold,property,model,is_direct,mae,rmse,r2
0,consolidation,scaffold,fold0,absorption,KPGT-Fluor,False,21.5669,33.268852,0.912601
1,consolidation,scaffold,fold0,emission,KPGT-Fluor,False,25.671684,36.180875,0.853202
2,consolidation,scaffold,fold0,quantum_yield,KPGT-Fluor,False,0.140766,0.199598,0.55019
3,consolidation,scaffold,fold0,log_molar_absorptivity,KPGT-Fluor,False,0.154325,0.225147,0.818386
4,cyanine,scaffold,fold0,absorption,KPGT-Fluor,False,17.968945,26.063522,0.942118
