In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.datasets import make_classification, make_regression
from ProQSAR.ModelDeveloper.model_developer import ModelDeveloper
from ProQSAR.Uncertainty.conformal_predictor import ConformalPredictor

def create_classification_data(
    n_samples=40, n_features=40, n_informative=10, random_state=42
    ):
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                                n_informative=n_informative, random_state=random_state)
    data = pd.DataFrame(X, columns=[f'Feature{i}' for i in range(1, n_features + 1)])
    data["ID"] = np.arange(n_samples)
    data["Activity"] = y
    return data

def create_regression_data(
    n_samples=40, n_features=40, n_informative=10, random_state=42
    ):
    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative, random_state=random_state)
    data = pd.DataFrame(X, columns=[f'Feature{i}' for i in range(1, n_features + 1)])
    data["ID"] = np.arange(n_samples)
    data["Activity"] = y
    return data

train_data = create_classification_data(random_state=42)
cal_data = create_classification_data(random_state=41)
test_data = create_classification_data(random_state=40)

In [2]:
from IPython.display import display
model = ModelDeveloper(activity_col="Activity", id_col="ID", select_model="KNeighborsClassifier", n_jobs=-1)
model.fit(train_data)
model.classes_
model.predict(test_data)

Unnamed: 0,ID,Predicted values,Probability for class 0,Probability for class 1
0,0,0,0.55,0.45
1,1,1,0.45,0.55
2,2,1,0.35,0.65
3,3,1,0.45,0.55
4,4,1,0.45,0.55
5,5,1,0.45,0.55
6,6,1,0.45,0.55
7,7,0,0.55,0.45
8,8,1,0.45,0.55
9,9,1,0.45,0.55


In [3]:
# class: calibrate(X, y, oob=False, mc=None, seed=None, class_cond=False, nc=<function hinge>)
# reggg: calibrate(X, y, oob=False, mc=None, seed=None, cps=False, de=None)

In [4]:
unc = ConformalPredictor(model=model, activity_col="Activity", id_col="ID")
unc.calibrate(data=cal_data)
unc.predict(data=cal_data, confidence=0.99)

Unnamed: 0,ID,Predicted set,P-value for class 0,P-value for class 1,Observed values
0,0,"0, 1",0.273232,0.758749,0
1,1,"0, 1",0.314813,0.757213,1
2,2,"0, 1",0.693554,0.41295,0
3,3,"0, 1",0.192946,0.873858,0
4,4,"0, 1",0.10538,0.895793,0
5,5,"0, 1",0.635676,0.477526,1
6,6,"0, 1",0.406674,0.74934,0
7,7,"0, 1",0.175624,0.813302,1
8,8,"0, 1",0.211999,0.827539,0
9,9,"0, 1",0.271925,0.745807,1


In [5]:
unc.predict(data=test_data, confidence=0.99)

Unnamed: 0,ID,Predicted set,P-value for class 0,P-value for class 1,Observed values
0,0,"0, 1",0.745012,0.273353,1
1,1,"0, 1",0.389699,0.825183,0
2,2,1,0.061596,0.97465,0
3,3,"0, 1",0.396129,0.751243,0
4,4,"0, 1",0.306272,0.733062,1
5,5,"0, 1",0.395895,0.809384,1
6,6,"0, 1",0.374681,0.778295,1
7,7,"0, 1",0.803666,0.304955,1
8,8,"0, 1",0.28023,0.68571,1
9,9,"0, 1",0.374064,0.739944,0


In [6]:
result = unc.evaluate(cal_data, confidence=[0.9, 0.95, 0.99])
display(result)

Unnamed: 0,0.90,0.95,0.99
error,0.075,0.075,0.025
avg_c,1.9,1.9,1.975
one_c,0.1,0.1,0.025
empty,0.0,0.0,0.0
time_fit,1.9e-05,1.9e-05,1.9e-05
time_evaluate,0.005265,0.001683,0.001322


## REGRESSION

In [7]:
reg_train_data = create_regression_data(random_state=42)
reg_cal_data = create_regression_data(random_state=41)
reg_test_data = create_regression_data(random_state=40)

model_reg = ModelDeveloper(activity_col="Activity", id_col="ID", select_model="KNeighborsRegressor", n_jobs=-1)
model_reg.fit(reg_train_data)
model_reg.predict(test_data)

Unnamed: 0,ID,Predicted values
0,0,-6.161565
1,1,-8.379141
2,2,44.879611
3,3,23.483242
4,4,115.076877
5,5,-84.124758
6,6,84.661054
7,7,38.251261
8,8,-32.792953
9,9,-5.912005


In [8]:
unc_reg = ConformalPredictor(model=model_reg,activity_col="Activity", id_col="ID")
unc_reg.calibrate(data=reg_cal_data)
unc_reg.predict(data=reg_cal_data)

Unnamed: 0,ID,Predicted values,Lower Bound,Upper Bound,Observed values
0,0,7.518468,-318.07487,333.111806,365.899172
1,1,-77.73299,-403.326328,247.860348,61.207526
2,2,49.32934,-276.263998,374.922677,229.617185
3,3,-89.221669,-414.815007,236.371669,-42.182753
4,4,77.442364,-248.150974,403.035702,-194.474185
5,5,76.388893,-249.204444,401.982231,29.149098
6,6,-26.232642,-351.82598,299.360696,-76.713444
7,7,16.687456,-308.905881,342.280794,-6.300805
8,8,45.659792,-279.933546,371.25313,-62.648477
9,9,18.995789,-306.597549,344.589126,-251.304905


In [9]:
unc_reg.predict(data=reg_test_data, confidence=0.99)

Unnamed: 0,ID,Predicted values,Lower Bound,Upper Bound,Observed values
0,0,-27.988923,-353.582261,297.604415,-54.166874
1,1,133.89623,-191.697108,459.489568,66.71388
2,2,108.093713,-217.499625,433.687051,-223.311602
3,3,30.488192,-295.105146,356.08153,-438.919188
4,4,40.71169,-284.881647,366.305028,-73.137329
5,5,2.298347,-323.294991,327.891684,321.578936
6,6,-48.555567,-374.148904,277.037771,164.945827
7,7,-69.564176,-395.157514,256.029162,321.462077
8,8,45.010447,-280.582891,370.603785,-1.291498
9,9,76.955709,-248.637629,402.549047,379.67369


In [10]:
reg_test_data.head(5)

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10,...,Feature33,Feature34,Feature35,Feature36,Feature37,Feature38,Feature39,Feature40,ID,Activity
0,-0.352509,0.603596,0.888519,-0.379498,0.456003,0.23156,0.224634,0.768409,0.573097,-0.186394,...,-1.517027,-0.73349,0.370729,-1.218588,0.303954,3.437449,-0.729998,-0.212551,0,-54.166874
1,-0.638448,0.106805,0.579245,-1.469121,-0.463197,0.69639,0.21973,1.435638,0.727683,-1.266901,...,0.882372,-2.479989,2.750117,0.470592,-0.93152,-2.25393,-1.26791,-0.844501,1,66.71388
2,0.127791,0.160145,0.201655,-0.305343,-0.553895,-0.01838,0.212288,0.915757,-0.020053,-0.293106,...,0.302212,0.218784,-0.982779,1.711772,-0.170051,1.551404,-0.317209,-1.092495,2,-223.311602
3,-1.730417,0.309116,-0.277753,-0.325738,0.092337,-0.169391,0.967418,0.513416,-2.183757,-1.061362,...,0.743496,-0.892318,0.575248,0.207556,1.396478,-0.769391,-1.124178,-2.178363,3,-438.919188
4,-1.16806,0.769628,1.142491,-0.383104,0.008148,-1.21353,-0.272442,-0.051291,-0.952321,-1.133724,...,0.135771,0.517249,-1.785718,-0.060118,0.577909,-0.478411,-1.144777,-0.494021,4,-73.137329


In [11]:
unc_reg.evaluate(reg_cal_data, confidence=[0.9, 0.95, 0.99])



Unnamed: 0,0.90,0.95,0.99
error,0.075,0.025,0.0
eff_mean,543.833098,651.186676,inf
eff_med,543.833098,651.186676,inf
time_fit,6e-06,6e-06,6e-06
time_evaluate,9.2e-05,5.2e-05,0.000343


In [12]:
no_y_data = reg_test_data.drop(columns='Activity', inplace=False)
no_y_data.head(5)

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10,...,Feature32,Feature33,Feature34,Feature35,Feature36,Feature37,Feature38,Feature39,Feature40,ID
0,-0.352509,0.603596,0.888519,-0.379498,0.456003,0.23156,0.224634,0.768409,0.573097,-0.186394,...,-1.950069,-1.517027,-0.73349,0.370729,-1.218588,0.303954,3.437449,-0.729998,-0.212551,0
1,-0.638448,0.106805,0.579245,-1.469121,-0.463197,0.69639,0.21973,1.435638,0.727683,-1.266901,...,1.274852,0.882372,-2.479989,2.750117,0.470592,-0.93152,-2.25393,-1.26791,-0.844501,1
2,0.127791,0.160145,0.201655,-0.305343,-0.553895,-0.01838,0.212288,0.915757,-0.020053,-0.293106,...,0.488229,0.302212,0.218784,-0.982779,1.711772,-0.170051,1.551404,-0.317209,-1.092495,2
3,-1.730417,0.309116,-0.277753,-0.325738,0.092337,-0.169391,0.967418,0.513416,-2.183757,-1.061362,...,-1.363525,0.743496,-0.892318,0.575248,0.207556,1.396478,-0.769391,-1.124178,-2.178363,3
4,-1.16806,0.769628,1.142491,-0.383104,0.008148,-1.21353,-0.272442,-0.051291,-0.952321,-1.133724,...,-0.542065,0.135771,0.517249,-1.785718,-0.060118,0.577909,-0.478411,-1.144777,-0.494021,4


In [13]:
unc_reg.evaluate(no_y_data, confidence=0.95)

KeyError: "'Activity' column is not found in the provided data. Please ensure that the data contains this column in order to use this function."