In [16]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from econml.grf import CausalForest

from matplotlib import pyplot as plt

In [2]:
# Read the raw table, using the first CSV column as the row index.
data = pd.read_csv("bsc_project_set.csv", index_col=0)
xs = data.copy()

# Recode treatment and outcome as 0/1 integers:
#   w -> 0 where peep_regime is "low", 1 for every other value
#   y -> 1 where mort_28 == False, 0 otherwise
# Both are rebuilt from plain Python lists, so they carry a fresh 0..n-1
# index rather than the index read from the CSV.
# NOTE(review): a missing peep_regime therefore becomes 1 and a missing
# mort_28 becomes 0 -- confirm that is the intended handling.
w = pd.Series([0 if regime == "low" else 1 for regime in xs["peep_regime"]])
y = pd.Series([1 if flag == False else 0 for flag in xs["mort_28"]])

# Covariates are chosen through an explicit whitelist. (A disabled earlier
# variant instead dropped "id", "mort_28", "peep" and "peep_regime".)
selected_columns = [
    "age",
    "weight",
    "pf_ratio",
    "po2",
    "pco2",
    "driving_pressure",
    "bilirubin",
    "urea",
    "fio2",
    "plateau_pressure",
]
xs = xs[selected_columns]
columns_x = xs.columns

# Standardise every covariate with its own mean and standard deviation.
norm_xs = xs.sub(xs.mean()).div(xs.std())

# Fill missing entries with a 2-neighbour, uniformly weighted KNN imputer,
# then wrap the returned array in a DataFrame with the original column labels.
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imp_xs = pd.DataFrame(data=imputer.fit_transform(norm_xs), columns=columns_x)

imp_xs

Unnamed: 0,age,weight,pf_ratio,po2,pco2,driving_pressure,bilirubin,urea,fio2,plateau_pressure
0,0.732971,0.183841,-0.026374,-0.049713,-1.422636,-0.364306,1.121030,-0.897592,-0.564826,0.705302
1,-1.005004,1.422974,-0.637005,-0.385550,-0.768253,-1.141620,2.210067,2.693985,0.758913,-0.620678
2,1.353676,-0.381057,-0.171336,-0.406111,-0.403630,1.468983,-0.012457,0.703662,-0.434729,0.317210
3,1.353676,0.100018,-0.319055,-1.169822,-0.403630,1.395651,-0.345836,1.738181,-1.503965,0.640620
4,0.050195,-0.654395,-0.577049,-0.737052,-0.334933,-1.072479,-0.531046,-0.720674,-0.385943,-1.236080
...,...,...,...,...,...,...,...,...,...,...
3936,0.919183,-1.128181,-0.711144,0.068026,-0.395043,1.219655,0.298696,0.478767,0.541000,0.252528
3937,0.732971,0.475402,-0.527518,-0.793421,-0.300585,-0.540301,-0.501413,2.250942,-0.411497,-1.028174
3938,0.236407,0.438957,-0.701828,-0.554080,0.137359,0.281012,0.325366,0.532742,1.377339,1.869576
3939,0.298477,0.245798,-0.425805,-0.814696,-0.156321,0.075683,-0.334723,-0.522017,0.150708,0.113924


In [3]:
# Combine the imputed covariates with treatment (W) and outcome (Y) in one
# frame; the new columns are aligned to imp_xs by index label.
full_data = imp_xs.copy()
full_data["W"] = w
full_data["Y"] = y

In [4]:
# Hold out 30% of the rows for testing. The split is seeded so that
# Restart & Run All reproduces the same partition; without random_state the
# rows assigned to each side change on every execution.
train, test = train_test_split(full_data, test_size=0.3, random_state=42)

# Covariate-only views of each partition (W and Y are left out).
train_x = train[columns_x]
test_x = test[columns_x]

In [32]:
# Models of treatment assignment: predict W from the covariates, fitted on
# every row of full_data.
# NOTE(review): in the cells visible here lin_reg_treat is only referenced
# from commented-out code -- confirm it is still needed.
lin_reg_treat = LinearRegression()
lin_reg_treat.fit(full_data[columns_x], full_data["W"])

# The forest is seeded so the fitted trees, and every probability derived
# from them, are reproducible across kernel restarts. min_samples_leaf=20
# keeps each leaf from resting on fewer than 20 training rows.
forest = RandomForestClassifier(min_samples_leaf=20, random_state=42)
forest.fit(full_data[columns_x], full_data["W"])

In [33]:
# Cut-off below which a predicted class probability counts as near-zero.
prop_threshold = 0.01

# Per-row class probabilities from the forest, evaluated on the same rows
# the forest was fitted on.
pred_props_f = forest.predict_proba(X=full_data.loc[:, columns_x])

# Disabled alternative that thresholds the linear model's predictions:
# pred_props = lin_reg_treat.predict(full_data[columns_x])
# pred_violations = (pred_props < prop_threshold) | (pred_props > 1 - prop_threshold)
# pred_violations.tolist()

pred_props_f

array([[0.90345594, 0.09654406],
       [0.76767353, 0.23232647],
       [0.93348987, 0.06651013],
       ...,
       [0.50009462, 0.49990538],
       [0.94516669, 0.05483331],
       [0.97887312, 0.02112688]])

In [34]:
# Second treatment-assignment model: a 20-neighbour classifier trained on the
# same covariates and W labels as the forest.
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X=full_data.loc[:, columns_x], y=full_data["W"])

In [35]:
# Per-row class probabilities from the k-NN model, evaluated on the rows it
# was fitted on.
pred_props_k = knn.predict_proba(X=full_data.loc[:, columns_x])

pred_props_k

array([[0.95, 0.05],
       [0.9 , 0.1 ],
       [0.9 , 0.1 ],
       ...,
       [0.6 , 0.4 ],
       [0.85, 0.15],
       [1.  , 0.  ]])

In [37]:
# Place both probability arrays side by side: the forest's columns come
# first, followed by the k-NN model's columns.
con = np.hstack((pred_props_f, pred_props_k))

con

array([[0.90345594, 0.09654406, 0.95      , 0.05      ],
       [0.76767353, 0.23232647, 0.9       , 0.1       ],
       [0.93348987, 0.06651013, 0.9       , 0.1       ],
       ...,
       [0.50009462, 0.49990538, 0.6       , 0.4       ],
       [0.94516669, 0.05483331, 0.85      , 0.15      ],
       [0.97887312, 0.02112688, 1.        , 0.        ]])

In [39]:
# Flag a row (1) when BOTH models place the same class below prop_threshold,
# otherwise 0. Columns 0-1 of con hold the forest's probabilities and
# columns 2-3 the k-NN model's, so columns 0/2 and 1/3 are compared as pairs.
# NOTE(review): this assumes both classifiers order their classes the same
# way and each emit exactly two columns -- confirm via their classes_.
below_cutoff = con < prop_threshold
first_class_rare = below_cutoff[:, 0] & below_cutoff[:, 2]
second_class_rare = below_cutoff[:, 1] & below_cutoff[:, 3]

# Computed on whole columns instead of a per-row Python lambda; the result is
# still a plain list of 0/1 ints.
violations = (first_class_rare | second_class_rare).astype(int).tolist()
violations

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [40]:
# Share of rows flagged in `violations` (mean of the 0/1 indicators).
np.asarray(violations).mean()

0.03171783811215428