In [7]:
using DataFrames, Arrow, UMAP, StatsBase, MLJ

### data loading

In [2]:
celldyn = DataFrame(Arrow.Table("L:/laupodteam/AIOS/Bram/data/CellDyn/celldyn_FULL_transformed_df.feather"));

In [3]:
meas_cols = names(celldyn)[contains.(names(celldyn),r"c_b_|COMBO")];
suspect_cols = names(celldyn)[contains.(names(celldyn),r"^(c_)?s_")];
mode_cols = names(celldyn)[contains.(names(celldyn),"mode")];
alrt_cols = names(celldyn)[contains.(names(celldyn),"Alrt")];
other_c_cols = names(celldyn)[contains.(names(celldyn),r"^c_")];
usual_cols = unique(vcat(meas_cols,suspect_cols,mode_cols,alrt_cols,other_c_cols));

### select independent variables and age

In [9]:
X = celldyn[:,meas_cols]
y = celldyn.age;

X = dropmissing(X)
y = coerce(collect(skipmissing(y)),autotype(y));


In [6]:
using Random

In [10]:
idx = rand(1:size(X)[1],20000);
X_sample = X[idx,:];


In [11]:
pipe = (X -> coerce(X, autotype(X))) |> Standardizer()
pipe_mach = machine(pipe,X_sample)
fit!(pipe_mach)

┌ Info: Training machine(UnsupervisedPipeline(f = #1, …), …).
└ @ MLJBase C:\Users\hjoosse3\.julia\packages\MLJBase\6ooqv\src\machines.jl:496
┌ Info: Training machine(Standardizer(features = Symbol[], …), …).
└ @ MLJBase C:\Users\hjoosse3\.julia\packages\MLJBase\6ooqv\src\machines.jl:496


trained Machine; caches model-specific representations of data
  model: UnsupervisedPipeline(f = #1, …)
  args: 
    1:	Source @848 ⏎ Table{AbstractVector{Continuous}}


In [12]:
X_scaled = MLJ.transform(pipe_mach, X_sample);


PCA = @load PCA pkg="MultivariateStats"
pca = machine(PCA(maxoutdim = 2),X_scaled)

fit!(pca)

X_emb = MLJ.transform(pca,X_scaled);

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\hjoosse3\.julia\packages\MLJModels\K5pPR\src\loading.jl:159


import MLJMultivariateStatsInterface ✔


┌ Info: Training machine(PCA(maxoutdim = 2, …), …).
└ @ MLJBase C:\Users\hjoosse3\.julia\packages\MLJBase\6ooqv\src\machines.jl:496


In [13]:
using Distances

In [14]:
Dx = pairwise(Cityblock(),Matrix(X_sample),dims = 1);
Dy = pairwise(Cityblock(),Matrix(X_emb),dims = 1);

In [23]:
corr_dist(Dx,Dy)

0.13068142863309362

In [17]:
using RCall

R"library(energy)"; rcall(:dcor, X_emb,X_scaled)

RObject{RealSxp}
[1] 0.8969251


### Transform independent variables to normality

In [6]:
Standardizer = @load Standardizer pkg=MLJModels
LR = (@load LassoRegressor pkg=MLJLinearModels verbosity=0)
lr = LR()
XGB = (@load XGBoostRegressor pkg=XGBoost verbosity = 0)
xgb = XGB();


import MLJModels ✔

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\hjoosse3\.julia\packages\MLJModels\K5pPR\src\loading.jl:159





In [27]:
X = dropmissing(X)
y = coerce(collect(skipmissing(y)),autotype(y));

In [8]:
pipe = (X -> coerce(X, autotype(X))) |> Standardizer() |> lr

DeterministicPipeline(
  f = var"#1#2"(), 
  standardizer = Standardizer(
        features = Symbol[], 
        ignore = false, 
        ordered_factor = false, 
        count = false), 
  lasso_regressor = LassoRegressor(
        lambda = 1.0, 
        fit_intercept = true, 
        penalize_intercept = false, 
        scale_penalty_with_samples = true, 
        solver = nothing), 
  cache = true)

In [10]:
range_lr = range(pipe,:(lasso_regressor.lambda);lower =0.001, upper = 10, scale = :log10)
tm_lr = TunedModel(
    model = pipe,
    range = range_lr,
    resampling = CV(nfolds = 10),
    measures=[RootMeanSquaredError(),RSquared()]
)
m = machine(tm_lr,X,y)

Machine trained 0 times; does not cache data
  model: DeterministicTunedModel(model = DeterministicPipeline(f = #1, …), …)
  args: 
    1:	Source @778 ⏎ `Table{AbstractVector{Continuous}}`
    2:	Source @998 ⏎ `AbstractVector{Count}`


In [12]:
e = evaluate!(m,resampling = CV(nfolds = 5),measures = [rsquared,rmse])



PerformanceEvaluation object with these fields:
  measure, operation, measurement, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_rows
Extract:
┌────────────────────────┬───────────┬─────────────┬─────────┬──────────────────
│[22m measure                [0m│[22m operation [0m│[22m measurement [0m│[22m 1.96*SE [0m│[22m per_fold       [0m ⋯
├────────────────────────┼───────────┼─────────────┼─────────┼──────────────────
│ RSquared()             │ predict   │ 0.218       │ 0.0956  │ [0.248, 0.0569, ⋯
│ RootMeanSquaredError() │ predict   │ 21.2        │ 1.8     │ [24.3, 20.1, 19 ⋯
└────────────────────────┴───────────┴─────────────┴─────────┴──────────────────
[36m                                                                1 column omitted[0m
