In [1]:
using DataFrames, Arrow, UMAP, StatsBase, MLJ

### data loading

In [2]:
celldyn = DataFrame(Arrow.Table("L:/laupodteam/AIOS/Bram/data/CellDyn/celldyn_FULL_transformed_df.feather"));

In [3]:
meas_cols = names(celldyn)[contains.(names(celldyn),r"c_b_|COMBO")];
suspect_cols = names(celldyn)[contains.(names(celldyn),r"^(c_)?s_")];
mode_cols = names(celldyn)[contains.(names(celldyn),"mode")];
alrt_cols = names(celldyn)[contains.(names(celldyn),"Alrt")];
other_c_cols = names(celldyn)[contains.(names(celldyn),r"^c_")];
usual_cols = unique(vcat(meas_cols,suspect_cols,mode_cols,alrt_cols,other_c_cols));

### select independent variables and age

In [4]:
X = celldyn[:,meas_cols]
y = celldyn.gender;

X = dropmissing(X)
y = coerce(collect(skipmissing(y)),autotype(y));


In [5]:
using Random

In [17]:
idx = rand(1:size(X)[1],20000);
X_sample = X[idx,:];
y_sample = y[idx];


In [18]:
pipe = (X -> coerce(X, autotype(X))) |> Standardizer()
pipe_mach = machine(pipe,X_sample)
fit!(pipe_mach)

┌ Info: Training machine(UnsupervisedPipeline(f = #3, …), …).
└ @ MLJBase C:\Users\hjoosse3\.julia\packages\MLJBase\6ooqv\src\machines.jl:496
┌ Info: Training machine(Standardizer(features = Symbol[], …), …).
└ @ MLJBase C:\Users\hjoosse3\.julia\packages\MLJBase\6ooqv\src\machines.jl:496


trained Machine; caches model-specific representations of data
  model: UnsupervisedPipeline(f = #3, …)
  args: 
    1:	Source @750 ⏎ Table{AbstractVector{Continuous}}


In [56]:
using Distances

In [70]:
X_scaled = MLJ.transform(pipe_mach, X_sample);
X_emb = umap(transpose(Matrix(X_scaled)),6;n_neighbors=15,min_dist = 0.1,metric = Cityblock())

6×20000 Matrix{Float64}:
 -2.17979    -1.92809   -1.00595   …  -1.59084    -1.50595    -2.09578
 -1.73072    -2.97287    2.03381      -0.398578   -1.3174     -2.17657
  1.17996     2.90354   -0.696916      0.0568828   0.0960614   0.697902
  1.06055     0.945624   0.344112      0.711543    0.786759    1.00545
  0.0580881  -0.722419  -0.121642     -0.0778758   1.51413     0.590818
 -4.5099     -3.84674   -2.43788   …  -3.91734    -3.47689    -3.83299

In [71]:
X_scaled = MLJ.transform(pipe_mach, X_sample);

PCA = @load UMAP pkg="MultivariateStats"
pca = machine(PCA(maxoutdim = 6),X_scaled)

fit!(pca)

X_emb = MLJ.transform(pca,X_scaled);

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\hjoosse3\.julia\packages\MLJModels\K5pPR\src\loading.jl:159


LoadError: ArgumentError: There is no model named "UMAP" in the registry. 
 Run `models()` to view all registered models, or `models(needle)` to restrict search to models with string `needle` in their name or documentation. 

In [9]:
using Plots
pyplot()

Plots.PyPlotBackend()

In [86]:
using CategoricalArrays
cut(celldyn[idx,"age"],10)

20000-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "Q3: [20.0, 33.0)"
 "Q4: [33.0, 43.0)"
 "Q2: [5.0, 20.0)"
 "Q6: [51.0, 57.0)"
 "Q7: [57.0, 63.0)"
 "Q9: [68.0, 75.0)"
 "Q10: [75.0, 99.0]"
 "Q1: [0.0, 5.0)"
 "Q3: [20.0, 33.0)"
 "Q8: [63.0, 68.0)"
 "Q6: [51.0, 57.0)"
 "Q4: [33.0, 43.0)"
 "Q2: [5.0, 20.0)"
 ⋮
 "Q7: [57.0, 63.0)"
 "Q2: [5.0, 20.0)"
 "Q5: [43.0, 51.0)"
 "Q10: [75.0, 99.0]"
 "Q7: [57.0, 63.0)"
 "Q4: [33.0, 43.0)"
 "Q3: [20.0, 33.0)"
 "Q7: [57.0, 63.0)"
 "Q10: [75.0, 99.0]"
 "Q7: [57.0, 63.0)"
 "Q9: [68.0, 75.0)"
 "Q4: [33.0, 43.0)"

### Transform independent variables to normality

In [115]:
LR = (@load LogisticClassifier pkg=MLJLinearModels verbosity=0)
lr = LR()
XGB = (@load XGBoostClassifier pkg=XGBoost verbosity = 0)
xgb = XGB();


In [139]:
X = dropmissing(X)
y = [x == "M" ? "M" : "F" for x in y];
y = coerce(collect(skipmissing(y)),autotype(y));


In [140]:
pipe = (X -> coerce(X, autotype(X))) |> Standardizer() |> lr

ProbabilisticPipeline(
  f = var"#17#18"(), 
  standardizer = Standardizer(
        features = Symbol[], 
        ignore = false, 
        ordered_factor = false, 
        count = false), 
  logistic_classifier = LogisticClassifier(
        lambda = 1.0, 
        gamma = 0.0, 
        penalty = :l2, 
        fit_intercept = true, 
        penalize_intercept = false, 
        scale_penalty_with_samples = true, 
        solver = nothing), 
  cache = true)

In [141]:
range_lr = range(pipe,:(logistic_classifier.lambda);lower =0.001, upper = 10, scale = :log10)
tm_lr = TunedModel(
    model = pipe,
    range = range_lr,
    resampling = CV(nfolds = 10),
    measures=[Accuracy(),AUC()]
)
m = machine(tm_lr,X,y)

untrained Machine; does not cache data
  model: ProbabilisticTunedModel(model = ProbabilisticPipeline(f = #17, …), …)
  args: 
    1:	Source @498 ⏎ Table{AbstractVector{Continuous}}
    2:	Source @691 ⏎ AbstractVector{Multiclass{2}}


In [142]:
e = evaluate!(m,resampling = CV(nfolds = 5),measures = [Accuracy(),AUC()])