Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Regular merge of develop #149

Merged
merged 4 commits into from
Oct 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "GigaSOM"
uuid = "a03a9c34-069e-5582-a11c-5c984cab887c"
version = "0.5.1"
version = "0.6.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Expand All @@ -17,6 +17,7 @@ NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0"

Expand Down
3 changes: 1 addition & 2 deletions src/GigaSOM.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,16 @@ The documentation is here: http://LCSB-BioCore.github.io/GigaSOM.jl

module GigaSOM

using CSV
using DataFrames
using Distances
using Distributed
using Distributions
using FCSFiles
using FileIO
using DistributedArrays
using XLSX
using NearestNeighbors
using Serialization
using StableRNGs

include("base/structs.jl")

Expand Down
105 changes: 65 additions & 40 deletions src/analysis/core.jl
Original file line number Diff line number Diff line change
@@ -1,61 +1,86 @@
"""
initGigaSOM(train, xdim, ydim = xdim)
initGigaSOM(data, args...)

Initializes a SOM by random selection from the training data.
Initializes a SOM with random Gaussian codes whose per-column mean and
standard deviation match the training data. A generic overload that works for
matrices and DataFrames that can be coerced to `Matrix{Float64}`. Other
arguments are passed to the data-independent `initGigaSOM`.

# Arguments:
- `train`: codeBook vector as random input matrix from random workers
- `xdim, ydim`: geometry of the SOM
Arguments:
- `data`: matrix of data for running the initialization
"""
function initGigaSOM(train, xdim::Int64, ydim::Int64 = xdim)
function initGigaSOM(data::Union{Matrix,DataFrame}, args...; kwargs...)

if typeof(train) == DataFrame
colNames = [String(x) for x in names(train)]
else
colNames = ["x$i" for i = 1:size(train, 2)]
@debug "assuming default colNames"
end
d = Matrix{Float64}(data)

train = Matrix{Float64}(train)
(n, ncol) = size(d)
means = [sum(d[:, i]) / n for i = 1:ncol]
sdevs = [sqrt(sum((d[:, i] .- means[i]) .^ 2.0) / n) for i = 1:ncol]

numCodes = xdim * ydim
return initGigaSOM(ncol, means, sdevs, args...; kwargs...)
end

# initialise the codes with random samples
codes = train[rand(1:size(train, 1), numCodes), :]
grid = gridRectangular(xdim, ydim)
"""
function initGigaSOM(data::LoadedDataInfo,
xdim::Int64, ydim::Int64 = xdim;
seed=rand(UInt), rng=StableRNG(seed))

# make SOM object:
som = Som(
codes = codes,
colNames = colNames,
xdim = xdim,
ydim = ydim,
numCodes = numCodes,
grid = grid,
)
return som
`initGigaSOM` overload for working with distributed-style `LoadedDataInfo`
data. The remaining arguments are passed to the data-independent
`initGigaSOM`.

Arguments:
- `data`: a `LoadedDataInfo` object with the distributed dataset matrix
"""
function initGigaSOM(data::LoadedDataInfo, args...; kwargs...)
    # Query the first listed worker for the column count of the stored dataset.
    firstWorker = data.workers[1]
    ncol = get_val_from(firstWorker, :(size($(data.val))[2]))

    # Per-column means and standard deviations over every column of the data.
    allColumns = collect(1:ncol)
    means, sdevs = dstat(data, allColumns)

    # Hand over to the data-independent initializer with the gathered statistics.
    return initGigaSOM(ncol, means, sdevs, args...; kwargs...)
end

"""
initGigaSOM(trainInfo::LoadedDataInfo,
xdim::Int64, ydim :: Int64 = xdim)
function initGigaSOM(ncol::Int64,
means::Vector{Float64}, sdevs::Vector{Float64},
xdim::Int64, ydim::Int64 = xdim;
seed = rand(UInt), rng = StableRNG(seed))

`initGigaSOM` overload for working with distributed-style `LoadedDataInfo`
data. The rest of arguments is the same as in `initGigaSOM`.
Generate a stable random initial SOM with the random distribution that matches the parameters.

Arguments:
- `ncol`: number of desired data columns
- `means`, `sdevs`: vectors that describe the data distribution, both of size `ncol`
- `xdim`, `ydim`: Size of the SOM
- `seed`: a seed (defaults to a random seed from the current default random generator)
- `rng`: a random number generator to be used (defaults to a `StableRNG` initialized with the `seed`)

Note that this function only uses the data saved on the first worker for
initialization, and the init work is actually done on that worker to avoid
unnecessary data copying.
Returns: a new `Som` structure
"""
function initGigaSOM(trainInfo::LoadedDataInfo, xdim::Int64, ydim::Int64 = xdim)
function initGigaSOM(
ncol::Int64,
means::Vector{Float64},
sdevs::Vector{Float64},
xdim::Int64,
ydim::Int64 = xdim;
seed = rand(UInt),
rng = StableRNG(seed),
)

# Snatch the init data from the first available worker (for he cares not).
return get_val_from(
trainInfo.workers[1],
:(initGigaSOM($(trainInfo.val), $xdim, $ydim)),
)
numCodes = xdim * ydim
grid = gridRectangular(xdim, ydim)

# Initialize with an unbiased random gaussian with same mean/sd as the data
# in each dimension
codes = randn(rng, (numCodes, ncol))
for col = 1:ncol
codes[:, col] .*= sdevs[col]
codes[:, col] .+= means[col]
end

return Som(codes = codes, xdim = xdim, ydim = ydim, grid = grid)
end


"""
trainGigaSOM(som::Som, dInfo::LoadedDataInfo;
kernelFun::Function = gaussianKernel,
Expand Down
18 changes: 11 additions & 7 deletions src/base/dataops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,15 @@ function combine_stats((s1, sqs1, n1), (s2, sqs2, n2))
end

"""
dstat(dInfo::LoadedDataInfo, columns::Vector{Int})
dstat(dInfo::LoadedDataInfo, columns::Vector{Int})::Tuple{Vector{Float64}, Vector{Float64}}

Compute mean and standard deviation of the columns in dataset. Returns a tuple
with a vector of means in `columns`, and a vector of corresponding sdevs.
"""
function dstat(dInfo::LoadedDataInfo, columns::Vector{Int})
function dstat(
dInfo::LoadedDataInfo,
columns::Vector{Int},
)::Tuple{Vector{Float64},Vector{Float64}}

sum_squares = x -> sum(x .^ 2)

Expand All @@ -107,22 +110,23 @@ function dstat(dInfo::LoadedDataInfo, columns::Vector{Int})
(sums, sqsums, ns) = distributed_mapreduce(dInfo, get_stats, combine_stats)

return (
sums ./ ns, #means
sqrt.(sqsums ./ ns - (sums ./ ns) .^ 2), #sdevs
(sums./ns)[1, :], #means
(sqrt.(sqsums ./ ns - (sums ./ ns) .^ 2))[1, :], #sdevs
)
end

"""
dstat_buckets(dInfo::LoadedDataInfo, nbuckets::Int, buckets::LoadedDataInfo, columns::Vector{Int})
dstat_buckets(dInfo::LoadedDataInfo, nbuckets::Int, buckets::LoadedDataInfo, columns::Vector{Int})::Tuple{Matrix{Float64}, Matrix{Float64}}

A version of `dstat` that works with bucketing information (e.g. clusters).
A version of `dstat` that works with bucketing information (e.g. clusters);
returns a tuple of matrices.
"""
function dstat_buckets(
dInfo::LoadedDataInfo,
nbuckets::Int,
buckets::LoadedDataInfo,
columns::Vector{Int},
)
)::Tuple{Matrix{Float64},Matrix{Float64}}
# this produces a triplet of matrices (1 row per each bucket)
get_bucketed_stats =
(d, b) -> (
Expand Down
19 changes: 8 additions & 11 deletions src/base/structs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ Structure to hold all data of a trained SOM.

# Fields:
- `codes::Array{Float64,2}`: 2D-array of codebook vectors. One vector per row
- `colNames::Array{String,1}`: names of the attribute with which the SOM is trained
- `xdim::Int`: number of neurons in x-direction
- `ydim::Int`: number of neurons in y-direction
- `numCodes::Int`: total number of neurons
Expand All @@ -14,21 +13,19 @@ Structure to hold all data of a trained SOM.
3 columns (x,y,z) for spherical maps)
"""
mutable struct Som
codes::Array{Float64,2}
colNames::Array{String}
codes::Matrix{Float64}
xdim::Int
ydim::Int
numCodes::Int
grid::Array{Float64,2}
grid::Matrix{Float64}

Som(;
codes::Array{Float64} = Array{Float64}(0),
colNames::Array{String,1} = Array{String}(0),
xdim::Int = 1,
ydim::Int = 1,
numCodes::Int = 1,
grid::Array{Float64,2} = zeros(1, 1),
) = new(codes, colNames, xdim, ydim, numCodes, grid)
codes::Matrix{Float64},
xdim::Int,
ydim::Int,
numCodes::Int = xdim * ydim,
grid::Matrix{Float64},
) = new(codes, xdim, ydim, numCodes, grid)
end

"""
Expand Down
7 changes: 7 additions & 0 deletions test/refData/convert-gen-to-ref.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/sh
# Refresh the reference CSVs from freshly generated test output: keep the
# first 11 lines of each generated file in ../genData and write them to the
# matching ref* file in the current directory.
for kind in Winners Embedded DfCodes; do
    head -n 11 "../genData/batch${kind}.csv" > "refBatch${kind}.csv"
    head -n 11 "../genData/parallel${kind}.csv" > "refParallel${kind}.csv"
done
20 changes: 10 additions & 10 deletions test/refData/refBatchDfCodes.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
CD3,CD45,CD4,CD20,CD33,CD123,CD14,IgM,HLA_DR,CD7
-0.011651251029458529,0.002166440919825469,-0.016107773228903503,-0.02267979761084743,0.011138154608847282,0.00367645725668631,-0.008512117853380383,-0.025609555046142884,0.008291097574972593,-0.013889722801698756
-0.015951824546522195,-0.002697429870211619,-0.022851480129853097,-0.02338515467852531,0.007879183676403521,0.0033409250992698543,-0.010610960736475918,-0.025460157248426617,0.005161131828402003,-0.011165165902673387
-0.018460099430044313,-0.0053348922680251645,-0.027923343496897145,-0.02387883890843739,0.005214373295028115,0.0031519614803055137,-0.01374838399229439,-0.024558407368225888,0.0012409770364297595,-0.0062790908403473475
-0.020757029484934443,-0.008322760653328196,-0.03196278126936008,-0.023745826001870852,0.0033909949414117494,0.0037307918688530904,-0.01626242746857233,-0.022959834566592886,-0.001965983279393479,-0.0019169115424721547
-0.022995554919990426,-0.010606081583462008,-0.03427623370658472,-0.024576402052169207,0.0013448984967097538,0.004349501067549502,-0.016997607712597014,-0.02179101982437115,-0.004980685900616922,0.0015353782222695895
-0.024077115259944812,-0.011714510684187556,-0.03575164678974174,-0.02496006999043605,-0.0005615706517413436,0.005127613251056575,-0.016589968554867322,-0.02196648608402114,-0.008080566306154223,0.005078697268390525
-0.02280491786388036,-0.012088734144116473,-0.03502474167057429,-0.024776972198643336,-0.0030730612729294106,0.004979240652566521,-0.01736939673631981,-0.021418503939703726,-0.012416538137455671,0.008222161851395175
-0.021361705117632003,-0.010758591549803553,-0.03405254052307793,-0.023824976347450493,-0.004499831340097048,0.004694851720409449,-0.016685463945211376,-0.019512632412737783,-0.015737142966254614,0.01338769742196532
-0.019292562591419423,-0.009278789640504858,-0.03135687794812301,-0.022962237895561736,-0.006729952834781199,0.002878891330281074,-0.016658210623838203,-0.017903387380575037,-0.020056421918357603,0.019975333140029673
-0.016247269914707884,-0.00740984542325823,-0.028062505860498892,-0.02231750214052588,-0.009455430561750876,0.000786166616881858,-0.01656328758034033,-0.01601653921372662,-0.024373583393098794,0.025644294109764914
-0.020661387816701828,-0.016161120779088663,-0.01249623627759775,-0.0034145410849788877,0.014807586926559653,0.0046715057568052516,0.01579385076647087,0.0009447735391436881,0.02190928173933184,-0.015697400125440446
-0.02005093062889222,-0.014269035817494969,-0.013039351840076645,-0.002985186876171628,0.009609145977086965,0.00702660927384001,0.01650154435760052,0.0012497263037976013,0.018304252246390512,-0.01903168396592879
-0.018788471362745307,-0.011674867151168998,-0.013892318528007385,-0.0028942218598770036,0.005187724890544429,0.009408032474714481,0.01738428612136856,0.0008292768855502434,0.014789337846995957,-0.022451985720898553
-0.017586974966765687,-0.009684792719596318,-0.014788360098695824,-0.002844458402612456,0.0011821276775411634,0.011058966466700413,0.016998801810468454,0.0001696701916532979,0.010787772791408978,-0.025702596416148988
-0.021236637394984326,-0.010443009847073298,-0.016317806826461025,-0.001040240702235489,0.0017567656911396936,0.016129470212235868,0.019983499116639214,0.0024218829121462012,0.01348036066165613,-0.027178442918584813
-0.02323930293506252,-0.010343130170832203,-0.0160472130856683,0.0005814235461290909,0.0022876975871033013,0.018752355039775046,0.022050505097252898,0.003382194826886934,0.014486413381663501,-0.027242157143555618
-0.022565262666134848,-0.008067661973959712,-0.014523685185202789,0.0017885539541355761,0.002905441205575069,0.019318470482428095,0.023260382337895385,0.0038037239985614377,0.014250101736801073,-0.026384829504991988
-0.021386868010550755,-0.006582720125070556,-0.013742927636424877,0.0025294938554544506,0.002019437034627474,0.020238852219692446,0.02359309239589711,0.004139563745452907,0.013154023718769371,-0.022919034661042337
-0.019685222773768392,-0.005260420938398561,-0.01263897655387842,0.0018577552418864714,0.0007079395479014865,0.020240301907265355,0.023509393021684127,0.0031884247090666435,0.010853678978840937,-0.019119186692227116
-0.01817652298806144,-0.004568729107711974,-0.012247247192667432,0.0019481924139792522,-0.0017780017327565826,0.02090332452866241,0.020879877315333684,0.002788258677921433,0.007710635361434031,-0.014495702832397588
20 changes: 10 additions & 10 deletions test/refData/refBatchEmbedded.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
x1,x2
1.5001213674422897,8.880687995083614
1.2237981567666019,8.782808680684086
5.6341242988459594,0.003916876256086727
0.34120940110869247,7.976589652696513
7.414592953032319,0.12429919116488143
3.5053583626488267,0.08500795962385448
8.741142530996362,4.582317518346491
8.781770646925953,1.1656467549268985
9.15192688427219,0.07446585923210677
4.115778473902338,0.0563890768039573
7.853292848987698,8.720410251312108
7.76525592228932,8.764961186548483
0.05420208119934999,7.128494184649301
7.662565234267279,8.789346146054893
0.2313774377228306,7.992453408932434
6.718647228133474,8.71364554593444
7.310434122948076,8.85349622911063
1.1640093106430396,8.780329024131897
7.085379535308563,0.12394846652869095
5.982920181010032,4.171484706899294
12 changes: 6 additions & 6 deletions test/refData/refBatchWinners.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
index
92
100
100
91
6
99
91
97
99
91
10
4
10
10
6
5
20 changes: 10 additions & 10 deletions test/refData/refParallelDfCodes.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
CD3,CD45,CD4,CD20,CD33,CD123,CD14,IgM,HLA_DR,CD7
-0.17640967142561337,0.4685075475625074,-0.563492455862733,-0.22178816350494124,0.5017208417273646,-0.03261443423876599,-0.3529187887429754,-0.290477457805388,0.8104538714781182,-0.998356121477298
-0.7831021091922231,-0.1387601640904325,-0.7141402998872505,-0.1543993358965959,-0.023269147383249467,0.1584326532614666,-0.022852972837362744,-0.2932731477925736,0.5757650485172153,-1.0812869624104249
0.008927988103811249,0.4091058632556585,-0.8722452167373481,-0.4585758423921651,-0.40986988557257265,-0.36638465240840806,-0.5493908695156264,-0.37945184407422816,-0.2676889658062833,-0.9016287629952086
-0.4110168375156833,-0.24935670477094934,-0.8579259483467333,0.4579877893841562,0.49735708414528346,-0.0011058499284762261,-0.6880093365699184,-0.17897455589509492,0.3289355183541678,-0.4719020523756622
-1.0226094296799497,-0.9840566016813554,-0.7097172391199558,-0.16485480747210984,0.11492309924943464,0.22047196454693038,-0.502125790229743,-0.02049200390243572,0.33706614752231373,-0.894516830707265
-1.2095810804913587,-1.1896783882789272,-0.81483190688843,-0.35858500308536834,-0.0625575067983978,0.6661787916264342,0.144441488758504,-0.23851684605223872,0.043665352825085546,-0.3727782223859913
-0.642941166764806,-1.5094528640377423,-0.7536810820703415,-0.37265948389744713,-0.4882089819026275,0.01753753455821686,-0.44332239042127136,-0.2862474935367596,-0.606802065413238,-0.6506740054231588
-0.5723070637131475,-0.23835188752416503,-0.9230290505068274,-0.4248295627471601,-0.2902891415288133,1.0892987177717286,0.11766517398469416,-0.20569072343465256,-0.2609366770976574,0.43842780024682604
-1.1410063412612652,-0.541407804316432,-0.8366965863280671,-0.30047779713732037,0.08012450796075446,0.13989080364661932,-0.0348520560461668,-0.27298795522471936,-0.09480486991814785,0.9046371040902668
-0.32086931489919535,0.008572799776121031,-0.7726234805473643,-0.31124537949637376,-0.3731914270086051,-0.2715889271941862,-0.3089612139429518,-0.2100110845003733,-0.43372445501215534,0.9200262143737115
-0.8018623096228251,-0.8369378251602092,-0.42235875792375416,0.03181298171449356,1.1023176364122644,-0.1963262502082471,0.14292286933337084,-0.04495525968748685,0.9488704015837622,-0.9502693235574208
-0.5625521973628088,-2.0296778784502623,-0.35526879305589104,-0.3107117523629649,-0.691540155508728,-0.4093496225834384,0.4523724271046617,-0.009974066523155774,-0.4068308049793661,-0.6095350731407303
0.5276213762346692,-0.13928455906903522,0.0260388772472196,-0.5305729781019549,-0.3946096763960851,-0.5826133567572871,1.261368184898035,-0.37813700493775537,0.6684088483586492,-0.5842763194864915
-0.49203525558430333,0.006125170365176934,-0.7177418471018223,-0.34790913951628977,-0.44641248247014126,-0.40822054375201805,-0.19201882406973556,-0.3264692598105841,-0.4422401708848413,-1.0204324390416406
-0.7946172931478637,-0.04379896708998755,-0.8236749140239338,-0.38413984385557476,-0.4578755791225342,-0.006919619778465148,-0.1544742520669272,-0.24564204881872534,-0.14253950453684816,-1.0500842859108013
-1.0486033461015627,-0.8956741167306301,-0.6379896360088917,-0.12698033338253267,-0.2186917578539655,1.2213823663626238,0.32907333598261723,-0.12740051860331436,0.32155185312345375,-0.956097609243189
-0.5974402427459592,0.39237883711545724,-0.07883772208890998,0.4015202131734113,0.8479696160880946,0.8049105112428624,1.1140073920103057,-0.09581685765991838,1.033044071876181,-1.174732014176624
-0.7872516308600919,0.21897600381375384,-0.4402878230721225,2.163471850204101,0.5809775152776391,0.7979704573811573,0.5718975450381883,2.3417189247773846,1.5471819106506124,-0.9461146224865068
0.039924351634507416,0.3443437618857008,-0.13265299964277888,-0.08261247799182267,-0.039537534454404076,0.5981065374507042,2.2159468572244885,-0.01778470058774384,0.13892743607012503,0.14216618054090033
-0.5915097660623094,0.07164712471285221,-0.6202069574325458,0.2942533083281945,-0.371048038863417,1.571561436383151,0.5579442193175757,0.5739029477810871,-0.07997738337816146,0.41916332336585954
20 changes: 10 additions & 10 deletions test/refData/refParallelEmbedded.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
x1,x2
1.6744019997736426,8.103683903608214
1.7010846112311093,8.401485398046395
7.015264642225017,3.5344531046086956
2.7854112914578004,6.833764156606785
8.544287420382766,0.8639031797261263
7.203534969749915,5.661315702180364
8.155603686885433,2.180030431709865
8.412521380846723,2.1079805646140874
9.029942763218518,4.162009375605646
2.8330767297731123,2.1536182096362584
8.019707426546399,7.484522208408344
6.635774256956503,7.521585380705817
2.7931129447029233,5.013450509762322
3.3489279330827224,5.592905722888017
3.4888898275483258,8.60900213852842
5.5620028677248365,5.334199616261413
5.307053172716869,8.604732622951328
3.8286893566636997,8.88841904590264
5.737121198824499,2.8540970674331145
2.567787983594243,1.6517987546509052
18 changes: 9 additions & 9 deletions test/refData/refParallelWinners.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
index
92
92
66
66
10
68
99
99
91
63
96
97
97
91
30
10
90
3
13
6 changes: 2 additions & 4 deletions test/testBatch.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@

@testset "Single-CPU batch processing" begin

Random.seed!(1)
som = initGigaSOM(pbmc8_data, 10, 10)
som = initGigaSOM(pbmc8_data, 10, 10, seed = 1234)

#check whether the distributed version works the same
save_at(1, :test, pbmc8_data)
Random.seed!(1)
som2 = initGigaSOM(LoadedDataInfo(:test, [1]), 10, 10)
som2 = initGigaSOM(LoadedDataInfo(:test, [1]), 10, 10, seed = 1234)
@test som.codes == som2.codes
remove_from(1, :test)

Expand Down
Loading