Skip to content

Commit

Permalink
Merge pull request #149 from LCSB-BioCore/develop
Browse files Browse the repository at this point in the history
Regular merge of develop
  • Loading branch information
laurentheirendt committed Oct 6, 2020
2 parents 88db15a + 670a47d commit f4e712b
Show file tree
Hide file tree
Showing 14 changed files with 152 additions and 122 deletions.
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "GigaSOM"
uuid = "a03a9c34-069e-5582-a11c-5c984cab887c"
version = "0.5.1"
version = "0.6.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Expand All @@ -17,6 +17,7 @@ NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0"

Expand Down
3 changes: 1 addition & 2 deletions src/GigaSOM.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,16 @@ The documentation is here: http://LCSB-BioCore.github.io/GigaSOM.jl

module GigaSOM

using CSV
using DataFrames
using Distances
using Distributed
using Distributions
using FCSFiles
using FileIO
using DistributedArrays
using XLSX
using NearestNeighbors
using Serialization
using StableRNGs

include("base/structs.jl")

Expand Down
105 changes: 65 additions & 40 deletions src/analysis/core.jl
Original file line number Diff line number Diff line change
@@ -1,61 +1,86 @@
"""
initGigaSOM(train, xdim, ydim = xdim)
initGigaSOM(data, args...)
Initializes a SOM by random selection from the training data.
Initializes a SOM with random Gaussian codes that match the per-column mean and
standard deviation of the training data. A generic
overload that works for matrices and DataFrames that can be coerced to
`Matrix{Float64}`. Other arguments are passed to the data-independent
`initGigaSOM`.
# Arguments:
- `train`: codeBook vector as random input matrix from random workers
- `xdim, ydim`: geometry of the SOM
Arguments:
- `data`: matrix of data for running the initialization
"""
function initGigaSOM(train, xdim::Int64, ydim::Int64 = xdim)
function initGigaSOM(data::Union{Matrix,DataFrame}, args...; kwargs...)

if typeof(train) == DataFrame
colNames = [String(x) for x in names(train)]
else
colNames = ["x$i" for i = 1:size(train, 2)]
@debug "assuming default colNames"
end
d = Matrix{Float64}(data)

train = Matrix{Float64}(train)
(n, ncol) = size(d)
means = [sum(d[:, i]) / n for i = 1:ncol]
sdevs = [sqrt(sum((d[:, i] .- means[i]) .^ 2.0) / n) for i = 1:ncol]

numCodes = xdim * ydim
return initGigaSOM(ncol, means, sdevs, args...; kwargs...)
end

# initialise the codes with random samples
codes = train[rand(1:size(train, 1), numCodes), :]
grid = gridRectangular(xdim, ydim)
"""
function initGigaSOM(data::LoadedDataInfo,
xdim::Int64, ydim::Int64 = xdim;
seed=rand(UInt), rng=StableRNG(seed))
# make SOM object:
som = Som(
codes = codes,
colNames = colNames,
xdim = xdim,
ydim = ydim,
numCodes = numCodes,
grid = grid,
)
return som
`initGigaSOM` overload for working with distributed-style `LoadedDataInfo`
data. The remaining arguments are passed to the data-independent
`initGigaSOM`.
Arguments:
- `data`: a `LoadedDataInfo` object with the distributed dataset matrix
"""
function initGigaSOM(data::LoadedDataInfo, args...; kwargs...)
    # Query the column count of the dataset through the first worker listed in
    # `data` (presumably `get_val_from` evaluates the expression there).
    firstWorker = data.workers[1]
    ncol = get_val_from(firstWorker, :(size($(data.val))[2]))

    # Column-wise means and standard deviations over every column of the data.
    allColumns = collect(1:ncol)
    means, sdevs = dstat(data, allColumns)

    # Hand over to the data-independent initializer; the remaining positional
    # and keyword arguments are forwarded untouched.
    return initGigaSOM(ncol, means, sdevs, args...; kwargs...)
end

"""
initGigaSOM(trainInfo::LoadedDataInfo,
xdim::Int64, ydim :: Int64 = xdim)
function initGigaSOM(ncol::Int64,
means::Vector{Float64}, sdevs::Vector{Float64},
xdim::Int64, ydim::Int64 = xdim;
seed = rand(UInt), rng = StableRNG(seed))
`initGigaSOM` overload for working with distributed-style `LoadedDataInfo`
data. The rest of arguments is the same as in `initGigaSOM`.
Generate a reproducible random initial SOM whose codes are drawn from a Gaussian
distribution with the given per-column means and standard deviations.
Arguments:
- `ncol`: number of desired data columns
- `means`, `sdevs`: vectors that describe the data distribution, both of size `ncol`
- `xdim`, `ydim`: Size of the SOM
- `seed`: a seed (defaults to a random seed from the current default random generator)
- `rng`: a random number generator to be used (defaults to a `StableRNG` initialized with the `seed`)
Note that this function only uses the data saved on the first worker for
initialization, and the init work is actually done on that worker to avoid
unnecessary data copying.
Returns: a new `Som` structure
"""
function initGigaSOM(trainInfo::LoadedDataInfo, xdim::Int64, ydim::Int64 = xdim)
function initGigaSOM(
ncol::Int64,
means::Vector{Float64},
sdevs::Vector{Float64},
xdim::Int64,
ydim::Int64 = xdim;
seed = rand(UInt),
rng = StableRNG(seed),
)

# Snatch the init data from the first available worker (for he cares not).
return get_val_from(
trainInfo.workers[1],
:(initGigaSOM($(trainInfo.val), $xdim, $ydim)),
)
numCodes = xdim * ydim
grid = gridRectangular(xdim, ydim)

# Initialize with an unbiased random gaussian with same mean/sd as the data
# in each dimension
codes = randn(rng, (numCodes, ncol))
for col = 1:ncol
codes[:, col] .*= sdevs[col]
codes[:, col] .+= means[col]
end

return Som(codes = codes, xdim = xdim, ydim = ydim, grid = grid)
end


"""
trainGigaSOM(som::Som, dInfo::LoadedDataInfo;
kernelFun::Function = gaussianKernel,
Expand Down
18 changes: 11 additions & 7 deletions src/base/dataops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,15 @@ function combine_stats((s1, sqs1, n1), (s2, sqs2, n2))
end

"""
dstat(dInfo::LoadedDataInfo, columns::Vector{Int})
dstat(dInfo::LoadedDataInfo, columns::Vector{Int})::Tuple{Vector{Float64}, Vector{Float64}}
Compute mean and standard deviation of the columns in dataset. Returns a tuple
with a vector of means in `columns`, and a vector of corresponding sdevs.
"""
function dstat(dInfo::LoadedDataInfo, columns::Vector{Int})
function dstat(
dInfo::LoadedDataInfo,
columns::Vector{Int},
)::Tuple{Vector{Float64},Vector{Float64}}

sum_squares = x -> sum(x .^ 2)

Expand All @@ -107,22 +110,23 @@ function dstat(dInfo::LoadedDataInfo, columns::Vector{Int})
(sums, sqsums, ns) = distributed_mapreduce(dInfo, get_stats, combine_stats)

return (
sums ./ ns, #means
sqrt.(sqsums ./ ns - (sums ./ ns) .^ 2), #sdevs
(sums./ns)[1, :], #means
(sqrt.(sqsums ./ ns - (sums ./ ns) .^ 2))[1, :], #sdevs
)
end

"""
dstat_buckets(dInfo::LoadedDataInfo, nbuckets::Int, buckets::LoadedDataInfo, columns::Vector{Int})
dstat_buckets(dInfo::LoadedDataInfo, nbuckets::Int, buckets::LoadedDataInfo, columns::Vector{Int})::Tuple{Matrix{Float64}, Matrix{Float64}}
A version of `dstat` that works with bucketing information (e.g. clusters).
A version of `dstat` that works with bucketing information (e.g. clusters);
returns a tuple of matrices.
"""
function dstat_buckets(
dInfo::LoadedDataInfo,
nbuckets::Int,
buckets::LoadedDataInfo,
columns::Vector{Int},
)
)::Tuple{Matrix{Float64},Matrix{Float64}}
# this produces a triplet of matrices (one row per bucket)
get_bucketed_stats =
(d, b) -> (
Expand Down
19 changes: 8 additions & 11 deletions src/base/structs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ Structure to hold all data of a trained SOM.
# Fields:
- `codes::Array{Float64,2}`: 2D-array of codebook vectors. One vector per row
- `colNames::Array{String,1}`: names of the attribute with which the SOM is trained
- `xdim::Int`: number of neurons in x-direction
- `ydim::Int`: number of neurons in y-direction
- `numCodes::Int`: total number of neurons
Expand All @@ -14,21 +13,19 @@ Structure to hold all data of a trained SOM.
3 columns (x,y,z) for spherical maps)
"""
mutable struct Som
codes::Array{Float64,2}
colNames::Array{String}
codes::Matrix{Float64}
xdim::Int
ydim::Int
numCodes::Int
grid::Array{Float64,2}
grid::Matrix{Float64}

Som(;
codes::Array{Float64} = Array{Float64}(0),
colNames::Array{String,1} = Array{String}(0),
xdim::Int = 1,
ydim::Int = 1,
numCodes::Int = 1,
grid::Array{Float64,2} = zeros(1, 1),
) = new(codes, colNames, xdim, ydim, numCodes, grid)
codes::Matrix{Float64},
xdim::Int,
ydim::Int,
numCodes::Int = xdim * ydim,
grid::Matrix{Float64},
) = new(codes, xdim, ydim, numCodes, grid)
end

"""
Expand Down
7 changes: 7 additions & 0 deletions test/refData/convert-gen-to-ref.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/sh
# Regenerate the reference CSVs from freshly generated test output.
# Every reference file is the first 11 lines of the matching file in
# ../genData (a header line plus ten rows, judging by the reference CSVs).

# Stop at the first failure and treat unset variables as errors.
set -eu

# Resolve all paths relative to this script so that it can be started from
# any working directory.
cd -- "$(dirname -- "$0")"

# make_ref GEN REF: copy the first 11 lines of ../genData/GEN.csv to REF.csv.
# The source is checked first so that a missing generated file does not
# truncate an existing reference file through the output redirection.
make_ref() {
    src="../genData/$1.csv"
    if [ ! -r "$src" ]; then
        echo "missing generated file: $src" >&2
        exit 1
    fi
    head -n 11 "$src" > "$2.csv"
}

make_ref batchWinners refBatchWinners
make_ref parallelWinners refParallelWinners
make_ref batchEmbedded refBatchEmbedded
make_ref parallelEmbedded refParallelEmbedded
make_ref batchDfCodes refBatchDfCodes
make_ref parallelDfCodes refParallelDfCodes
20 changes: 10 additions & 10 deletions test/refData/refBatchDfCodes.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
CD3,CD45,CD4,CD20,CD33,CD123,CD14,IgM,HLA_DR,CD7
-0.011651251029458529,0.002166440919825469,-0.016107773228903503,-0.02267979761084743,0.011138154608847282,0.00367645725668631,-0.008512117853380383,-0.025609555046142884,0.008291097574972593,-0.013889722801698756
-0.015951824546522195,-0.002697429870211619,-0.022851480129853097,-0.02338515467852531,0.007879183676403521,0.0033409250992698543,-0.010610960736475918,-0.025460157248426617,0.005161131828402003,-0.011165165902673387
-0.018460099430044313,-0.0053348922680251645,-0.027923343496897145,-0.02387883890843739,0.005214373295028115,0.0031519614803055137,-0.01374838399229439,-0.024558407368225888,0.0012409770364297595,-0.0062790908403473475
-0.020757029484934443,-0.008322760653328196,-0.03196278126936008,-0.023745826001870852,0.0033909949414117494,0.0037307918688530904,-0.01626242746857233,-0.022959834566592886,-0.001965983279393479,-0.0019169115424721547
-0.022995554919990426,-0.010606081583462008,-0.03427623370658472,-0.024576402052169207,0.0013448984967097538,0.004349501067549502,-0.016997607712597014,-0.02179101982437115,-0.004980685900616922,0.0015353782222695895
-0.024077115259944812,-0.011714510684187556,-0.03575164678974174,-0.02496006999043605,-0.0005615706517413436,0.005127613251056575,-0.016589968554867322,-0.02196648608402114,-0.008080566306154223,0.005078697268390525
-0.02280491786388036,-0.012088734144116473,-0.03502474167057429,-0.024776972198643336,-0.0030730612729294106,0.004979240652566521,-0.01736939673631981,-0.021418503939703726,-0.012416538137455671,0.008222161851395175
-0.021361705117632003,-0.010758591549803553,-0.03405254052307793,-0.023824976347450493,-0.004499831340097048,0.004694851720409449,-0.016685463945211376,-0.019512632412737783,-0.015737142966254614,0.01338769742196532
-0.019292562591419423,-0.009278789640504858,-0.03135687794812301,-0.022962237895561736,-0.006729952834781199,0.002878891330281074,-0.016658210623838203,-0.017903387380575037,-0.020056421918357603,0.019975333140029673
-0.016247269914707884,-0.00740984542325823,-0.028062505860498892,-0.02231750214052588,-0.009455430561750876,0.000786166616881858,-0.01656328758034033,-0.01601653921372662,-0.024373583393098794,0.025644294109764914
-0.020661387816701828,-0.016161120779088663,-0.01249623627759775,-0.0034145410849788877,0.014807586926559653,0.0046715057568052516,0.01579385076647087,0.0009447735391436881,0.02190928173933184,-0.015697400125440446
-0.02005093062889222,-0.014269035817494969,-0.013039351840076645,-0.002985186876171628,0.009609145977086965,0.00702660927384001,0.01650154435760052,0.0012497263037976013,0.018304252246390512,-0.01903168396592879
-0.018788471362745307,-0.011674867151168998,-0.013892318528007385,-0.0028942218598770036,0.005187724890544429,0.009408032474714481,0.01738428612136856,0.0008292768855502434,0.014789337846995957,-0.022451985720898553
-0.017586974966765687,-0.009684792719596318,-0.014788360098695824,-0.002844458402612456,0.0011821276775411634,0.011058966466700413,0.016998801810468454,0.0001696701916532979,0.010787772791408978,-0.025702596416148988
-0.021236637394984326,-0.010443009847073298,-0.016317806826461025,-0.001040240702235489,0.0017567656911396936,0.016129470212235868,0.019983499116639214,0.0024218829121462012,0.01348036066165613,-0.027178442918584813
-0.02323930293506252,-0.010343130170832203,-0.0160472130856683,0.0005814235461290909,0.0022876975871033013,0.018752355039775046,0.022050505097252898,0.003382194826886934,0.014486413381663501,-0.027242157143555618
-0.022565262666134848,-0.008067661973959712,-0.014523685185202789,0.0017885539541355761,0.002905441205575069,0.019318470482428095,0.023260382337895385,0.0038037239985614377,0.014250101736801073,-0.026384829504991988
-0.021386868010550755,-0.006582720125070556,-0.013742927636424877,0.0025294938554544506,0.002019437034627474,0.020238852219692446,0.02359309239589711,0.004139563745452907,0.013154023718769371,-0.022919034661042337
-0.019685222773768392,-0.005260420938398561,-0.01263897655387842,0.0018577552418864714,0.0007079395479014865,0.020240301907265355,0.023509393021684127,0.0031884247090666435,0.010853678978840937,-0.019119186692227116
-0.01817652298806144,-0.004568729107711974,-0.012247247192667432,0.0019481924139792522,-0.0017780017327565826,0.02090332452866241,0.020879877315333684,0.002788258677921433,0.007710635361434031,-0.014495702832397588
20 changes: 10 additions & 10 deletions test/refData/refBatchEmbedded.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
x1,x2
1.5001213674422897,8.880687995083614
1.2237981567666019,8.782808680684086
5.6341242988459594,0.003916876256086727
0.34120940110869247,7.976589652696513
7.414592953032319,0.12429919116488143
3.5053583626488267,0.08500795962385448
8.741142530996362,4.582317518346491
8.781770646925953,1.1656467549268985
9.15192688427219,0.07446585923210677
4.115778473902338,0.0563890768039573
7.853292848987698,8.720410251312108
7.76525592228932,8.764961186548483
0.05420208119934999,7.128494184649301
7.662565234267279,8.789346146054893
0.2313774377228306,7.992453408932434
6.718647228133474,8.71364554593444
7.310434122948076,8.85349622911063
1.1640093106430396,8.780329024131897
7.085379535308563,0.12394846652869095
5.982920181010032,4.171484706899294
12 changes: 6 additions & 6 deletions test/refData/refBatchWinners.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
index
92
100
100
91
6
99
91
97
99
91
10
4
10
10
6
5
20 changes: 10 additions & 10 deletions test/refData/refParallelDfCodes.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
CD3,CD45,CD4,CD20,CD33,CD123,CD14,IgM,HLA_DR,CD7
-0.17640967142561337,0.4685075475625074,-0.563492455862733,-0.22178816350494124,0.5017208417273646,-0.03261443423876599,-0.3529187887429754,-0.290477457805388,0.8104538714781182,-0.998356121477298
-0.7831021091922231,-0.1387601640904325,-0.7141402998872505,-0.1543993358965959,-0.023269147383249467,0.1584326532614666,-0.022852972837362744,-0.2932731477925736,0.5757650485172153,-1.0812869624104249
0.008927988103811249,0.4091058632556585,-0.8722452167373481,-0.4585758423921651,-0.40986988557257265,-0.36638465240840806,-0.5493908695156264,-0.37945184407422816,-0.2676889658062833,-0.9016287629952086
-0.4110168375156833,-0.24935670477094934,-0.8579259483467333,0.4579877893841562,0.49735708414528346,-0.0011058499284762261,-0.6880093365699184,-0.17897455589509492,0.3289355183541678,-0.4719020523756622
-1.0226094296799497,-0.9840566016813554,-0.7097172391199558,-0.16485480747210984,0.11492309924943464,0.22047196454693038,-0.502125790229743,-0.02049200390243572,0.33706614752231373,-0.894516830707265
-1.2095810804913587,-1.1896783882789272,-0.81483190688843,-0.35858500308536834,-0.0625575067983978,0.6661787916264342,0.144441488758504,-0.23851684605223872,0.043665352825085546,-0.3727782223859913
-0.642941166764806,-1.5094528640377423,-0.7536810820703415,-0.37265948389744713,-0.4882089819026275,0.01753753455821686,-0.44332239042127136,-0.2862474935367596,-0.606802065413238,-0.6506740054231588
-0.5723070637131475,-0.23835188752416503,-0.9230290505068274,-0.4248295627471601,-0.2902891415288133,1.0892987177717286,0.11766517398469416,-0.20569072343465256,-0.2609366770976574,0.43842780024682604
-1.1410063412612652,-0.541407804316432,-0.8366965863280671,-0.30047779713732037,0.08012450796075446,0.13989080364661932,-0.0348520560461668,-0.27298795522471936,-0.09480486991814785,0.9046371040902668
-0.32086931489919535,0.008572799776121031,-0.7726234805473643,-0.31124537949637376,-0.3731914270086051,-0.2715889271941862,-0.3089612139429518,-0.2100110845003733,-0.43372445501215534,0.9200262143737115
-0.8018623096228251,-0.8369378251602092,-0.42235875792375416,0.03181298171449356,1.1023176364122644,-0.1963262502082471,0.14292286933337084,-0.04495525968748685,0.9488704015837622,-0.9502693235574208
-0.5625521973628088,-2.0296778784502623,-0.35526879305589104,-0.3107117523629649,-0.691540155508728,-0.4093496225834384,0.4523724271046617,-0.009974066523155774,-0.4068308049793661,-0.6095350731407303
0.5276213762346692,-0.13928455906903522,0.0260388772472196,-0.5305729781019549,-0.3946096763960851,-0.5826133567572871,1.261368184898035,-0.37813700493775537,0.6684088483586492,-0.5842763194864915
-0.49203525558430333,0.006125170365176934,-0.7177418471018223,-0.34790913951628977,-0.44641248247014126,-0.40822054375201805,-0.19201882406973556,-0.3264692598105841,-0.4422401708848413,-1.0204324390416406
-0.7946172931478637,-0.04379896708998755,-0.8236749140239338,-0.38413984385557476,-0.4578755791225342,-0.006919619778465148,-0.1544742520669272,-0.24564204881872534,-0.14253950453684816,-1.0500842859108013
-1.0486033461015627,-0.8956741167306301,-0.6379896360088917,-0.12698033338253267,-0.2186917578539655,1.2213823663626238,0.32907333598261723,-0.12740051860331436,0.32155185312345375,-0.956097609243189
-0.5974402427459592,0.39237883711545724,-0.07883772208890998,0.4015202131734113,0.8479696160880946,0.8049105112428624,1.1140073920103057,-0.09581685765991838,1.033044071876181,-1.174732014176624
-0.7872516308600919,0.21897600381375384,-0.4402878230721225,2.163471850204101,0.5809775152776391,0.7979704573811573,0.5718975450381883,2.3417189247773846,1.5471819106506124,-0.9461146224865068
0.039924351634507416,0.3443437618857008,-0.13265299964277888,-0.08261247799182267,-0.039537534454404076,0.5981065374507042,2.2159468572244885,-0.01778470058774384,0.13892743607012503,0.14216618054090033
-0.5915097660623094,0.07164712471285221,-0.6202069574325458,0.2942533083281945,-0.371048038863417,1.571561436383151,0.5579442193175757,0.5739029477810871,-0.07997738337816146,0.41916332336585954
20 changes: 10 additions & 10 deletions test/refData/refParallelEmbedded.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
x1,x2
1.6744019997736426,8.103683903608214
1.7010846112311093,8.401485398046395
7.015264642225017,3.5344531046086956
2.7854112914578004,6.833764156606785
8.544287420382766,0.8639031797261263
7.203534969749915,5.661315702180364
8.155603686885433,2.180030431709865
8.412521380846723,2.1079805646140874
9.029942763218518,4.162009375605646
2.8330767297731123,2.1536182096362584
8.019707426546399,7.484522208408344
6.635774256956503,7.521585380705817
2.7931129447029233,5.013450509762322
3.3489279330827224,5.592905722888017
3.4888898275483258,8.60900213852842
5.5620028677248365,5.334199616261413
5.307053172716869,8.604732622951328
3.8286893566636997,8.88841904590264
5.737121198824499,2.8540970674331145
2.567787983594243,1.6517987546509052
18 changes: 9 additions & 9 deletions test/refData/refParallelWinners.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
index
92
92
66
66
10
68
99
99
91
63
96
97
97
91
30
10
90
3
13
6 changes: 2 additions & 4 deletions test/testBatch.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@

@testset "Single-CPU batch processing" begin

Random.seed!(1)
som = initGigaSOM(pbmc8_data, 10, 10)
som = initGigaSOM(pbmc8_data, 10, 10, seed = 1234)

#check whether the distributed version works the same
save_at(1, :test, pbmc8_data)
Random.seed!(1)
som2 = initGigaSOM(LoadedDataInfo(:test, [1]), 10, 10)
som2 = initGigaSOM(LoadedDataInfo(:test, [1]), 10, 10, seed = 1234)
@test som.codes == som2.codes
remove_from(1, :test)

Expand Down
Loading

0 comments on commit f4e712b

Please sign in to comment.