Skip to content

Commit

Permalink
added DupReads correction function similair to FastQC.
Browse files Browse the repository at this point in the history
  • Loading branch information
MoSafi2 committed May 13, 2024
1 parent e9da399 commit 0ea0b2e
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 43 deletions.
2 changes: 1 addition & 1 deletion blazeseq/record.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ struct FastqRecord(Sized, Stringable, CollectionElement):

@always_inline
fn hash[bits: Int = 3](self) -> UInt64:
"""Hashes the first xx bp (if possible) into one 64bit. Max length is 64/nBits per bp"""
"""Hashes the first xx bp (if possible) into one 64bit. Max length is 64/nBits per bp."""
var hash: UInt64 = 0
var rnge: Int = 64 // bits
var mask = (0b1 << bits) - 1
Expand Down
121 changes: 79 additions & 42 deletions blazeseq/stats.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ struct FullStats(Stringable, CollectionElement):


@always_inline
fn plot(self) raises:
fn plot(inout self) raises:
self.bp_dist.plot()
self.cg_content.plot()
self.len_dist.plot()
Expand Down Expand Up @@ -181,12 +181,21 @@ struct CGContent(Analyser):
struct DupReads(Analyser):
var unique_dict: Dict[FastqRecord, Int64]
var unique_reads: Int
var count_at_max: Int
var n: Int
var corrected_counts: Dict[Int, Float64]

fn __init__(inout self):
self.unique_dict = Dict[FastqRecord, Int64]()
self.unique_reads = 0
self.count_at_max = 0
self.n = 0
self.corrected_counts = Dict[Int, Float64]()

fn tally_read(inout self, record: FastqRecord):

self.n += 1

if record in self.unique_dict:
try:
self.unique_dict[record] += 1
Expand All @@ -195,11 +204,66 @@ struct DupReads(Analyser):
print("error")
pass

if self.unique_reads < MAX_READS:
if self.unique_reads <= MAX_READS:
self.unique_dict[record] = 1
self.unique_reads += 1
if self.unique_reads == MAX_READS:
self.count_at_max = self.n
else:
pass



fn predict_reads(inout self):
#Construct Duplication levels dict
var dup_dict = Dict[Int, Int]()
for entry in self.unique_dict.values():
if int(entry[]) in dup_dict:
try:
dup_dict[int(entry[])] += 1
except:
print("error")
else:
dup_dict[int(entry[])] = 0


# Correct reads levels
var corrected_reads = Dict[Int, Float64]()
for entry in dup_dict:
try:
var count = dup_dict[entry[]]
var level = entry[]
var corrected_count = self.correct_values(level, count, self.count_at_max, self.n)
corrected_reads[level] = corrected_count
except:
print("Error")

self.corrected_counts = corrected_reads

#Check how it is done in Falco.
@staticmethod
fn correct_values(dup_level: Int, count_at_level: Int, count_at_max: Int, total_count: Int ) -> Float64:
if count_at_max == total_count:
return count_at_level

if total_count - count_at_level < count_at_max:
return count_at_level

var pNotSeeingAtLimit: Float64 = 1
var limitOfCaring = Float64(1) - (count_at_level / (count_at_level + 0.01))

for i in range(count_at_max):
pNotSeeingAtLimit *= ((total_count -i ) - dup_level) / (total_count - i)

if pNotSeeingAtLimit < limitOfCaring:
pNotSeeingAtLimit = 0
break

var pSeeingAtLimit:Float64 = 1 - pNotSeeingAtLimit
var trueCount = count_at_level / pSeeingAtLimit

return trueCount


fn report(self) -> Tensor[DType.int64]:
var report = Tensor[DType.int64](1)
Expand All @@ -210,12 +274,19 @@ struct DupReads(Analyser):
return String("\nNumber of duplicated reads is") + self.report()


fn plot(self) raises:
var x = self.unique_dict.values()
var temp_tensor = Tensor[DType.int64](len(x))
for i in range(len(x)):
temp_tensor[i] = x.__next__()[]

fn plot(inout self) raises:
print("Enterd Plotting")
self.predict_reads()
print("Predicted Reads")
var temp_tensor = Tensor[DType.int64](len(self.corrected_counts) * 2 + 1)
var i = 0
for index in self.corrected_counts:
print(index[])
print(self.corrected_counts[index[]])
temp_tensor[i * 2] = index[]
temp_tensor[i * 2 + 1] = self.corrected_counts[index[]]
i += 1

var np = Python.import_module("numpy")
var arr = tensor_to_numpy_1d(temp_tensor)
np.save("arr_DupReads.npy", arr)
Expand Down Expand Up @@ -413,40 +484,6 @@ struct KmerContent[bits: Int = 3](Analyser):
fn __str__(self) -> String:
return String("\nhash count table is ") + str(self.hash_counts)


# @value
# struct DuplicateReads(Analyser):
# var dup_reads: Tensor[DType.int64]
# var hashes: Tensor[DType.uint64]

# fn __init__(inout self):
# self.dup_reads = Tensor[DType.int64](TensorShape(100_000), 0)
# self.hashes = Tensor[DType.uint64](TensorShape(100_000), 0)

# @always_inline
# fn tally_read(inout self, record: FastqRecord):
# var index = int(record.hash() % 100_000)

# #New hash
# if self.hashes[index] == 0:
# self.hashes[index] = record.hash()
# self.dup_reads[index] += 1
# else:
# if self.hashes[index] == record.hash():
# self.dup_reads[index] += 1


# fn plot(self) raises:
# var np = Python.import_module("numpy")
# var arr = tensor_to_numpy_1d(self.dup_reads)
# np.save("arr_DupReads.npy", arr)

# fn report(self) -> Tensor[DType.int64]:
# return self.dup_reads

# fn __str__(self) -> String:
# return String("\nDuplicate reads is: ") + str(self.dup_reads)


# TODO: Make this also parametrized on the number of bits per bp
fn _seq_to_hash(seq: String) -> UInt64:
Expand Down

0 comments on commit 0ea0b2e

Please sign in to comment.