-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmer_analysis.R
405 lines (325 loc) · 23.5 KB
/
kmer_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
#!/usr/bin/env Rscript
# Best run on cluster (it will probably take > 1day run locally..!) and one file per job.
# GenomicRanges, GenomicFeatures, rtracklayer and BSgenome.Mmusculus.UCSC.mm10 are available through Bioconductor (BiocManager::install())
# tidyverse and parallel packages available through CRAN (install.packages())
# Example command to submit to Babraham cluster:
# ssub -c 4 --mem=50G --email -o kmer_analysis.out Rscript --vanilla kmer_analysis.R output_dir=kmer_results/ ~/Annotations/Mus_musculus.GRCm38.96.gtf.gz ptbp1-cd8-24h-rep3_mapped_to_genome_single_peaks_annotated.tsv.gz k=8 p=4 window_size=31 sample_features=T sample_dataset=10000
# Except for GTF file and file to analyse which must be provided, all other arguments have defaults (see below)
# Based on method from Wang, Ule et al, PLoS Biol, 2010, iCLIP predicts the dual splicing effects of TIA-RNA interactions, PMID: 21048981:
# (i) iCLIP reads were associated with expressed genomic regions as defined by ENSEMBL (version Hg18/NCBI36). Each coding or non-coding gene was defined as its own region (in case of overlapping genes, the shorter gene always had the priority). Introns, 5' UTR, ORF and 3' UTR were considered as separate regions. (ii) iCLIP reads antisense to the transcriptional direction of the associated gene and reads that mapped to non-annotated genomic regions were removed before proceeding to further analysis. (iii) The control files were generated 100 times with randomised iCLIP positions. iv) Both in iCLIP and control files, the positions were extended by 10 nt in both directions, such that 21 nt long sequences were used for analysis. v) The occurrence (pentamer frequency) was calculated for each pentamer in each file. vi) The z-score was calculated for each pentamer as:
#(occurrence in iCLIP sequences - average occurrence in control sequences) / standard deviation of occurrence in control sequences
# One issue I came across for larger values of k was that very occasionally, no occurrences were found in the randomised sites.
# This was when running tests with only a few sample sets, but is theoretically possible (and particularly if subsampling?)
# One way to address this would be to add 1 to the sd - but decided better to just set these to NA as we cannot calculate accurately.
# Takes annotated crosslink site files, which should include gene/feature annotations, and assesses enrichment of kmers relative to random positions
# Positions are randomised across all genes with at least one xlink site, with the same distribution across features as in the iCLIP dataset
# ie, the random sites will include the same number of sites within the 3'UTR as there are 3'UTR xlink sites, etc.
# Input files must include a GTF file (ideally the same as was used for iCLIP data annotation)
# One or more iCLIP xlink files can be analysed - but note that it takes quite a long time to run so may be more advisable to run one at once (esp for large k).
# They should be tab-delimited text files containing significant crosslink sites - ie width 1, FDR < 0.05, not non-significant or clusters
# Column headers must include chromosome, either start or position, strand, Gene_name, Feature
# BED files directly from iMaps are not suitable as they are not annotated with genes and features - first run iCLIP_Genialis_feature_annotation.R
# On the command line, argument with extension .gtf(.gz) will be recognised as the GTF file
# Anything else that doesn't correspond to another argument will be treated as a file to analyse
# Users can define the size of the kmers as [k] (default 5) by including the argument k=[k]
# Users can define the size of the window assessed [w] (default 21, ie 10 bases up/downstream of site)
# by including the argument window_size=[w]
# By default will analyse all crosslink sites in the file.
# If instead you want to analyse a random subset of size [s], include the argument sample_dataset=[s]
# This would be useful eg for more consistent comparison between datasets. The individual feature zscores would still not be identical in size, but for the
# same type of data would likely be similar. If [s] cannot be coerced to numeric, it will be ignored.
# The sampling is done after removing non-coding/intergenic sites and unlocalised chromosome scaffolds (so need to take this into account when choosing the size),
# therefore for the whole gene, the number of sites considered will be identical if the same value of [s] is used across different datasets, as long as there are enough.
# Note that if there are not sufficient sites in the dataset, all sites will be retained; this will be flagged in the R output file.
# Including the argument sample_features=T will result in an additional column per feature, where all are downsampled to the smallest number of crosslink sites.
# NB if used together, the dataset sample will be taken first, so feature samples could end up quite small!
# Alternatively, sample_features=[f] will attempt to downsample all features to size [f], or take all crosslink sites if there are fewer than [f]
# If you use this, check the R output file as this will report if any features did not have sufficient crosslink sites.
# Intergenic xlink sites and those in ncRNAs will not be included in the analysis.
# Other feature types will be analysed separately (CDS, 3'UTR, 5'UTR, Intron).
# Additionally, zscore for all together will also be calculated.
# For each input file, an output file corresponding to the input file name will be saved in the output directory -
# defined using the argument output_dir=path/to/directory (default is current working directory - which will be the
# directory from which you submit the job if on the cluster).
# File extension of input filename will be replaced in output with eg "_5mer_zscores.txt" (where 5 is replaced by value of k)
# Contains kmers in column 1, then additional columns with z scores for all feature types together, or each separately
# To use parallel processing (will speed it up), set the number of threads to parallelise over [p] as p=[p]
# This is not possible on Windows where p must equal 1 (default).
# On the cluster, p can equal 2x number of cores specified to ssub (as each can handle 2 threads)
# kmers containing 'N' are removed from the output (sometimes found in introns)
# Note that occasionally the cluster job appears to 'fail' with an error message at the end of the log file - but having
# already finished processing and writing out the output.
# There is therefore a line printed when processing of each file is started and finished - if 'Finished processing [filename]'
# appears in the log file, the output for that file was successfully written out before it 'failed'.
library(GenomicRanges)
library(rtracklayer)
library(GenomicFeatures)
library(BSgenome.Mmusculus.UCSC.mm10)
library(parallel)
library(tidyverse)
#########################################################################################
# Code chunk 1a: Parse arguments - comment out 1b if reading arguments from command line
#########################################################################################
Args = commandArgs(trailingOnly=TRUE)
# For testing:
#Args <- c("iCLIP_combined_replicates.txt", "Mouse_GRCm38.90_chr17_100kb.gtf")
gtf_file <- Args[grepl("gtf$|gtf.gz$", Args)]
if(length(gtf_file) > 1) {
warning("More than one GTF file provided; using only first listed")
gtf_file <- gtf_file[1]
}
if(length(gtf_file) == 0) {
stop("No GTF file provided")
}
files_to_analyse <- Args[!(grepl("^k=|^output_dir=|^window_size=|^p=|^sample_features=|^sample_dataset=", Args) | grepl("gtf$|gtf.gz$", Args))]
if(length(files_to_analyse) == 0) {
stop("No files provided for analysis")
}
if(sum(grepl("^k=", Args)) > 0) {
k <- as.numeric(sub("^k=", "", Args[grepl("k=", Args)]))
} else { k <- 5 }
if(sum(grepl("^p=", Args)) > 0) {
p <- as.numeric(sub("^p=", "", Args[grepl("p=", Args)]))
} else { p <- 1 }
if(sum(grepl("^window_size=", Args)) > 0) {
window_size <- as.numeric(sub("window_size=", "", Args[grepl("^window_size=", Args)]))
} else { window_size <- 21 }
if(sum(grepl("^output_dir=", Args)) > 0) {
output_dir <- sub("output_dir=", "", Args[grepl("^output_dir=", Args)])
} else { output_dir <- "." }
if(sum(grepl("^sample_features=", Args)) > 0) {
sample_features <- sub("^sample_features=", "", Args[grepl("sample_features=", Args)])
if(sample_features == "T") {
sample_features <- T
} else {
sample_features <- as.numeric(sample_features)
}
} else { sample_features <- NA }
if(sum(grepl("^sample_dataset=", Args)) > 0) {
sample_dataset <- as.numeric(sub("sample_dataset=", "", Args[grepl("^sample_dataset=", Args)]))
} else { sample_dataset <- NA }
########################################################################################
# Code chunk 1b: Provide arguments - comment out 1a if providing the arguments yourself
########################################################################################
# Uncomment lines below and modify as appropriate:
# gtf_file <- "path/to/gtffile.gtf.gz"
# files_to_analyse <- c("path/to/textfile") # more than one can be provided, but will take a long time to run!!
# k <- 5 # size of kmer to search for
# window_size <- 21 # size of extended window around each site
# output_dir <- "." # change if you want output saved elsewhere
# p <- 1 # Number of threads to parallelise over - on Windows must equal 1, parallel processing not possible
# sample_dataset <- NA
# sample_features <- NA
###################################################################################################################
# Code chunk 2: Define all possible kmers of length k
###################################################################################################################
tibble(var = c("A", "C","G","T")) -> all_kmers
for(position in 2:k) { all_kmers %>% add_column(var = c("A","C","G","T"), .name_repair = "unique") -> all_kmers}
all_kmers %>%
expand.grid() %>%
unite("kmer", everything(), sep = "") %>%
deframe() ->
all_kmers
###################################################################################################################
# Code chunk 3: Import GTF file, extract features into separate GRanges objects for each type (excluding low TSL)
###################################################################################################################
gtf <- import(gtf_file)
seqlevelsStyle(gtf) <- "UCSC"
gtf <- gtf[seqnames(gtf) %in% seqnames(BSgenome.Mmusculus.UCSC.mm10)]
# compatible with either/both/neither having chr in names.
# won't work for MT but I think for this it's fine to exclude
# Some GTF file eg from Gencode have 'gene_type' and 'transcript_type' instead of 'gene/transcript_biotype'
# Also have 'ccdsid' rather than 'ccds_id'
# So we need to rename these if that's the case to ensure the rest of the code works
names(mcols(gtf))[names(mcols(gtf)) == "transcript_type"] <- "transcript_biotype"
names(mcols(gtf))[names(mcols(gtf)) == "gene_type"] <- "gene_biotype"
names(mcols(gtf))[names(mcols(gtf)) == "ccdsid"] <- "ccds_id"
# First, eliminate all transcript isoforms that have a TSL > 3 (or NA)
# transcript_support_level in the gtf file (at least v101 which I'm using to test..!) contains many annotations where the TSL is followed by a string in brackets, eg:
# "1 (assigned to previous version 13)". So first need to strip these out, before converting to numeric and selecting only those <=3
gtf$transcript_support_level <- as.numeric(sub(" .*$", "", gtf$transcript_support_level))
gtf <- gtf[!is.na(gtf$transcript_support_level)]
gtf <- gtf[gtf$transcript_support_level <=3]
# Include immunoglobulin & Tcr genes in protein coding ones. Exclude anything with pseudogene in transcript_biotype, but include any isoforms with protein_coding as transcript_biotype (can be designated pseudogenes if you just look at gene_biotype)
pc_genes <- gtf[(gtf$gene_biotype=="protein_coding"| grepl("^[IT][GR]_.*_gene$",gtf$gene_biotype) | gtf$transcript_biotype %in% "protein_coding") & !(grepl("pseudogene", gtf$transcript_biotype))]
## so this is taking everything that has a GENE biotype of protein_coding, and all transcripts from that gene regardless of whether they are coding or not (unless transcript biotype includes pseudogene).
# When you extract CDS/UTRs, this will ignore the non-coding isoforms - but introns would be taken from both when extracted from TxDb.
# I think it's best to remove non-coding isoforms, based on all transcript IDs for which there is >=1 CDS annotation (NB coding will include eg NMD transcript biotype)
all_coding_isoforms <- na.omit(unique(pc_genes$transcript_id[pc_genes$type == "CDS"]))
# Create TxDb object (just coding isoforms of pc_genes), then CDS/UTRs, introns can be extracted.
TxDb_pc <- makeTxDbFromGRanges(pc_genes[pc_genes$transcript_id %in% all_coding_isoforms]) # this will not include gene body annotation since no transcript ID, but does not matter for extracting gene segments
# Warning message:
# In .get_cds_IDX(type, phase) :
# The "phase" metadata column contains non-NA values for features of type stop_codon. This information was ignored.
# extract the different features in different GR objects from TxDb objects
CDS <- cdsBy(TxDb_pc, use.names = T)
CDS <- unlist(CDS)
mcols(CDS) <- NULL
CDS$Transcript_ID <- names(CDS)
CDS$Gene_ID <- gtf$gene_id[match(CDS$Transcript_ID, gtf$transcript_id)]
CDS$Gene_name <- gtf$gene_name[match(CDS$Transcript_ID, gtf$transcript_id)]
ThreeUTR <- threeUTRsByTranscript(TxDb_pc, use.names=T)
ThreeUTR <- unlist(ThreeUTR) # then names(ThreeUTR) can access the transcript IDs
mcols(ThreeUTR) <- NULL
ThreeUTR$Transcript_ID <- names(ThreeUTR)
ThreeUTR$Gene_ID <- gtf$gene_id[match(ThreeUTR$Transcript_ID, gtf$transcript_id)]
ThreeUTR$Gene_name <- gtf$gene_name[match(ThreeUTR$Transcript_ID, gtf$transcript_id)]
FiveUTR <- fiveUTRsByTranscript(TxDb_pc, use.names=T)
FiveUTR <- unlist(FiveUTR)
mcols(FiveUTR) <- NULL
FiveUTR$Transcript_ID <- names(FiveUTR)
FiveUTR$Gene_ID <- gtf$gene_id[match(FiveUTR$Transcript_ID, gtf$transcript_id)]
FiveUTR$Gene_name <- gtf$gene_name[match(FiveUTR$Transcript_ID, gtf$transcript_id)]
Intron <- intronsByTranscript(TxDb_pc,use.names=T)
Intron <- unlist(Intron)
mcols(Intron) <- NULL
Intron$Transcript_ID <- names(Intron)
Intron$Gene_ID <- gtf$gene_id[match(Intron$Transcript_ID, gtf$transcript_id)]
Intron$Gene_name <- gtf$gene_name[match(Intron$Transcript_ID, gtf$transcript_id)]
# I think we want to only search within genes with at least one xlink (surrogate for expression - if you had expression data could use that instead)
# Then want to get same relative proportions from the different feature types.
# Elisa had excluded the extended regions around sites from being allowed to be used in the random selection - but I think just leave them in.
# This gives a truly 'random' selection of sites, as opposed to a comparison of bound vs non-bound - which I think is reasonable
####################################################################################################################################
# Code chunk 4: Import and analyse kmer frequency/zscores for each xlink file in turn
####################################################################################################################################
window_size <- if_else(window_size %% 2 == 0, window_size + 1, window_size)
half_window <- window_size %/% 2
substring_startL <- 1+window_size-k
for(f in files_to_analyse) {
print(paste("Starting analysis of", f))
set.seed(123)
dataset_name <- sub("^.*/", "", sub("....$", "", sub(".gz", "", f)))
all_sites <- list()
##############################################################################################################
# 4a: Import xlink files for analysis; filter feature GRanges to only include genes with xlink sites
##############################################################################################################
read_tsv(f) %>%
rename_with(~ tolower(.x)) %>%
filter(!feature %in% c("ncExon", "ncIntron", "Intergenic")) %>%
filter(chromosome %in% seqnames(BSgenome.Mmusculus.UCSC.mm10)) -> xlink_sites
if(!is.na(sample_dataset)) {
if(sample_dataset <= nrow(xlink_sites)){
xlink_sites %>%
slice(sample(1:nrow(xlink_sites), sample_dataset)) -> xlink_sites
} else {
print("\n")
print(paste("*** File", f, "has", nrow(xlink_sites), "xlink sites; insufficient to downsample to size", sample_dataset, "; Retaining all ***"))
print("\n")
}
}
xlink_sites %>%
mutate(feature = factor(feature, levels = c("CDS", "UTR3", "UTR5", "Intron"))) %>%
arrange(feature) -> xlink_sites
feature_types <- as.character(xlink_sites$feature)
if(!is.na(sample_features)) {
sapply(unique(feature_types), function(x) sum(feature_types == x)) -> feature_counts
if(is.numeric(sample_features)) {
names(feature_counts)[which(feature_counts < sample_features)] -> too_short
if(length(too_short) > 0) {
print("\n")
print(paste("***", paste(too_short, collapse = ", "), "features have less than", sample_features, "xlink sites in file", f, "; downsampling only performed for other features ***"))
print("\n")
}
} else {
min(feature_counts) -> sample_features
}
}
if("start" %in% colnames(xlink_sites)) {
if("end" %in% colnames(xlink_sites)) {
all_sites[["XLsites"]] <- with(xlink_sites, GRanges(chromosome, IRanges(start, end), strand = strand))
} else {
all_sites[["XLsites"]] <- with(xlink_sites, GRanges(chromosome, IRanges(start, start), strand = strand))
}
} else if("position" %in% colnames(xlink_sites)) {
all_sites[["XLsites"]] <- with(xlink_sites, GRanges(chromosome, IRanges(position, position), strand = strand))
} else {
stop("Xlink site position column not identified - header should contain 'start' or 'position'")
}
CDS_subset <- GPos(GenomicRanges::reduce(CDS[CDS$Gene_name %in% xlink_sites$gene_name]))
ThreeUTR_subset <- GPos(GenomicRanges::reduce(ThreeUTR[ThreeUTR$Gene_name %in% xlink_sites$gene_name]))
FiveUTR_subset <- GPos(GenomicRanges::reduce(FiveUTR[FiveUTR$Gene_name %in% xlink_sites$gene_name]))
Intron_subset <- GPos(GenomicRanges::reduce(Intron[Intron$Gene_name %in% xlink_sites$gene_name]))
######################################################################################################################
# 4b: Generate 100 sets of randomly sampled sites with the same distribution between feature types as the xlink sites
######################################################################################################################
for(i in 1:100) {
as(c(
sample(CDS_subset, sum(xlink_sites$feature == "CDS")),
sample(ThreeUTR_subset, sum(xlink_sites$feature == "UTR3")),
sample(FiveUTR_subset, sum(xlink_sites$feature == "UTR5")),
sample(Intron_subset, sum(xlink_sites$feature == "Intron"))
), "GRanges") -> all_sites[[paste0("sample", i)]]
}
######################################################################################################################
# 4c: Create extended windows around xl/sampled sites and calculate frequency of kmers
######################################################################################################################
mclapply(all_sites, function(x) {
x %>%
resize(window_size, fix = "center") -> sites
getSeq(BSgenome.Mmusculus.UCSC.mm10, sites, as.character = T) %>%
lapply(function(w) as_tibble_col(unique(substring(w, 1:substring_startL, k:window_size)), column_name = "kmer")) -> seq_substrings
seq_substrings %>%
split(feature_types) -> seq_substrings
if(!is.na(sample_features)) {
seq_substrings %>%
lapply(function(x) sample(x, min(sample_features, length(x)))) -> downsampled_substrings
seq_substrings %>%
unlist(recursive = F) %>%
sample(min(nrow(xlink_sites), sample_features)) -> downsampled_substrings$Gene
names(downsampled_substrings) <- sapply(names(downsampled_substrings), function(x) paste0(x, "_downsampled"))
seq_substrings <- c(seq_substrings, downsampled_substrings)
}
seq_substrings %>%
lapply(bind_rows) %>%
bind_rows(.id = "feature") %>%
group_by(kmer, feature) %>%
summarise(frequency = n()) %>%
ungroup()
}, mc.cores = p) %>%
bind_rows(.id = "Sample") -> kmer_usage
##############################################################################################################
# 4d: Calculate z-scores and write out
##############################################################################################################
kmer_usage %>%
pivot_wider(names_from = feature, values_from = frequency) %>%
mutate(across(where(is.numeric), ~ if_else(is.na(.x), 0, as.numeric(.x)))) %>%
# If a kmer is not there in any sample or the xlinks, it will be ignored, otherwise add 0s where not found
mutate(Gene = CDS+Intron+UTR3+UTR5) %>%
pivot_longer(-c(Sample, kmer), names_to = "feature", values_to = "Frequency") %>%
mutate(Type = if_else(Sample == "XLsites", "XLsites", "Random")) %>%
group_by(Type, kmer, feature) %>%
summarise(Mean = mean(Frequency), StDev = sd(Frequency)) %>%
ungroup() %>%
#mutate(StDev = StDev + 1) %>% # Avoids issue of having s.d. = 0 if no samples have the kmer - but instead of this now converting Inf to NA later
pivot_wider(id_cols = kmer, names_from = c(feature, Type), values_from = c(Mean, StDev)) %>%
mutate(
zscore_gene = (Mean_Gene_XLsites - Mean_Gene_Random)/StDev_Gene_Random,
zscore_3UTR = (Mean_UTR3_XLsites - Mean_UTR3_Random)/StDev_UTR3_Random,
zscore_CDS = (Mean_CDS_XLsites - Mean_CDS_Random)/StDev_CDS_Random,
zscore_5UTR = (Mean_UTR5_XLsites - Mean_UTR5_Random)/StDev_UTR5_Random,
zscore_Intron = (Mean_Intron_XLsites - Mean_Intron_Random)/StDev_Intron_Random
) -> kmer_usage
if(!is.na(sample_features)) {
kmer_usage %>%
mutate(
zscore_gene_downsampled = (Mean_Gene_downsampled_XLsites - Mean_Gene_downsampled_Random)/StDev_Gene_downsampled_Random,
zscore_3UTR_downsampled = (Mean_UTR3_downsampled_XLsites - Mean_UTR3_downsampled_Random)/StDev_UTR3_downsampled_Random,
zscore_CDS_downsampled = (Mean_CDS_downsampled_XLsites - Mean_CDS_downsampled_Random)/StDev_CDS_downsampled_Random,
zscore_5UTR_downsampled = (Mean_UTR5_downsampled_XLsites - Mean_UTR5_downsampled_Random)/StDev_UTR5_downsampled_Random,
zscore_Intron_downsampled = (Mean_Intron_downsampled_XLsites - Mean_Intron_downsampled_Random)/StDev_Intron_downsampled_Random
) -> kmer_usage
colnames(kmer_usage)[colnames(kmer_usage) == "zscore_gene_downsampled"] <- paste0("zscore_gene_downsampled_", min(sample_features, sum(feature_counts)))
colnames(kmer_usage)[colnames(kmer_usage) == "zscore_3UTR_downsampled"] <- paste0("zscore_3UTR_downsampled_", min(sample_features, feature_counts["UTR3"]))
colnames(kmer_usage)[colnames(kmer_usage) == "zscore_CDS_downsampled"] <- paste0("zscore_CDS_downsampled_", min(sample_features, feature_counts["CDS"]))
colnames(kmer_usage)[colnames(kmer_usage) == "zscore_5UTR_downsampled"] <- paste0("zscore_5UTR_downsampled_", min(sample_features, feature_counts["UTR5"]))
colnames(kmer_usage)[colnames(kmer_usage) == "zscore_Intron_downsampled"] <- paste0("zscore_Intron_downsampled_", min(sample_features, feature_counts["Intron"]))
}
kmer_usage %>%
select(kmer, starts_with("zscore")) %>%
arrange(desc(zscore_gene)) %>%
mutate(across(where(is.numeric), ~ if_else(is.infinite(.x), NA_real_, .x))) %>%
filter(!grepl("N", kmer)) %>%
write_tsv(paste0(output_dir, "/", dataset_name, "_", k, "mer_zscores.txt"))
print(paste("Finished analysis of", f))
}