In [1]:
import pandas as pd
import os, sys, re, glob
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random
from statannotations.Annotator import Annotator

In [None]:
# --- Configuration -----------------------------------------------------------
# k-mer length; it only selects which pair of count files is read below.
kmer=13

# Tab-separated count tables for coding (CDS) and non-coding regions.
c_file = f"mutations_{kmer}bp_nulls_rmsk_CDS_counts.bed"
ncs_file = f"mutations_{kmer}bp_nulls_rmsk_noncoding_counts.bed"

# Collects one density value per bootstrap replicate of the coding table.
c_std = []

# Number of resampling rounds performed for each table.
n_bootstraps = 10000

# Seed the global RNG so that the resampling (random.sample) — and therefore the
# figure and the reported significance — is identical on every Restart & Run All.
random_seed = 42
random.seed(random_seed)

# Bootstrap the mutational density (occurrences per kb) over the coding (CDS) table.
with open(c_file,'r') as c_in:
	# Read the whole table up front so the file handle is released before resampling.
	lines = list(csv.reader(c_in, delimiter='\t'))

# Bootstrap n_bootstraps times
for b in range(n_bootstraps):
	# random.sample draws 1000 distinct rows (no replacement); it raises ValueError
	# if the table holds fewer than 1000 rows.
	random_rows = random.sample(lines, 1000)
	# Interval length is end - start, the same formula the non-coding table uses.
	# The inputs are .bed files, presumed 0-based half-open, so the previous
	# "+ 1" over-counted every interval by one base and biased the coding density
	# downward relative to non-coding.
	bp = sum(int(row[2]) - int(row[1]) for row in random_rows)
	occ = sum(int(row[3]) for row in random_rows)
	# Occurrences per base pair, scaled by 1000 to give a per-kb density.
	density = 1000 * (occ/bp)
	c_std.append(density)

# Bootstrap the mutational density (occurrences per kb) over the non-coding table.
# nc_std receives one density value per replicate.
nc_std = []
with open(ncs_file,'r') as nc_in:
	# Materialise every row first; the resampling below only needs the in-memory list.
	lines = list(csv.reader(nc_in, delimiter='\t'))

# Bootstrap n_bootstraps times, 1000 rows per replicate (sampled without replacement).
for _ in range(n_bootstraps):
	picked = random.sample(lines, 1000)
	# Total span of the sampled intervals: end (4th... see columns) minus start, per row.
	# Column 2 is the start, column 3 the end, column 4 the occurrence count (0-based: 1, 2, 3).
	total_bp = sum(int(rec[2]) - int(rec[1]) for rec in picked)
	total_occ = sum(int(rec[3]) for rec in picked)
	# Occurrences per base pair, scaled by 1000 to give a per-kb density.
	nc_std.append(1000 * (total_occ / total_bp))
		
# Assemble the bootstrap replicates into one long-form table for plotting:
# one row per replicate, holding its density and the region class it came from.
nc_df = pd.DataFrame({"Density": nc_std, "type": "Non-Coding"})
c_df = pd.DataFrame({"Density": c_std, "type": "Coding"})
# Non-coding rows stay first, as before. ignore_index gives every row a unique
# label instead of repeating 0..n-1 once per class, which avoids duplicate-index
# pitfalls in downstream pandas operations.
df = pd.concat([nc_df, c_df], ignore_index=True)


# --- Figure: mean bootstrap density per region class, with significance bracket ---

# Bar colours keyed by the values of the "type" column.
my_pal = {"Coding": "#24b1d1", "Non-Coding": "#ae24d1"}
# The single pairwise comparison handed to the annotator.
pairs = [('Coding', 'Non-Coding')]

sns.set(style="whitegrid", palette="pastel")

fig, ax = plt.subplots()

# Bar height is the mean over replicates; ci="sd" draws the standard deviation as the error bar.
g = sns.barplot(
	data=df,
	x="type",
	y="Density",
	estimator=np.mean,
	ci="sd",
	capsize=.2,
	errwidth=0.8,
	palette=my_pal,
	ax=ax,
)

# NOTE(review): no plot= argument is passed, so the Annotator presumably falls back
# to its default plot type; fliersize/width/linewidth look like boxplot options —
# confirm this is what is wanted on top of a barplot.
# NOTE(review): the test compares the two sets of bootstrap replicates, so its
# sample size is the number of replicates per class rather than the number of
# intervals — confirm this is the intended significance test.
annotator = Annotator(
	ax, pairs,
	data=df, x="type", y="Density",
	fliersize=0, width=0.5, linewidth=1, palette=my_pal,
)
annotator.configure(
	test='Mann-Whitney',
	text_format='star',
	loc='outside',
	comparisons_correction="BH",
)
annotator.apply_and_annotate()

# The category names on the x axis are self-explanatory, so the axis title is blanked.
ax.set_xlabel("")
ax.set_ylabel(f"Mutational Density (per kB)")
fig.tight_layout()
plt.show()
plt.close()