In [None]:
from scipy.stats import norm
import pandas as pd 
import numpy as np 
import seaborn as sns 
import math
import random
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture
from sklearn.covariance import EmpiricalCovariance
from sklearn.covariance import EllipticEnvelope
from sklearn.covariance import LedoitWolf
from sklearn.covariance import MinCovDet

from statsmodels.graphics.gofplots import qqplot
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.graphics.gofplots import qqplot_2samples

from scipy.stats import poisson
from scipy.stats import norm

# Notebook-wide plot styling: select the matplotlib 'ggplot' style sheet first,
# then let seaborn apply its 'whitegrid' axes style on top of it.
plt.style.use('ggplot')
sns.set_style('whitegrid')
# Reminder of palette names; several cells below pass palette='Dark2'.
# NOTE(review): as the last expression of the cell, this bare string is
# presumably echoed as the cell's output when the cell is run.
'''
some good color maps:
 - CMRmap
 - Dark2
 - Paired
'''

In [None]:
'''
Variance as measure of dispersion
'''
number_values = 50
variances = [1, 10, 50, 200]
label_height_adjust, label_width_adjust = 3, -5


def values(variance):
    """Draw `number_values` zero-mean normal samples with the given variance (fixed seed 42)."""
    return norm.rvs(0, math.sqrt(variance), size=number_values, random_state=42)


def dffunc(values):
    """Stack a list of sample arrays into one long, single-column DataFrame."""
    one_row_per_sample_set = pd.DataFrame({'values': values})
    return one_row_per_sample_set.explode('values').reset_index(drop=True)


df = dffunc([values(variance) for variance in variances])
# Segment id used for colouring: increases by one each time the row index
# reaches a positive multiple of number_values.
starts_new_segment = (df.index.values % number_values == 0) & (df.index.values > 0)
df['Color'] = starts_new_segment.cumsum()

# Left panel: scatter of all segments; right panel: per-segment KDE sharing the seam.
grid_kw = {'wspace': 0}
fig, ax = plt.subplots(1, 2, figsize=(16, 10), gridspec_kw=grid_kw)
sc = sns.scatterplot(data=df, x=df.index, y='values', hue='Color', palette='Dark2', legend=False, ax=ax[0])
kde = sns.kdeplot(data=df.astype(float), y='values', hue='Color', palette='Dark2', legend=False, ax=ax[1])
kde.axis('off')
sc.axis((0, len(df), None, None))
sc.set_xlabel('Segment examples')
sc.set_xticks([])
sc.set_yticks([])
sc.set_ylabel('')

# One thin vertical rule at the start of every segment, plus the zero line.
vlines = [sc.axvline(segment_start, color='black', alpha=1, lw=0.5)
          for segment_start in range(0, len(df), number_values)]
hline = sc.axhline(0, color='black')

# Label each segment with its variance, placed just above the tallest point.
label_y = df['values'].max() + label_height_adjust
label_anchors = range(int(number_values / 3), len(df), number_values)
labels = [sc.text(anchor + label_width_adjust, label_y, s=f'Variance: {variance}')
          for anchor, variance in zip(label_anchors, variances)]
fig.savefig('Thesis Figures/Variance_demo.pdf', format='pdf')

In [None]:
'''
Combining 20kb into 5MB
'''
segment_size = 5000000
calls_data = pd.read_csv('DataFrames/calls.tsv', sep='\t')
# Strip the literal 'chr' prefix so chromosomes sort numerically instead of lexically.
# NOTE(review): pd.to_numeric will raise on non-numeric names (e.g. 'chrX') - confirm
# the input only contains numbered chromosomes.
calls_data['#CHR'] = pd.to_numeric(calls_data['#CHR'].str.replace('chr', '', regex=False))
calls_data = calls_data.sort_values(['CELL', '#CHR', 'START']).reset_index(drop=True)
# Running segment id: increments on every row whose START is a positive multiple of
# segment_size. Rows with START == 0 do not increment it, so the id alone does not
# separate chromosomes/cells; later cells group by CELL and #CHR as well.
calls_data['SEGMENT'] = ((calls_data['START'] > 0) & (calls_data['START'] % segment_size == 0)).cumsum()
calls_data = calls_data[['#CHR', 'CELL', 'START', 'END', 'COUNT', 'RDR', 'SEGMENT']]

# Render the first 25 rows as a table figure.
preview = calls_data[:25]
fig, ax = plt.subplots(figsize=(12,4))
ax.axis('tight')
ax.axis('off')
# One header colour per actual column (was a hard-coded 14 for a 7-column table).
the_table = ax.table(cellText=preview.values, colLabels=preview.columns, loc='center',
                     colColours=['lightgray'] * len(preview.columns))
# The context manager closes the PDF even if savefig raises.
with PdfPages("Thesis Figures/Calls_segment.pdf") as pp:
    pp.savefig(fig, bbox_inches='tight')

In [None]:
'''
Random Cell plot
'''
y_height = -6

# Rows of the one selected cell, re-indexed from zero.
is_selected_cell = calls_data['CELL'] == 'TCAGGATAGACCACGA'
random_cell_df = calls_data[is_selected_cell].copy().reset_index(drop=True)
# x coordinate used for plotting: running sum of the START column.
random_cell_df['TOTAL'] = random_cell_df['START'].cumsum()

# The last row of each chromosome marks where that chromosome ends on the x axis;
# label positions are the midpoints between consecutive chromosome ends.
last_row_per_chromosome = random_cell_df.drop_duplicates('#CHR', keep='last').reset_index(drop=True)
end_of_chromosomes = last_row_per_chromosome['TOTAL'].values
_positions = np.insert(end_of_chromosomes, 0, 0)
positions = (_positions[1:] + _positions[:-1]) / 2

fig, ax = plt.subplots(figsize=(30, 8))
sc = sns.scatterplot(data=random_cell_df, x='TOTAL', y='COUNT', hue='#CHR', palette='Dark2', legend=False, ax=ax)
sc.axis((0, random_cell_df['TOTAL'].max(), 0, 100))
sc.set_xticks([])
sc.set_yticks([])
sc.set_xlabel('')
sc.set_ylabel('COUNT')

vlines = [sc.axvline(chromosome_end, color='black', alpha=1, lw=0.5)
          for chromosome_end in end_of_chromosomes]
# Labels are numbered 1..N in order of appearance and drawn below the x axis (y_height < 0).
chr_labels = [sc.text(midpoint, y_height, f'Chr{chromosome_number}',
                      rotation='vertical', transform=sc.transData)
              for chromosome_number, midpoint in enumerate(positions, start=1)]

fig.savefig('Thesis Figures/cell.pdf', format='pdf')

In [None]:
'''
Dispersion for counts and RDR (Poisson plot)
'''
# Per (cell, chromosome, segment) mean and variance of COUNT and RDR.
dispersion_df = calls_data.groupby(['CELL', '#CHR', 'SEGMENT'], sort=False, as_index=False).agg(
    {'COUNT': ['mean', 'var'], 'RDR': ['mean', 'var']})
# Flatten the two-level column index into names such as 'COUNT mean' / 'RDR var'.
dispersion_df.columns = dispersion_df.columns.map(' '.join).str.strip()
dispersion_df = dispersion_df.replace([np.inf, -np.inf], np.nan).dropna()
# Cap the sample at the number of available rows (DataFrame.sample raises when asked
# for more rows than exist) and seed it so the figure is reproducible.
sample_dispersion = dispersion_df.sample(min(50000, len(dispersion_df)), random_state=42)

fig, ax = plt.subplots(1, 2, tight_layout=True)
for name, color, panel in zip(['COUNT', 'RDR'], ['black', 'orange'], ax):
    sc = sns.scatterplot(data=sample_dispersion, x=f'{name} mean', y=f'{name} var', color=color, ax=panel)
    # The y axis is capped at the largest mean, so both axes span the same range.
    sc.axis((0, None, 0, sample_dispersion[f'{name} mean'].max()))
    # Identity line (variance == mean) as the reference.
    line = sns.lineplot(data=sample_dispersion, x=f'{name} mean', y=f'{name} mean', ax=panel)
# Directory name now matches the 'Thesis Figures' folder used by every other cell
# (the lowercase 'figures' variant is a different path on case-sensitive filesystems).
fig.savefig('Thesis Figures/overdispersion.pdf', format='pdf')

In [None]:
# Sum COUNT and RDR per (cell, chromosome), then lay the sums out as one row per cell.
heatmap_df = calls_data.groupby(['CELL', '#CHR'], sort=False, as_index=False).agg({'COUNT': 'sum', 'RDR': 'sum'})
heatmap_dict_count, heatmap_dict_rdr = {}, {}
for cell, sub in heatmap_df.groupby('CELL', sort=False, as_index=False):
    # Each value is a one-element list so concatenation below stacks cells as rows.
    heatmap_dict_count[cell] = [sub['COUNT'].to_numpy()]
    heatmap_dict_rdr[cell] = [sub['RDR'].to_numpy()]

count_array = np.concatenate(list(heatmap_dict_count.values()))
rdr_array = np.concatenate(list(heatmap_dict_rdr.values()))


fig, ax = plt.subplots(1, 2, tight_layout=True, figsize=(16, 10), sharey=True)
count_panel, rdr_panel = ax
c = sns.heatmap(count_array, xticklabels=1, yticklabels=False, cmap='Oranges', ax=count_panel)
r = sns.heatmap(rdr_array, xticklabels=1, yticklabels=False, cmap='Blues', ax=rdr_panel)

c.set_title('Count heatmap')
c.set_ylabel('Cell')
r.set_title('RDR heatmap')

fig.savefig('Thesis Figures/Heatmaps.pdf', format='pdf')

In [None]:
'''
RESULT - PERFORMANCE
'''
combined_df = pd.read_csv('DataFrames/Combined_pvalues.csv', index_col=[0])
methods = combined_df.columns.drop(['CELL', 'COUNT Variance', 'RDR Variance']).sort_values()

grid_kw, sub_kw = {'wspace': 0.4, 'hspace': 0.2}, {}
fig, ax = plt.subplots(4, 4, figsize=(30, 24), gridspec_kw=grid_kw, subplot_kw=sub_kw)

# Layout: the first four sorted methods fill columns 0 (histogram) and 1 (scatter),
# the remaining ones fill columns 2 and 3, one method per row.
hist_plots = []
for index, method in enumerate(methods):
    in_first_group = index <= 3
    hist_panel = ax[index, 0] if in_first_group else ax[index - 4, 2]
    hist_plots.append(sns.histplot(data=combined_df, x=method, bins=50,
                                   color='red' if in_first_group else 'black', ax=hist_panel))

sc_plots = []
for index, method in enumerate(methods):
    in_first_group = index <= 3
    scatter_panel = ax[index, 1] if in_first_group else ax[index - 4, 3]
    sc_plots.append(sns.scatterplot(data=combined_df, y=method,
                                    x='COUNT Variance' if in_first_group else 'RDR Variance',
                                    color='blue' if in_first_group else 'orange', ax=scatter_panel))

# Same fixed y range (0..35) for every histogram panel.
histogram_panels = [row[0] for row in ax] + [row[2] for row in ax]
limits = [panel.axis((0, None, 0, 35)) for panel in histogram_panels]

fig.savefig('Thesis Figures/Performance.pdf', format='pdf')

In [None]:
'''
performance against total counts
'''
count_number_df = pd.read_csv('DataFrames/Combined_pvalues.csv', index_col=[0])
total_counts = calls_data.groupby('CELL', sort=False, as_index=False).agg({'COUNT':'sum', 'RDR': 'sum'})
# Attach totals by CELL barcode rather than by index alignment: count_number_df is
# indexed by whatever the CSV's first column holds, while total_counts carries a fresh
# 0..n-1 index in calls_data order, so index-based assignment only gives the right
# cell's total when both happen to share the same ordering.
totals_by_cell = total_counts.set_index('CELL')
count_number_df['TOTAL COUNT'] = count_number_df['CELL'].map(totals_by_cell['COUNT'])
count_number_df['TOTAL RDR'] = count_number_df['CELL'].map(totals_by_cell['RDR'])



def _plot(axis, method):
    """Scatter one method column against 'TOTAL COUNT' on panel `axis`.

    Reads the notebook-level `count_number_df` and draws on `axs[axis]`, so both
    must exist when this is called. Ticks and the y label are removed and the
    method name becomes the panel title. Returns None.
    """
    panel = sns.scatterplot(data=count_number_df, x='TOTAL COUNT', y=method, color='blue', ax=axs[axis])
    panel.set_ylim()
    panel.set_title(method)
    panel.set_ylabel('')
    panel.set_yticks([])
    panel.set_xticks([])


# Keep only the score columns to plot: metadata, totals and the listed RDR scores are dropped.
excluded_columns = [
    'CELL', 'COUNT Variance', 'RDR Variance', 'TOTAL COUNT', 'TOTAL RDR',
    'RDR Distance metric', 'RDR Poisson Score', 'RDR Spikiness', 'RDR T Score',
]
methods = count_number_df.columns.drop(excluded_columns).sort_values()
# One panel per remaining method, laid out in a single row.
fig, axs = plt.subplots(1, len(methods), figsize=(16, 6), tight_layout=True)
grphs = [_plot(panel_index, method) for panel_index, method in enumerate(methods)]
fig.savefig('Thesis Figures/Performance_against_total_count.pdf', format='pdf')

In [None]:
'''
CELL plots for good and bad for each method
'''
plotting_df = combined_df.copy()

# For every method column: [cell with the highest score, cell with the lowest score].
good_bad_dict = {}
for score_column in plotting_df.columns.drop(['CELL', 'COUNT Variance', 'RDR Variance']).sort_values():
    cells_by_score = plotting_df.sort_values(score_column, ascending=False)['CELL'].values
    good_bad_dict[score_column] = [cells_by_score[0], cells_by_score[-1]]

gs_kw = dict(width_ratios=[2] * 2, height_ratios=[1] * 8)
figs, axes = plt.subplots(8, 2, constrained_layout=True, figsize=(70, 50), gridspec_kw=gs_kw, sharex='col', sharey='col')
figs.suptitle('Method Performance', fontsize='xx-large')
figs.set_constrained_layout_pads(w_pad=25/72, h_pad=25/72, hspace=0.05, wspace=0.05)

# rnum counts plotted cells overall; the first four methods go down column 0 (COUNT),
# later methods go down column 1 (RDR), two rows (best, worst) per method.
rnum = -1
for num, (method, cells) in enumerate(good_bad_dict.items()):
    in_first_group = num <= 3
    value_column = 'COUNT' if in_first_group else 'RDR'
    variance_column = 'COUNT Variance' if in_first_group else 'RDR Variance'
    for cell in cells:
        rnum += 1
        cell_df = calls_data[calls_data['CELL'] == cell].copy()
        cell_df['Position'] = cell_df['START'].cumsum()
        panel = axes[rnum, 0] if in_first_group else axes[rnum - 8, 1]
        s = sns.scatterplot(data=cell_df, x='Position', y=value_column,
                            color='orange' if in_first_group else 'black', legend=False, ax=panel)
        rank_label = 'BEST' if rnum % 2 == 0 else 'WORST'
        s.set_title(f'{rank_label} - method: {method}')
        s.set_xlim(0, cell_df['Position'].max())
        s.set_ylim((0, 150) if in_first_group else (None, None))
        # Annotate the panel with the cell's variance taken from plotting_df.
        s.text(0.8, 1.1, f'Variance: {plotting_df[plotting_df["CELL"] == cell][variance_column].values}',
               bbox=dict(facecolor='red', alpha=0.5), transform=s.transAxes)
        # Vertical rule at the last row of each chromosome.
        vlines = [s.axvline(chromosome_end)
                  for chromosome_end in cell_df.drop_duplicates('#CHR', keep='last')['Position']]

figs.savefig('Thesis Figures/CELLS_from_method.pdf', format='pdf')

In [None]:
'''
T distribution good and bad cells
'''
# Rank cells by 'COUNT T Score' (highest first) and pick the two extremes.
count_number_df = pd.read_csv('DataFrames/Combined_pvalues.csv', index_col=[0]).sort_values('COUNT T Score', ascending=False)
best_cell, worst_cell = count_number_df['CELL'].head(1).values[0], count_number_df['CELL'].tail(1).values[0]

fig, ax = plt.subplots(2,1)
for num,cell in enumerate([best_cell, worst_cell]):
    plot = calls_data[calls_data['CELL'] == cell].copy()
    # calls_data carries no 'TOTAL' column (it was only ever added to the per-cell
    # copy in the random-cell plot), so build the same running-sum x coordinate here;
    # without it seaborn cannot resolve x='TOTAL' and raises.
    plot['TOTAL'] = plot['START'].cumsum()
    sns.scatterplot(data=plot, x='TOTAL', y='COUNT', ax=ax[num])

In [None]:
'''
Variance figure
'''
"""
----Methods----
Oftentimes covariance is calculated instead of variance, but since VAR(X) = COVAR(X,X), for univariate uniform Gaussians
the covariance is the variance.
Most of these rely on covariance calculators from SKlearn.
'ML'         - is a maximum likelihood method.
'GMM'        - is a Gaussian mixture model. The default assumed normals is one; using anything more than this is most likely
               pointless as all the observations come from a uniform normal distribution. Better to test this on an actual
               sample of counts across the cell?
'BGMM'       - is a Bayesian Gaussian mixture model. Again the same problem applies.
'EC'         - is the empirical covariance. This is also a maximum likelihood estimate.
'EE'         - is the elliptic envelope, useful for detecting outliers, but included anyway.
'LEDOIT WOLF'- calculates estimates using shrinkage.
'MINCOVDET'  - is minimum covariance determinant. Good robust estimator of covariance; uses empirical covariance.
There are a lot of hyperparams that  can be tweaked.
"""
true_mean, true_std = 1, 0.4
observation_counts = [10, 100, 1000, 10000]
n_repeats = 1000

# Seed NumPy's global generator so the simulated data are reproducible; scikit-learn
# estimators created with random_state=None draw from this same global state.
np.random.seed(42)


def _fit_mixture(model, data):
    """Fit a one-component mixture model and return (mean, standard deviation)."""
    fitted = model.fit(data.reshape(-1, 1))
    # covariances_ is (components, features, features) for the default 'full' type;
    # index down to the scalar instead of passing a size-1 array to math.sqrt.
    return fitted.means_[0, 0], math.sqrt(fitted.covariances_[0, 0, 0])


def _fit_covariance(estimator, data):
    """Fit a scikit-learn covariance estimator and return (location, standard deviation)."""
    fitted = estimator.fit(data.reshape(-1, 1))
    # covariance_ is a 1x1 matrix here; take the scalar entry explicitly.
    return fitted.location_[0], math.sqrt(fitted.covariance_[0, 0])


# Method name -> callable returning (mu, sigma) for a 1-D sample. A fresh estimator is
# built per call, and an unknown name fails with a KeyError instead of silently
# reusing the previous iteration's mu/sigma.
estimators = {
    'ML': lambda data: norm.fit(data),
    'GMM': lambda data: _fit_mixture(GaussianMixture(), data),
    'BGMM': lambda data: _fit_mixture(BayesianGaussianMixture(), data),
    'EC': lambda data: _fit_covariance(EmpiricalCovariance(), data),
    'EE': lambda data: _fit_covariance(EllipticEnvelope(), data),
    'LEDOIT WOLF': lambda data: _fit_covariance(LedoitWolf(), data),
    'MINCOVDET': lambda data: _fit_covariance(MinCovDet(), data),
}
methods = list(estimators)

errors = []
for method in methods:
    estimate = estimators[method]
    for observations in observation_counts:
        for _ in range(n_repeats):
            Data = np.random.normal(true_mean, true_std, observations)
            mu, sigma = estimate(Data)
            # Relative absolute errors with respect to the true parameters.
            mean_error = abs((true_mean - mu) / true_mean)
            std_error = abs((true_std - sigma) / true_std)
            errors.append({'Mean error': mean_error, 'Standard deviation error': std_error, 'Observations':observations, 'Method': method})
df = pd.DataFrame(errors)

# One box-plot panel per sample size, sharing the y axis so panels are comparable.
fig, axs = plt.subplots(1, len(observation_counts), figsize=(25,20), tight_layout=True, sharey=True)
for number, value in enumerate(observation_counts):
    sns.boxplot(data=df[df['Observations'] == value], x='Observations', y='Standard deviation error', hue='Method', palette='Set1', ax=axs[number])
    axs[number].legend(loc=2, fontsize='x-small', ncol=2, title='Legend (method used)')

fig.savefig('Thesis Figures/Variance experiments.pdf', format='pdf')

In [None]:
'''
example qqplot with poisson
'''
fig, ax = plt.subplots(4, 3, tight_layout=True, sharey='row', sharex='col', figsize=(16,10))

# One row per variance: raw draws, their histogram, and a QQ plot against Poisson(60).
for num, variance in enumerate([10, 60, 150, 250]):
    scatter_panel, hist_panel, qq_panel = ax[num, 0], ax[num, 1], ax[num, 2]

    # Normal draws around 60 truncated to integers; the Poisson reference uses the same seed.
    norm_values = norm.rvs(loc=60, scale=math.sqrt(variance), size=250, random_state=113).astype(int)
    pois_values = poisson.rvs(mu=60, size=250, random_state=113).astype(int)

    scat = sns.scatterplot(x=np.arange(len(norm_values)), y=norm_values, ax=scatter_panel)
    hist = sns.histplot(norm_values, ax=hist_panel)
    qq = qqplot_2samples(ProbPlot(norm_values), ProbPlot(pois_values), line='45', ax=qq_panel)

    scatter_panel.set_title(f'Variance = {variance}, mean = 60')
    # Clear the QQ panel's title and axis labels.
    qq_panel.set_title('')
    qq_panel.set_xlabel('')
    qq_panel.set_ylabel('')
    hist_panel.set_xlim(left=20, right=105)
    qq_panel.set_xlim(left=40, right=85)

fig.savefig('Thesis Figures/poisson_qq.pdf', format='pdf')