In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import random
import math
import itertools

from scipy.stats import kstest
from scipy.stats import poisson
from scipy.stats import norm
from scipy.stats import t
from scipy.stats import chisquare

from statsmodels.stats.gof import gof_binning_discrete

from sklearn.preprocessing import StandardScaler
from sklearn.mixture import BayesianGaussianMixture
from sklearn import mixture


from scipy import linalg
import matplotlib as mpl
from collections import Counter

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot styling: matplotlib's 'ggplot' style sheet is applied first, then
# seaborn's 'whitegrid' style is set afterwards.
plt.style.use('ggplot')
sns.set_style('whitegrid')

In [None]:
# Width used to split the rows of each cell into segments (same unit as START).
segment_size = 5_000_000

# Load the calls table, turn the 'chr<N>' labels into numbers so chromosomes
# sort numerically, and order the rows by cell, chromosome and start position.
calls = (
    pd.read_csv('DataFrames/calls.tsv', sep='\t')
    .assign(**{'#CHR': lambda frame: pd.to_numeric(frame['#CHR'].str.replace('chr', ''))})
    .sort_values(['CELL', '#CHR', 'START'])
    .reset_index(drop=True)
)

bin_starts = calls['START']

# Running segment id: the counter advances on every row whose START is a
# positive multiple of segment_size. Rows with START == 0 do not advance it.
opens_new_segment = (bin_starts > 0) & (bin_starts % segment_size == 0)
calls['SEGMENT'] = opens_new_segment.cumsum()

# Running sum of START over the whole sorted table.
calls['TOTAL'] = bin_starts.cumsum()
calls


In [None]:
def spikiness(series):
    """Return the summed absolute step between neighbours, relative to the total.

    Computes ``sum(|x[i+1] - x[i]|) / sum(x)`` over the values of ``series``.

    Parameters
    ----------
    series : array-like
        One-dimensional numeric data (pandas Series, NumPy array, list, ...).

    Returns
    -------
    float
        The ratio described above, or ``np.nan`` when the values sum to zero
        (which includes empty input), since the ratio is undefined there.
    """
    # Work on a float array: this accepts any array-like input and avoids
    # wrap-around when differencing unsigned integer data.
    values = np.asarray(series, dtype=float)
    total = values.sum()
    if total == 0:
        return np.nan
    return np.abs(np.diff(values)).sum() / total

In [None]:
# For every (cell, segment) group, compute the variance and the spikiness of
# both the COUNT and the RDR columns; groups keep their order of appearance.
sp_df = (
    calls
    .groupby(['CELL', 'SEGMENT'], sort=False, as_index=False)
    .agg({signal: ['var', spikiness] for signal in ('COUNT', 'RDR')})
)

In [None]:
# replace() and dropna() return new frames (inplace defaults to False), so the
# result has to be bound back to sp_df; otherwise the inf/NaN rows are kept.
sp_df = sp_df.replace([np.inf, -np.inf], np.nan).dropna()

# Flatten the two-level column header produced by the multi-function agg into
# single names such as 'COUNT var' / 'COUNT spikiness'. The MultiIndex check
# keeps this cell safe to re-run: joining already-flat string names would
# insert a space between every character.
if isinstance(sp_df.columns, pd.MultiIndex):
    sp_df.columns = sp_df.columns.map(' '.join).str.strip()
sp_df

In [None]:
# Collapse the per-segment statistics to one row per cell by taking the median
# of every column; cells keep their order of first appearance.
per_cell_groups = sp_df.groupby('CELL', sort=False, as_index=False)
spdf = per_cell_groups.median()
spdf

In [None]:
# Manual spot check for a single cell: median of its per-segment COUNT
# spikiness. nanmedian is used because np.median returns NaN as soon as one
# segment value is NaN, whereas the pandas group median skips NaN values.
ce = sp_df[sp_df['CELL'] == 'AAACCTGCAGGACCAA']
np.nanmedian(ce['COUNT spikiness'].values)

In [None]:
# Sort the cells by 'RDR T Score' in descending order; the first row is used
# as the "best" cell and the last row as the "worst" cell.
count_number_df = (
    pd.read_csv('DataFrames/Combined_pvalues.csv', index_col=[0])
    .sort_values('RDR T Score', ascending=False)
)
best_cell, worst_cell = count_number_df['CELL'].iloc[0], count_number_df['CELL'].iloc[-1]

# Data-space y coordinate of the chromosome labels (below the y=0 axis floor).
y_height = -12
gs_kw = dict(width_ratios=[2] * 1, height_ratios=[1] * 2)
fig, ax = plt.subplots(2, 1, figsize=(16, 10), sharex=True, gridspec_kw=gs_kw, tight_layout=True)

for num, cell in enumerate([best_cell, worst_cell]):
    plot = calls[calls['CELL'] == cell].copy()
    # x coordinate: running sum of START within this cell only.
    plot['TOTAL'] = plot['START'].cumsum()

    # Last row of every chromosome gives the x position where it ends.
    chromosome_ends = plot.drop_duplicates('#CHR', keep='last').reset_index(drop=True)
    end_of_chromosomes = chromosome_ends['TOTAL'].values
    # Midpoint between consecutive chromosome ends (starting from 0) is where
    # the chromosome label is centred.
    _positions = np.insert(end_of_chromosomes, 0, 0)
    positions = (_positions[1:] + _positions[:-1]) / 2

    s = sns.scatterplot(data=plot, x='TOTAL', y='COUNT', hue='#CHR', legend=False, palette='Dark2', ax=ax[num])
    s.set_title('BEST CELL' if num == 0 else 'WORST CELL')
    # NOTE(review): the axes share x, so the x-limits set for the second
    # panel presumably apply to both panels - confirm this is intended.
    s.axis((0, plot['TOTAL'].max(), 0, 150))
    s.set_xticks([])
    s.set_yticks([])
    s.set_xlabel('')
    s.set_ylabel('COUNT')

    if num == 1:
        # Label only the bottom panel. The label text comes from the actual
        # '#CHR' value rather than the loop position, so it stays correct
        # when a chromosome is absent from this cell.
        for chromosome, position in zip(chromosome_ends['#CHR'].values, positions):
            s.text(position, y_height, f'Chr{int(chromosome)}',
                   rotation='vertical', transform=s.transData)

    # Thin vertical separators at every chromosome boundary.
    for boundary in end_of_chromosomes:
        s.axvline(boundary, color='black', alpha=1, lw=0.5)

fig.savefig('Thesis Figures/T_count_example.pdf', format='pdf')


In [None]:
# Display the identifier of the cell selected as best_cell (first row after
# sorting by 'RDR T Score' in descending order).
best_cell

In [None]:
# Variance of the RDR values over all rows belonging to the worst cell.
calls.loc[calls['CELL'] == worst_cell, 'RDR'].var()