# Notebook to plot correlation between technical replicates for each condition

## Import libraries

In [1]:
import pandas as pd
import re
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['svg.fonttype'] = 'none'

## Specify paths

In [2]:
aggdata_outpath = '../aggregated_data/'
diagnostics_graph_outpath = '../diagnostics_graphs/'

## Reshaping

In [3]:
master = pd.read_csv(f'{aggdata_outpath}/master.csv', index_col=0)
p = re.compile(r'selcoeff_[1-3]')
replicates = [x for x in master.columns if bool(p.fullmatch(x))]
replicates

['selcoeff_1', 'selcoeff_2', 'selcoeff_3']

In [4]:
rep_combinations = list(itertools.combinations(replicates, 2))
rep_combinations

[('selcoeff_1', 'selcoeff_2'),
 ('selcoeff_1', 'selcoeff_3'),
 ('selcoeff_2', 'selcoeff_3')]

In [5]:
NT = master[master.pool_type == 'single'
           ].groupby(['strain','locus','pool_type','compound','nt_seq']
                    )[replicates].first().reset_index()
NT

Unnamed: 0,strain,locus,pool_type,compound,nt_seq,selcoeff_1,selcoeff_2,selcoeff_3
0,BY4741,FKS1-HS1,single,anidulafungin,aagttagttttatctttgagagatcca,2.097608,2.009412,
1,BY4741,FKS1-HS1,single,anidulafungin,aatttagttttatctttgagagatcca,1.439980,1.243542,
2,BY4741,FKS1-HS1,single,anidulafungin,acgttagttttatctttgagagatcca,1.621938,1.475230,
3,BY4741,FKS1-HS1,single,anidulafungin,actttagttttatctttgagagatcca,2.127046,2.006135,
4,BY4741,FKS1-HS1,single,anidulafungin,aggttagttttatctttgagagatcca,2.322987,1.999431,
...,...,...,...,...,...,...,...,...
7628,R1158,FKS2-HS2,single,none,tcttggttgaaaagatgtgttatt,0.094955,-0.216749,-0.016723
7629,R1158,FKS2-HS2,single,none,tggtgggtgagacgttatacactc,-0.040170,-0.040532,-0.014094
7630,R1158,FKS2-HS2,single,none,ttgtggattaaaagaactattatt,-0.014166,0.001832,0.005281
7631,R1158,FKS2-HS2,single,none,ttgtgggtgagacgttatacactc,-0.030758,0.025455,-0.010405


In [6]:
gby = NT.groupby(['strain','locus','compound'])[replicates].nunique()
gby

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,selcoeff_1,selcoeff_2,selcoeff_3
strain,locus,compound,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BY4741,FKS1-HS1,anidulafungin,307,305,0
BY4741,FKS1-HS1,caspofungin,306,302,0
BY4741,FKS1-HS1,micafungin,310,303,0
BY4741,FKS1-HS1,none,309,309,0
BY4741,FKS1-HS2,anidulafungin,212,210,207
BY4741,FKS1-HS2,caspofungin,207,204,204
BY4741,FKS1-HS2,micafungin,209,202,204
BY4741,FKS1-HS2,none,213,214,207
BY4741,FKS2-HS1,none,287,288,287
BY4741,FKS2-HS2,none,253,254,254


In [7]:
def rep_pairwise(df,s,l,c,rc):
    sns.set(rc = {
              'font.family':'Arial',
              'font.size':8,
              'legend.title_fontsize':8, 'legend.fontsize':8,
              'axes.labelsize':8,'axes.titlesize':8,
              'xtick.labelsize':8, 'ytick.labelsize':8,
              'xtick.major.pad':2, 'ytick.major.pad':2,
              'xtick.bottom': True, 'ytick.left': True,
              'xtick.major.size':2, 'ytick.major.size':2,
             },
       style='ticks')
    
    graphdf = df[(df.strain == s)
                 & (df.locus == l)
                 & (df.compound == c)
                ]
    
    # Get Pearson correlation coefficient
    from scipy import stats

    pearson_dict = {}
    for x in rc:
        if (graphdf[x[0]].nunique() > 0) & (graphdf[x[1]].nunique() > 0):
            pearson_dict[x] = stats.pearsonr(graphdf[x[0]], graphdf[x[1]])[0]
    
    # Draw plot
    graphdf.rename(columns={'selcoeff_1':'s (Rep. 1)',
                            'selcoeff_2':'s (Rep. 2)',
                            'selcoeff_3':'s (Rep. 3)'
                           }, inplace=True)
    
    fig = sns.pairplot(graphdf, diag_kind='kde', corner=True,
                       plot_kws={'color':'k', 'alpha':.2},
                       diag_kws={'color':'k'},
                       height=1)
    
    # axes[i][j] corresponds to (replicate j+1, replicate i+1)
    # (replicate x, replicate y) corresponds to axes[y-1][x-1]
    for k,v in pearson_dict.items():
        fig.axes[int(k[1][-1]) - 1
                ][int(k[0][-1]) -1
                 ].annotate(f'{v:.2}',(0,0), ha='center', va='center',
                            bbox=dict(facecolor='white', alpha=.8,
                                      edgecolor='black', boxstyle='round,pad=.5')
                           )
    plt.annotate(f'{s}\n{l}\n{c.title()}', (.75,.5), xycoords='figure fraction')
    
    plt.savefig(f'{diagnostics_graph_outpath}/{s}_{l}_{c}_Pearson.svg', format='svg', dpi=300)
    plt.close()
    return

In [8]:
for g in gby.index:
    rep_pairwise(NT, g[0], g[1], g[2], rep_combinations)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  graphdf.rename(columns={'selcoeff_1':'s (Rep. 1)',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  graphdf.rename(columns={'selcoeff_1':'s (Rep. 1)',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  graphdf.rename(columns={'selcoeff_1':'s (Rep. 1)',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  graphdf.rename(