# The Full Pipeline

**Inputs**: 
- config file
- image files
- well mask file

**Outputs**:
- droplets DataFrame (csv) 
- wells DataFrame (csv)
- condensed DataFrame (csv)
- jupyter notebook pre-loaded with basic quality control plots

We have built all the necessary parts in other Python notebooks and moved the code into the kchip_v0 package. Now we implement each step.

#### Step 1: Create droplets DataFrame
The droplets DataFrame contains information on all droplets in the pre-merge image set. We need to compute and store:
- the RGB information of each droplet
- the cluster and cluster label of each droplet
- the location of each droplet
- the well ID of each droplet

All of these steps can be found in the "Putting it together" notebook. 

#### Step 2: Create wells DataFrame
- Loop through post-merge images and identify wells
- Map post-merge wells to pre-merge wells
- Condense outputs to final dataframe

All of these steps can be found in the "Registration" and "Final Outputs" notebooks.


### Step 0: Imports and config

In [None]:
# basic imports 
import yaml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Append top level directory with kchip package
import sys
sys.path.append('../')

# kchip imports
import kchip.io as kchip_io
import kchip.analyze as kchip_analyze

# Other
%matplotlib inline

# Read in config file
# safe_load builds only plain YAML types (mappings, sequences, scalars); a bare
# yaml.load() can instantiate arbitrary Python objects from the file and is
# rejected by current PyYAML releases when no Loader is given.
with open('config.yml', 'r') as ymlfile:
    config = yaml.safe_load(ymlfile)

# Echo the parsed configuration so the parameters of this run appear in the output.
# A single parenthesized argument prints identically on Python 2 and Python 3.
print(yaml.dump(config))

In [None]:
# Global matplotlib styling applied to every figure drawn after this cell.
fontsize = 14

plt.rcParams.update({
    # Axes frame: hide the top/right spines and draw the remaining ones thick
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.linewidth': 3,

    # One shared size for labels, titles, tick labels and default text
    'axes.labelsize': fontsize,
    'axes.titlesize': fontsize,
    'xtick.labelsize': fontsize,
    'ytick.labelsize': fontsize,
    'font.size': fontsize,

    # Typeface
    'font.family': 'sans-serif',
    'font.sans-serif': ['Helvetica'],

    # Lines, ticks and contours
    'lines.linewidth': 2,
    'xtick.major.width': 1.5,
    'ytick.major.width': 1.5,
    'contour.negative_linestyle': 'solid',

    # Saved figures: tight bounding box; fonttype 42 embeds TrueType fonts in PDFs
    'savefig.bbox': 'Tight',
    'pdf.fonttype': 42,
})

### Step 1: Create droplets DataFrame
- Initialize from images
- Identify droplets in the same well from fit to masks
- Cluster
- Map a priori labels to clusters

In [None]:
# Initialize droplets DataFrame from images
droplets, rotation_theta = kchip_analyze.initialize_droplets(config)

# Report the rotation converted from radians to degrees.
# The message is formatted into one string so that print(...) emits the same
# text on Python 2 and Python 3 (a multi-argument print(...) would print a
# tuple on Python 2).
print('{} {}'.format('Rotation (degrees): ', rotation_theta*180/np.pi))

# Identify droplets in the same well from fit to masks
droplets = kchip_analyze.fit_droplets_to_mask(config, droplets, rotation_theta)

# Cluster droplets and map a priori labels to clusters
droplets, centroids = kchip_analyze.identify_clusters(config, droplets, show=1)

# One row per droplet, so the row count is the number of droplets found
print('{} {}'.format('Total droplets identified: ', droplets.shape[0]))

### Step 2: Create wells DataFrame
- Create pre-merge wells DataFrame
- Loop through post-merge images and identify wells
- Map post-merge wells to pre-merge wells

In [None]:
# Identify premerge wells: one row per (IndexX, IndexY, Well_ID) group holding
# the mean ImageX, ImageY and Edge of the droplets in that group
well_keys = ['IndexX', 'IndexY', 'Well_ID']
pre_wells = droplets.groupby(well_keys, as_index=False)[['ImageX', 'ImageY', 'Edge']].mean()

# List of timepoints ('t0', 't1', 't2')
timepoints = ['t' + str(i) for i in range(3)]

# Analyze data for each timepoint
pre_post_all = []
for timepoint in timepoints:
    # Formatted into one string so print(...) emits the same text on Python 2 and 3
    print('{} {}'.format('Now analyzing timepoint: ', timepoint))

    # Identify postmerge wells and map to pre-merge wells
    mapped = kchip_analyze.map_pre_to_post(config, timepoint, pre_wells)
    pre_post_all.append(mapped)

    # Keep a per-timepoint CSV ('t0.csv', ...) alongside the in-memory table
    mapped.to_csv(timepoint + '.csv')

# Condense output
condensed = kchip_analyze.stack_timepoints(droplets, pre_post_all, timepoints)

In [None]:
# Save outputs
# Write the droplets table and the condensed table as CSV files in the working directory.
for frame, filename in ((droplets, 'droplets.csv'), (condensed, 'output.csv')):
    frame.to_csv(filename)

## Quality control outputs

### Chip loading and global well positions

In [None]:
# Average the pre-merge position and edge flag per well (one row per 'Hash')
# in the last timepoint's table. Only the needed columns are aggregated, and
# the groupby is evaluated once.
per_well = pre_post_all[-1].groupby('Hash')[['Pre_GlobalX', 'Pre_GlobalY', 'Pre_Edge']].mean()

well_positions = per_well[['Pre_GlobalX', 'Pre_GlobalY']].values

# mean() can hand the flag back as floats; NumPy only treats a boolean array
# as a row mask, so cast explicitly before using it to select rows.
edge = per_well['Pre_Edge'].values.astype(bool)

fig, axes = plt.subplots(figsize=(10,10))

# All wells first, then the wells flagged as edge drawn on top in a second color.
# Y is negated when plotting.
axes.plot(well_positions[:, 0], -well_positions[:, 1], '.', ms=2)
axes.plot(well_positions[edge, 0], -well_positions[edge, 1], '.', ms=2)
plt.axis('off');

### Plot by total droplets in well

In [None]:
def plot_by_count(df,ax):
    """Plot well positions on `ax`, one series per distinct 'Total' value.

    Parameters
    ----------
    df : DataFrame with 'Total', 'Pre_GlobalX' and 'Pre_GlobalY' columns.
    ax : matplotlib Axes to draw on.

    Returns
    -------
    The same `ax`, so the function can be used with DataFrame.pipe.
    """
    for item in df['Total'].unique():
        pos = df[df['Total'] == item][['Pre_GlobalX', 'Pre_GlobalY']].values
        # Label each series with the value it was drawn for, so the legend
        # cannot drift out of step with the plotted data.
        ax.plot(pos[:, 0], -pos[:, 1], '.', ms=2, label=item)
    return ax

fig, axes = plt.subplots(figsize=(10,10))

# Attach each well's pre-merge position (taken from the last timepoint's table)
# to its droplet count, then plot.
(condensed[['Hash','Total','t2_Area']]
    .merge(pre_post_all[-1][['Hash','Pre_GlobalX','Pre_GlobalY','Pre_Edge']],on='Hash')
    .pipe(plot_by_count,ax=axes))

# The legend picks up the per-series labels set in plot_by_count; it is
# anchored just outside the upper-right corner of the axes.
axes.legend(loc=2,bbox_to_anchor=(1.05,1))
plt.axis('off');

### Plot area over chip

In [None]:
from matplotlib import colors

# Attach each well's pre-merge position (from the last timepoint's table) to its 't2_Area'
data = condensed[['Hash','Total','t2_Area']] \
    .merge(pre_post_all[-1][['Hash','Pre_GlobalX','Pre_GlobalY','Pre_Edge']],on='Hash')

# Fixed-width area bins covering [0, 2000)
width = 1e1
bins = np.arange(0,2e3,width)

# Interval that each well's area falls into; pd.cut yields NaN for values outside the bins
bin_by_area = pd.cut(data['t2_Area'],bins)

# Number the bins that actually occur, in sorted order: {bin: index}
bin_label = {item: i for i, item in enumerate(bin_by_area.unique().sort_values())}

data['Area_Bin']=[bin_label[item] for item in bin_by_area]

cmap = colors.LinearSegmentedColormap.from_list('', ['green','yellow','red','violet'],N=data['Area_Bin'].max())

fig, axes = plt.subplots(1,2,figsize=(20,10))

# Left panel: chip map colored by area bin (y is negated when plotting)
axes[0].scatter(data['Pre_GlobalX'],-data['Pre_GlobalY'],c=data['Area_Bin'],cmap=cmap,marker='.',s=10,alpha=1)
axes[0].axis('off')

# {index: bin}; .items() behaves the same on Python 2 and 3, unlike .iteritems()
invert_label = {v: k for k, v in bin_label.items()}

# Right panel: histogram of wells per bin, each bar in the color used on the map
for label, interval in invert_label.items():
    # Skip the NaN entry (no bin), which has no midpoint to place a bar at
    if pd.notnull(interval):
        axes[1].bar(interval.mid,(data['Area_Bin']==label).sum(),width=width,color=cmap(label))

axes[1].set_xlabel('Area')
axes[1].set_ylabel('Count')

### Representation

In [None]:
import seaborn as sns

fig, ax = plt.subplots()

# Per-label counts of non-null entries; the 'RX' column is the one plotted
label_counts = droplets.groupby('Label').count()
tallest = label_counts['RX'].max()

# Unconnected points, one per label
p = sns.pointplot(
    data=label_counts.reset_index(),
    x='Label',
    y='RX',
    join=False,
)

# Vertical tick labels, and headroom of 50% above the largest count
p.set_xticklabels(p.get_xticklabels(), rotation=90)
p.set_ylim([0, tallest*1.5]);
p.set_ylabel('Count')
p.set_title('Representation')

### Histograms of GFP and Area values

In [None]:
# Select columns containing 't<digits>' that are not followed by '_' (this
# excludes the 't<digits>_Area' columns). The lookahead also rejects a
# following digit so the match cannot backtrack to 't1' inside 't10_Area'.
# Raw string: '\d' is not a valid escape in a normal string literal.
timepoints = condensed.filter(regex=r't\d+(?![\d_])').columns.values

# squeeze=False always returns a 2-D array of axes, so the loop below also
# works when only one column is selected (plt.subplots would otherwise return
# a bare Axes object, which cannot be zipped).
fig, axes = plt.subplots(1,len(timepoints),figsize=(4*len(timepoints),4),squeeze=False)
axes = axes.ravel()

bins = np.arange(0,1e4,1e2)

# One histogram per selected column, all sharing the same bins; NaNs are dropped
for ax, t in zip(axes,timepoints):
    ax.hist(condensed[t].dropna(),bins=bins)
    ax.set_xlabel('GFP Fluorescence')
    ax.set_ylabel('Counts')
    ax.set_title('Signal at '+t)

plt.tight_layout()

In [None]:
# Select the 't<digits>_Area' columns.
# Raw string: '\d' is not a valid escape in a normal string literal.
timepoints = condensed.filter(regex=r't\d+_Area').columns.values

# squeeze=False always returns a 2-D array of axes, so the loop below also
# works when only one column is selected (plt.subplots would otherwise return
# a bare Axes object, which cannot be zipped).
fig, axes = plt.subplots(1,len(timepoints),figsize=(4*len(timepoints),4),squeeze=False)
axes = axes.ravel()

bins = np.arange(0,2e3,1e1)

# One histogram per selected column, all sharing the same bins; NaNs are dropped
for ax, t in zip(axes,timepoints):
    ax.hist(condensed[t].dropna(),bins=bins)
    ax.set_xlabel('Area')
    ax.set_ylabel('Counts')
    # t[:-5] strips the trailing '_Area', leaving the timepoint name
    ax.set_title('Area at '+t[:-5])

plt.tight_layout()