# PyClonal 

In [1]:
%%html
<img src="image.jpg">

# Calculating Overlap Between Samples

The purpose of this notebook is to analyze T-cell receptor (TCR) sequencing data. The input data for this notebook is the TCR sequencing data after alignment. For examples of what this data looks like, go here. (TODO: add link to a page that describes the input data types.)

In [2]:
from plotly.offline import init_notebook_mode, iplot
import plotly.plotly as py
import plotly.graph_objs as go
from IPython.html.widgets import interact
init_notebook_mode(connected=True)

from plotly.offline import iplot

from scipy.spatial.distance import cosine, jaccard

import pandas as pd
import numpy as np


The `IPython.html` package has been deprecated since IPython 4.0. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.



## Step 1: Load your TCR data

In [3]:
# Directory holding the input tables, and the files to analyze.
basepath = "../sample_input_files/miron-changeo(1)/"
files = [
    "D233.changeo.tsv",
    "D255.changeo.tsv",
    "D280.changeo.tsv",
    "D287.changeo.tsv",
    "D299.changeo.tsv",
]

# Column names used to read each table. Adjust these to match the
# input format; the values below are for changeo output.
COUNTS_HEADER = "DUPCOUNT"       # per-row count column (summed downstream)
SEQ_HEADER = "CLONE_CDR3_AA"     # sequence column used as the grouping key
SAMPLE_HEADER = "SAMPLE"         # column holding the sample identifier

## Step 2: Load your meta data

In [4]:
# =======================================================
# Avoid changes below this
# =======================================================

def get_df(df, feature, token, index):
    """Return the rows of ``df`` matching one value, summed per ``index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    feature : str
        Column to filter on. It is removed from the result.
    token : object
        Value that ``df[feature]`` must equal for a row to be kept.
    index : str
        Column to group the remaining rows by.

    Returns
    -------
    pandas.DataFrame
        Indexed by the unique values of ``index``, holding the sum of the
        remaining columns for each group.
    """
    matches = df[feature] == token
    return (
        df.loc[matches]
        .drop(columns=[feature])
        .groupby(index)
        .sum()
    )

# Load one input file, collect per-sample metadata and compute the pairwise
# distances between all samples found in that file.
for filename in files:
    filepath = basepath + filename

    # extract relevant data only
    df = pd.read_table(filepath, sep="\t").loc[:, [COUNTS_HEADER, SEQ_HEADER, SAMPLE_HEADER]]

    # sum the counts per (sample, sequence) pair and flatten the result
    gp = df.groupby([SAMPLE_HEADER, SEQ_HEADER]).sum().reset_index()

    # unique sequences and samples present in this file
    seqs = set(gp[SEQ_HEADER].values)
    samples = set(gp[SAMPLE_HEADER].values)

    # map a running integer index to each sequence
    seq_indices = dict(enumerate(seqs))

    # Parse each sample name into its dash-separated fields.
    # NOTE(review): assumes every sample name has at least five "-" separated
    # tokens; a shorter name raises IndexError here.
    sample_meta_data = {}
    for sample in samples:
        toks = sample.strip().split("-")
        meta_data = {
            'patient_id': toks[0],
            'tissue_id': toks[1],
            'protein': toks[2],
            'cell_type': toks[3],
            'replicate': toks[4],
            'extras': toks[5:],
        }
        sample_meta_data[sample] = meta_data

    # Build each sample's per-sequence count table once instead of
    # recomputing it for every pair inside the double loop.
    # The iteration order is that of list(samples), so the flat distance
    # lists line up with list(samples) when they are reshaped for plotting.
    sample_order = list(samples)
    counts_by_sample = {
        sample: get_df(df, SAMPLE_HEADER, sample, SEQ_HEADER)
        for sample in sample_order
    }

    # No aggregate step for now
    distances = {'cosine': [], 'jaccard': []}
    for sample1 in sample_order:
        for sample2 in sample_order:

            if sample1 == sample2:
                # Both metrics are distances, so a sample compared with
                # itself is 0.0 for cosine and for Jaccard alike.
                distances['cosine'].append(0.0)
                distances['jaccard'].append(0.0)
                continue

            # Align the two samples on the union of their sequences;
            # a sequence missing from one sample gets a count of 0.
            ct = pd.concat([counts_by_sample[sample1],
                            counts_by_sample[sample2]], axis=1).fillna(0)
            ct.columns = [sample1, sample2]

            # cosine distance uses the actual counts
            dist = cosine(ct[sample1].values, ct[sample2].values)
            distances['cosine'].append(dist)

            # scipy's jaccard is defined for boolean vectors, so compare
            # presence/absence of each sequence rather than raw counts.
            dist = jaccard(ct[sample1].values > 0, ct[sample2].values > 0)
            distances['jaccard'].append(dist)

    # currently performing this for one file only;
    # remove the break to extend it to all files
    break

## Step 3: Make plots

In [5]:
def plot_heatmap(method):
    """Draw a heatmap of the pairwise sample distances for one metric.

    Reads the notebook-level ``samples`` and ``distances`` variables.
    The flat list ``distances[method]`` is reshaped row by row into a
    matrix with one row per sample, and both axes are labelled with
    ``list(samples)``.

    Parameters
    ----------
    method : str
        Key into ``distances`` selecting which metric to plot.
    """
    n_rows = len(samples)
    flat_values = distances[method]
    n_cols = int(len(flat_values) / n_rows)
    matrix = np.array(flat_values).reshape((n_rows, n_cols))

    heatmap = go.Heatmap(
        z=matrix,
        x=list(samples),
        y=list(samples),
    )
    iplot([heatmap], filename=method+' Distance')

In [6]:
# Show a dropdown for the distance metric; plot_heatmap is called with the
# selected value ('jaccard' is listed first and is the initial selection).
interact(plot_heatmap, method=['jaccard', 'cosine'])

interactive(children=(Dropdown(description='method', options=('jaccard', 'cosine'), value='jaccard'), Output()…

<function __main__.plot_heatmap>

# Export your data