In [None]:
#import libraries 
import pandas as pd
import os
import numpy as np
import scipy as sp
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout
from plotly import offline as plotly_offline
plotly_offline.init_notebook_mode()


from numpy import arange,array,ones
from scipy import stats
from scipy.optimize import curve_fit
from scipy import log as log
import matplotlib.pyplot as plt

import seaborn as sns
import pylab

In [None]:
# Path of the folder that holds the input ".xlsx" files.
# The analysis cells build every input path with os.path.join(WORKSPACE_DIRECTORY, <file name>),
# so a relative value such as this one is resolved against the current working directory.
WORKSPACE_DIRECTORY = "example_data"

In [None]:
#determine the empirical cumulative distribution (ECDF) of two libraries

# User parameters - customize for count or enrichment (or other property)
LIBRARY_1_NAME = "Library 1"
LIBRARY_2_NAME = "Library 2"
LIBRARY_1_FILE_NAME = "filex1.xlsx"
LIBRARY_2_FILE_NAME = "filex2.xlsx"
LIBRARY_1_COLUMN_NAME = "Library1_enrichment"
LIBRARY_2_COLUMN_NAME = "Library2_enrichment"
X_AXIS_NAME = "Enrichment"
CHART_TITLE = "Enrichment cumulative distribution"


def ecdf(values):
    """Return the empirical cumulative distribution of ``values``.

    Parameters
    ----------
    values : array-like
        One-dimensional collection of observations.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x, y)`` where ``x`` holds the observations in ascending order and
        ``y[i]`` is ``(i + 1) / n``, i.e. the rank of ``x[i]`` divided by the
        number of observations, so the last entry of ``y`` is 1.0.
    """
    x = np.sort(values)
    y = np.arange(1, len(x) + 1) / len(x)
    return x, y


#library 1 data - read the ".xlsx" input file and pick the analyzed column
file1 = os.path.join(WORKSPACE_DIRECTORY, LIBRARY_1_FILE_NAME)
refile1 = pd.read_excel(file1)
data1 = refile1[LIBRARY_1_COLUMN_NAME]
x1, y1 = ecdf(data1)

#library 2 data - same steps, using the library 2 file and the library 2 column
file2 = os.path.join(WORKSPACE_DIRECTORY, LIBRARY_2_FILE_NAME)
refile2 = pd.read_excel(file2)
data2 = refile2[LIBRARY_2_COLUMN_NAME]
#data2 = refile1[LIBRARY_2_COLUMN_NAME]  # use this line instead if both columns live in the library 1 file
x2, y2 = ecdf(data2)

#draw both curves on one explicit figure/axes pair
fig, ax = plt.subplots()
ax.plot(x1, y1, c='blue', label=LIBRARY_1_NAME)  #library 1 in blue
ax.plot(x2, y2, c='red', label=LIBRARY_2_NAME)   #library 2 in red

#plot features
ax.set_xscale('log')          #x-axis scale is set to log
ax.set_ylabel('ECDF')         #ECDF - empirical cumulative distribution function
ax.set_xlabel(X_AXIS_NAME)    #label comes from the user parameters above
ax.margins(0.02)              #keeps data off plot edges
ax.set_title(CHART_TITLE)
ax.legend(loc='upper right')
#ax.axis([0,3.5,0,10000000])  #setting the axis range is optional
fig.savefig('cumulative_distributionplot.png', format='png', dpi=300)  #save the figure as png

In [None]:
#Interactive Plotly scatter of two libraries, each ordered from its highest to its lowest value

# User parameters
LIBRARY_1_NAME = "Library 1"
LIBRARY_2_NAME = "Library 2"
LIBRARY_1_FILE_NAME = "filex1.xlsx"
LIBRARY_2_FILE_NAME = "filex2.xlsx"
LIBRARY_1_COLUMN_NAME = "Library1_enrichment"
LIBRARY_2_COLUMN_NAME = "Library2_enrichment"
X_AXIS_NAME = "Enrichment"
CHART_TITLE = "Enrichment cumulative distribution"
SEQUENCE_COLUMN_NAME = "7mer_Amino Acid"

#library 1: read the ".xlsx" input and order its rows by decreasing value
file1 = os.path.join(WORKSPACE_DIRECTORY, LIBRARY_1_FILE_NAME)
refile1 = pd.read_excel(file1)
refile1sort = refile1.sort_values(LIBRARY_1_COLUMN_NAME, ascending=False)
data1 = refile1sort[LIBRARY_1_COLUMN_NAME]   #values drawn on the y-axis
text1 = refile1sort[SEQUENCE_COLUMN_NAME]    #optional hover text, one entry per marker

#library 2: identical steps; repeat this pattern for every further trace
file2 = os.path.join(WORKSPACE_DIRECTORY, LIBRARY_2_FILE_NAME)
refile2 = pd.read_excel(file2)
refile2sort = refile2.sort_values(LIBRARY_2_COLUMN_NAME, ascending=False)
data2 = refile2sort[LIBRARY_2_COLUMN_NAME]
text2 = refile2sort[SEQUENCE_COLUMN_NAME]

#the traces share marker shape and size and differ only in colour
marker_shape = dict(symbol='circle-open', size=5)
trace1 = go.Scatter(
    y=data1,
    text=text1,
    mode='markers',
    marker=dict(marker_shape, color='rgb(206, 137, 0)'),
    name=LIBRARY_1_NAME,
)
trace2 = go.Scatter(
    y=data2,
    text=text2,
    mode='markers',
    marker=dict(marker_shape, color='rgb(102,204,0)'),
    name=LIBRARY_2_NAME,
)

data = [trace1, trace2]  #every trace listed here ends up on the figure

#one font definition reused for the figure text and both axis titles
label_font = dict(family='Arial, monospace', size=18, color='#000000')

#layout; optional extras: width=800, height=500, hovermode='closest', or type='log' inside yaxis
layout = go.Layout(
    title=CHART_TITLE,
    font=dict(label_font),
    xaxis=dict(title='add xaxis title', titlefont=dict(label_font)),
    yaxis=dict(title='add yaxis title', titlefont=dict(label_font)),
)

fig = go.Figure(data=data, layout=layout)
plotly_offline.iplot(fig, filename='Plotly_interative_lib_distribution.html')

In [None]:
# Standard score (z-score) of one raw data column.

LIBRARY_FILE_NAME = "filex1.xlsx"
LIBRARY_COLUMN_NAME = "Library1_enrichment"
STANDARDIZED_OUTPUT_FILE_NAME = "filex_stdscore.xlsx"

#read the input ".xlsx" file and pick the column to standardize
file1 = os.path.join(WORKSPACE_DIRECTORY, LIBRARY_FILE_NAME)
refile1 = pd.read_excel(file1)
data1 = refile1[LIBRARY_COLUMN_NAME]

#centre on the column mean, then scale by the column standard deviation;
#the result is stored in a new column - rename 'std_data1' here and below if desired
centered = data1 - data1.mean()
refile1['std_data1'] = centered / data1.std()

#optional: highest standard score first
refile1_sort = refile1.sort_values('std_data1', ascending=False)

#write the table, including the new column, to an excel sheet
refile1_sort.to_excel(STANDARDIZED_OUTPUT_FILE_NAME)


In [None]:
#code to determine the enrichment score of each variant across a whole library (or specific column)
#enrichment = log10( (value / column total) / (reference value / reference column total) )

LIBRARY_FILE_NAME = "filex1.xlsx"
LIBRARY_1_COLUMN_NAME = "Library1_enrichment"
LIBRARY_2_COLUMN_NAME = "Library2_enrichment"
ENRICHMENT_OUTPUT_FILE_NAME = "filex1_data1enrich.xlsx"

#read the input ".xlsx" file
file1 = os.path.join(WORKSPACE_DIRECTORY, LIBRARY_FILE_NAME)
refile1 = pd.read_excel(file1)

#column with the un-normalized raw data
data1 = refile1[LIBRARY_1_COLUMN_NAME]
#column used as the reference the raw data is normalized against
norm = refile1[LIBRARY_2_COLUMN_NAME]

#turn both columns into fractions of their own column total
data1n = data1 / data1.sum()
normn = norm / norm.sum()

#ratio of the two fractions; a 0 in the reference column gives inf (or NaN for 0/0)
#and a 0 in the raw column gives -inf after the logarithm
data_n = data1n / normn
refile1['data1_enrich'] = np.log10(data_n)  #rename 'data1_enrich' here and below if desired

#optional: sort by decreasing order of enrichment score
refile1sort = refile1.sort_values(by='data1_enrich', ascending=False)

#the output name ends in ".xlsx", so write a real Excel workbook rather than CSV text
refile1sort.to_excel(ENRICHMENT_OUTPUT_FILE_NAME)

In [None]:
#code to plot multiple histograms (of log10-transformed values) on the same plot

# User parameters - customize for count or enrichment (or other property)
LIBRARY_1_NAME = "Library 1"
LIBRARY_2_NAME = "Library 2"
LIBRARY_1_FILE_NAME = "filex1.xlsx"
LIBRARY_2_FILE_NAME = "filex2.xlsx"
LIBRARY_1_COLUMN_NAME = "Library1_enrichment"
LIBRARY_2_COLUMN_NAME = "Library2_enrichment"


#library 1 data - read the ".xlsx" input file
file1 = os.path.join(WORKSPACE_DIRECTORY, LIBRARY_1_FILE_NAME)
refile1 = pd.read_excel(file1)

#library 2 data - read the ".xlsx" input file
file2 = os.path.join(WORKSPACE_DIRECTORY, LIBRARY_2_FILE_NAME)
refile2 = pd.read_excel(file2)

#columns that are analyzed, one from each file
x1 = refile1[LIBRARY_1_COLUMN_NAME]
x2 = refile2[LIBRARY_2_COLUMN_NAME]

#log scale
data1 = np.log10(x1)
data2 = np.log10(x2)

#draw both histograms on one explicit axes so they overlap (alpha keeps both visible)
fig, ax = plt.subplots()
sns.distplot(data1, hist=True, kde=False, label=LIBRARY_1_NAME, color='#ff8b94', hist_kws={"alpha": 0.5}, ax=ax)
sns.distplot(data2, hist=True, kde=False, label=LIBRARY_2_NAME, color='#ff2560', hist_kws={"alpha": 0.5}, ax=ax)
ax.set_facecolor("#FFFFFF")
ax.legend()  #without this call the labels passed above are never shown
fig.savefig('Seaborn_plot_histogram.png', format='png', dpi=1000)


In [None]:
#correlation analysis between two libraries: scatter plot plus least-squares regression line

LIBRARY_FILE_NAME = "filex1.xlsx"
LIBRARY_1_COLUMN_NAME = "Library1_enrichment"
LIBRARY_2_COLUMN_NAME = "Library2_enrichment"
ENRICHMENT_OUTPUT_FILE_NAME = "filex1_data1enrich.xlsx"  #not read anywhere in this cell

#read the input ".xlsx" file; both columns come from the same file
file1 = os.path.join(WORKSPACE_DIRECTORY, LIBRARY_FILE_NAME)
refile1 = pd.read_excel(file1)

#columns that are compared
x = refile1[LIBRARY_1_COLUMN_NAME]
y = refile1[LIBRARY_2_COLUMN_NAME]

#linear least-squares fit of y against x
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
line = slope*x+intercept  #fitted y value for every observed x

#data points and fitted line, both grey; the colour is given once per call because
#combining a colour in the format string with color= makes matplotlib emit a warning
fig, ax = plt.subplots()
ax.plot(x, y, 'o', color='grey')
ax.plot(x, line, '-', color='grey')
ax.set_title('plot title')  #add plot title
ax.set_facecolor("#FFFFFF")
fig.savefig('Seaborn_correlationplot.png', format='png', dpi=1000)  #save file with desired name

#fit statistics
print("r-squared:", r_value**2)
print("p-value:", p_value)
print("slope:", slope)
print("std-err:", std_err)