In [None]:
import pandas as pd
from peseq.analysis import heat
from peseq.utils import DNA
import numpy as np
import os

%matplotlib inline
import matplotlib
# Default size (width, height in inches) for every figure drawn in this notebook.
matplotlib.rcParams['figure.figsize'] = [12, 10]

In [None]:
# User parameters

# Directory in which your variant Excel file is located
WORKSPACE_DIRECTORY = "example_data/"

# The name of your variant Excel file
FILE_NAME = "filex.xlsx"

# Optional: name of the column to sort the library by, in decreasing order of
# its value (e.g. an enrichment score).
# Set to None if sorting is not needed.
SORTED_COLUMN_NAME = "Library1_enrichment"
# SORTED_COLUMN_NAME = None

# True:  weight every variant by its enrichment score(s) (see ENRICHMENT_COLUMN_NAMES).
# False: disregard enrichment and give every variant an equal score of 1.
USE_ENRICHMENT_SCORE = True

# Enrichment column(s) used when USE_ENRICHMENT_SCORE is True. The scores of all
# listed columns are summed per variant, so list several libraries to combine
# them, or a single column to analyse one library on its own.
ENRICHMENT_COLUMN_NAMES = ["Library1_enrichment", "Library2_enrichment"]

# Number of sequences to include, counted from the top of the table. When the
# library is sorted, e.g. 25 selects the 25 most enriched sequences.
NUM_SEQUENCES = 2

# Name of the column that holds the amino acid sequences of the library
SEQUENCE_COLUMN_NAME = "7mer_Amino Acid"

# Choose which template to use for normalization
TEMPLATE = "7-mer-i NNK"
# TEMPLATE = "3-mer-s XXX"

# Length of the diversified region (7 for the 7-mer-i library).
# Note: the "3-mer-s XXX" normalization matrix has 11 rows, so it needs a length of 11.
SEQUENCE_LENGTH= 7
#SEQUENCE_LENGTH=11 # for an 11-mer diversified sequence

# Name of the CSV file that receives the normalized and standardized table
OUTPUT_FILE_NAME = 'library-top25.csv'

# Title for the inline heatmaps
HEATMAP_TITLE = "Library 1 and 2 AA distribution"

In [None]:
# Resolve the workbook location from the user parameters, then load its
# contents into a DataFrame.
file = os.path.join(WORKSPACE_DIRECTORY, FILE_NAME)
df = pd.read_excel(io=file)
# df.head()

In [None]:
# Optionally rank the variants so that the first NUM_SEQUENCES rows are the
# highest-scoring ones.
if SORTED_COLUMN_NAME:
    # sort_values returns a new DataFrame; the result has to be assigned back,
    # otherwise df silently keeps its original row order.
    df = df.sort_values(by=SORTED_COLUMN_NAME, ascending=False)

# Independent of sorting: when the enrichment score is disregarded, give every
# variant an equal score of 1. This must not be nested under the sort check,
# because the accumulation step reads 'seq_1' whenever USE_ENRICHMENT_SCORE is
# False, whether or not the table was sorted.
if not USE_ENRICHMENT_SCORE:
    df['seq_1'] = 1
# df.to_csv('R2_DNA_Virus_pool_dna20_virus20_seq1.csv')   #optional

In [None]:
# Column labels of the score table, as returned by DNA.get_amino_acids().
# Later cells look up each character of a sequence in this list, so the entries
# are presumably one-letter amino acid codes — confirm against peseq.utils.DNA.
columns = DNA.get_amino_acids()
print(columns)

In [None]:
# Row labels for the heatmaps: the position indices of the diversified region
# in descending order (SEQUENCE_LENGTH - 1 down to 0).
Rows = list(reversed(range(SEQUENCE_LENGTH)))
print(Rows)

In [None]:
# Zero-filled score table: one row per sequence position and one column per
# entry of `columns`.
n_positions, n_labels = SEQUENCE_LENGTH, len(columns)
table = np.zeros(shape=(n_positions, n_labels))
# print(table)

In [None]:
# Build the position-by-amino-acid score table: for each of the first
# NUM_SEQUENCES variants, add the variant's score to the cell of every
# (position, amino acid) pair that occurs in its sequence.
table = np.zeros((SEQUENCE_LENGTH, len(columns)))
for i in range(NUM_SEQUENCES):
    row = df.iloc[i]

    if not USE_ENRICHMENT_SCORE:
        # Equal weighting: every variant contributes the constant 'seq_1' score.
        enrich = row['seq_1']
    else:
        # Sum the enrichment scores of all requested libraries for this variant.
        enrich = sum(row[column] for column in ENRICHMENT_COLUMN_NAMES)

    aminoacid = row[SEQUENCE_COLUMN_NAME]
    #print(aminoacid)

    # Walk over the positions of the sequence. The bound is the sequence
    # length, not the number of sequences: using NUM_SEQUENCES here would count
    # only the first NUM_SEQUENCES positions, or run past the end of the
    # sequence when NUM_SEQUENCES > SEQUENCE_LENGTH.
    for j in range(SEQUENCE_LENGTH):
        col = aminoacid[j]
        pos = columns.index(col)  # raises ValueError for a symbol not in `columns`
        #print(col,pos)
        table[j, pos] += enrich
print(table)

In [None]:
# Plot the raw (un-normalized) score table.
heatmap = heat.heatmap(
    title=HEATMAP_TITLE,
    data=table,
    x_labels=columns,
    y_labels=Rows,
)
heat.heatmap.draw(heatmap)

In [None]:
#normalization
# Divide the score table by the expected ("natural") frequency of each amino
# acid for the chosen library template, so that amino acids that are simply more
# common in the template are not over-represented in the heatmap.

if TEMPLATE == "7-mer-i NNK":
    # One frequency per amino acid column; the same frequencies apply to every position.
    natural_frequency = np.array([0.063,0.031,0.031,0.031,0.031,0.063,0.031,0.031,0.031,0.094,0.031,0.031,0.063,0.031,0.094,0.094,0.063,0.063,0.031,0.063])
elif TEMPLATE == "3-mer-s XXX":
    # Position-specific frequencies: one row per sequence position (11 rows).
    natural_frequency = np.array([[0.187, 0.031, 0.031, 0.031, 0.031, 0.063, 0.031, 0.031, 0.031, 0.094, 0.031, 0.031, 0.063, 0.031, 0.094, 0.094, 0.063, 0.063, 0.031, 0.063],
                      [0.063, 0.031, 0.031, 0.031, 0.031, 0.063, 0.031, 0.031, 0.031, 0.094, 0.031, 0.031, 0.063, 0.155, 0.094, 0.094, 0.063, 0.063, 0.031, 0.063],
                      [0.126, 0.062, 0.062, 0.062, 0.062, 0.126, 0.062, 0.062, 0.062, 0.188, 0.062, 0.062, 0.126, 0.062, 0.188, 0.188, 0.219, 0.126, 0.062, 0.126],
                      [0.063, 0.031, 0.031, 0.031, 0.031, 0.063, 0.031, 0.031, 0.031, 0.218, 0.031, 0.031, 0.063, 0.031, 0.094, 0.094, 0.063, 0.063, 0.031, 0.063],
                      [0.219, 0.062, 0.062, 0.062, 0.062, 0.126, 0.062, 0.062, 0.062, 0.188, 0.062, 0.062, 0.126, 0.062, 0.188, 0.188, 0.126, 0.126, 0.062, 0.126],
                      [0.063, 0.031, 0.031, 0.031, 0.031, 0.063, 0.031, 0.031, 0.031, 0.094, 0.031, 0.031, 0.063, 0.031, 0.094, 0.094, 0.063, 0.187, 0.031, 0.063],
                      [0.126, 0.062, 0.062, 0.062, 0.062, 0.126, 0.062, 0.062, 0.062, 0.188, 0.062, 0.062, 0.219, 0.062, 0.188, 0.188, 0.126, 0.126, 0.062, 0.126],
                      [0.063, 0.031, 0.031, 0.031, 0.155, 0.063, 0.031, 0.031, 0.031, 0.094, 0.031, 0.031, 0.063, 0.031, 0.094, 0.094, 0.063, 0.063, 0.031, 0.063],
                      [0.126, 0.062, 0.062, 0.062, 0.062, 0.126, 0.062, 0.062, 0.155, 0.188, 0.062, 0.062, 0.126, 0.062, 0.188, 0.188, 0.126, 0.126, 0.062, 0.126],
                      [0.187, 0.031, 0.031, 0.031, 0.031, 0.063, 0.031, 0.031, 0.031, 0.094, 0.031, 0.031, 0.063, 0.031, 0.094, 0.094, 0.063, 0.063, 0.031, 0.063],
                      [0.063, 0.031, 0.031, 0.031, 0.031, 0.063, 0.031, 0.031, 0.031, 0.094, 0.031, 0.031, 0.063, 0.155, 0.094, 0.094, 0.063, 0.063, 0.031, 0.063]])
else:
    # Fail loudly instead of hitting a NameError below or, worse, silently
    # reusing a natural_frequency left over from an earlier run of this cell.
    raise ValueError(
        'Unknown TEMPLATE {!r}; expected "7-mer-i NNK" or "3-mer-s XXX"'.format(TEMPLATE)
    )

# A position-specific frequency matrix must have exactly one row per position
# of the score table; report a mismatch clearly rather than as a broadcast error.
if natural_frequency.ndim == 2 and natural_frequency.shape[0] != table.shape[0]:
    raise ValueError(
        "TEMPLATE {!r} defines frequencies for {} positions, but the score table "
        "has {} (check SEQUENCE_LENGTH)".format(
            TEMPLATE, natural_frequency.shape[0], table.shape[0]
        )
    )

table_norm = np.divide(table, natural_frequency)
print(table_norm)

In [None]:
# Optional: plot the heatmap of the frequency-normalized table.
heatmap_norm = heat.heatmap(
    title=HEATMAP_TITLE + " normalized",
    data=table_norm,
    x_labels=columns,
    y_labels=Rows,
)
heat.heatmap.draw(heatmap_norm)

In [None]:
# Standardization: centre the normalized table on its overall mean and scale it
# by its overall standard deviation (both taken over every cell of the table).
standardized_values = (table_norm - table_norm.mean()) / table_norm.std()
print(standardized_values)

# Label the columns and write the result to the CSV output file.
table_standardized = pd.DataFrame(standardized_values, columns=columns)
table_standardized.to_csv(OUTPUT_FILE_NAME)

# Heatmap of the normalized and standardized table.
heatmap1std = heat.heatmap(
    title=HEATMAP_TITLE + " normalized and standardized",
    data=table_standardized,
    x_labels=columns,
    y_labels=Rows,
)
heat.heatmap.draw(heatmap1std)