In [1]:
# coding: utf-8
import ast
import os
import sys
import re
from pprint import pprint
import numpy as np
from tqdm import tqdm

from func import normalization as norm

##############################################
# JUPYTER-NOTEBOOK FOR NORMALIZING BARCODE   #
# COUNTS FROM BFG-PCA SCREENINGS             #
# Last modified by Daniel Evans-Yamamoto     #
##############################################

# Root directory of the project: the directory the notebook is launched from.
# Every input and output path below is built relative to it.
PATH = os.path.abspath(".")
# Name of the screening run; selects the Data/<run_name>/ sub-tree.
run_name = "test"
norm_dir = '%s/Data/%s/normalization'%(PATH,run_name)
stats_dir = '%s/Data/%s/stats'%(PATH,run_name)
# Create each output directory independently of the other. Previously
# stats_dir was only created when norm_dir was missing, so a pre-existing
# norm_dir left stats_dir uncreated, and a pre-existing stats_dir made
# os.makedirs raise FileExistsError. exist_ok=True makes re-running this
# cell safe.
os.makedirs(norm_dir, exist_ok=True)
os.makedirs(stats_dir, exist_ok=True)

In [2]:
# Defining the location of the barcode count file for this run
count_f = "%s/Data/%s/barcode_counts/counts.txt" % (PATH, run_name)

# Read the count data (a dict object) from the text file
count = norm.reading(count_f)

# Tag information and barcode database.
# Change both paths to your own files. Read the wiki for instructions.
# NOTE(review): unlike count_f, these two paths do not contain the "Data/"
# segment -- confirm this matches your directory layout.
tag_csv_path = '%s/%s/%s_tag.csv' % (PATH, run_name, run_name)
db_csv_path = '%s/%s/%s_database.csv' % (PATH, run_name, run_name)
multiplex_tag = norm.get_tagdata(tag_csv_path)
db = norm.get_mapdata(db_csv_path)

In [3]:
# Examine the total number of reads for each multiplexed sample
norm.tot_reads(count, multiplex_tag)

Index Sample Total reads
P01-P01 DMSO_72 26688
P02-P02 DMSO_96 20268
P03-P03 MTX1_72 31386
P04-P04 MTX1_96 17517
P05-P05 MTX10_72 59666
P06-P06 MTX10_96 48641
P07-P07 MTX100_72 80752
P08-P08 MTX100_96 70757
P09-P09 MTX200_72 63668
P10-P10 MTX200_96 68431


In [4]:
#pprint(count)

In [5]:
# Organize the counts into a dict of reads per strain (run in "PCA" mode)
count_2 = norm.organize_data(count, db, multiplex_tag, "PCA")

Working on ...
P01-P01 Control_1
P03-P03 MTX1_72_PRS
P05-P05 MTX10_72_PRS
P07-P07 MTX100_72_PRS
P09-P09 MTX200_72_PRS
P02-P02 Control_2
P04-P04 MTX1_96_PRS
P06-P06 MTX10_96_PRS
P08-P08 MTX100_96_PRS
P10-P10 MTX200_96_PRS


In [6]:
# Eliminate strains that are present in less than 90% of diploids
# (the 0.9 argument is that threshold).
count_3 = norm.absent(count_2, 0.9, "PCA")

# Sums computed from the filtered counts; passed to the scoring cells below.
count_sums = norm.get_sums(count_3, multiplex_tag, "PCA", db)




In [7]:
# Normalize scores based on the strain abundance of bait and prey haploids
# in the Control condition. The call returns two values, bound here to
# raw_s and hap_s.
raw_s, hap_s = norm.compute_s(count_3, count_sums, db, multiplex_tag, "PCA")

In [8]:
# Organize the data and count haploid strains.
# Both calls receive norm_dir; the "Generated :" lines in the cell output
# list the CSV files written there.
reps = norm.count_haploids(hap_s, norm_dir, db)
norm.haploid_replicates(reps, norm_dir)

Control_1 		Bait: , (15 ORFs) 21 Prey, (13 ORFs) 18
Control_2 		Bait: , (15 ORFs) 21 Prey, (13 ORFs) 18
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/Haploid_F.csv
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/haplliod_n.csv


In [9]:
## Normalize the signal 's'
##   Y2H : normalize data by the Nth quantile of the bait 'median distribution' of the 's' score
##   PCA : normalize data by the median 's' score of both bait and prey
# NOTE(review): hap_s is rebound by this call (it was first assigned from
# norm.compute_s); cells run after this one see the new value.
normalized_score, hap_s = norm.compute_ds(raw_s, count_sums, db, multiplex_tag, "PCA")


In [10]:
# Write the normalized scores out; the generated CSV path is reported in the cell output.
norm.output_norm_score(normalized_score, db, norm_dir)

100%|██████████| 15/15 [00:00<00:00, 208.57it/s]

Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/BFG-PCA_PPI_scores_Normalized.csv





In [11]:
#pprint(hap_s)

In [12]:
## Outputting stats as csv in 3_Stats_for_plot
## List of files to output
## (column names are reproduced verbatim, since they may mirror the actual CSV headers)
# 1. strain_abundance_control.csv   [Method,Condtion,Replicate,bait,bait_ORF,prey,prey_ORF,Raw,F]
# 2. normalized_scores           =  [['Method','Condtion','Replicate','bait','bait_ORF','prey','prey_ORF','s','ds']]
# 3. autoactivity_level.csv [Method,Strain_type,Conditon,AA_median,AA_median_rank,Strain]
# 4. bfg_corr.csv           [Method,Conditon,score_type,UpUp,DnDn]
# 5. diploid_corr.csv       [Method,Conditon,score_type,Diploid1,Diploid2]
# 6. screening_rep_corr.csv [Method,Conditon,score_type,Rep1,Rep2]
# 7. ORF_ori_corr.csv       [Method,Conditon,score_type,bait_Prey,Prey_Bait] # Average of internal replicates + screening replicates
# 8. screening_meth_corr.csv[score_type,CondA,CondB,ConditonA_score,ConditonB_score] # Average of internal replicates + screening replicates

# NOTE(review): `data` is not referenced again in the visible part of this notebook.
data = norm.reformat(normalized_score)
# The stats files are written under norm_dir (see the "Generated :" lines in the cell output).
norm.output_stats(raw_s, hap_s, db, norm_dir, "PCA")

Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/heatmap_F/F_heatmap_Control_1.csv
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/F_heatmap_MTX1_72.csv
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/F_heatmap_MTX10_72.csv
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/F_heatmap_MTX100_72.csv
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/F_heatmap_MTX200_72.csv
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/heatmap_F/F_heatmap_Control_2.csv
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/F_heatmap_MTX1_96.csv
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/F_heatmap_MTX10_96.csv
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/F_heatmap_MTX100_96.csv
Generated : /Users/danyamamotoevans/GitHub/BFG-PCA/Data/test/normalization/F_heatmap_MTX200_96.

In [13]:
# Information about each CSV file will be shown in the BFG-PCA wiki
# (https://github.com/DanYamamotoEvans/BFG-PCA/wiki).
# Scripts to visualize the output will be added soon.

print("Proceed to BFG_performance_measure.ipynb to assess the performance of the screening.")

Proceed to BFG_performance_measure.ipynb to assess the performance of the screening.
