In [None]:
# coding: utf-8
import ast
import os
import sys
import re
from pprint import pprint
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt


from func import simulation as sim

##############################################
# Monte-Carlo simulation of BFG-Y2H screen   #
# Initially developed by Nozomu Yachie       #
# Last modified by Daniel Evans-Yamamoto     #
##############################################

# Label for this simulation run; it is passed to the sim coverage helpers below.
run_name = "test"

In [None]:
# Number of strains in each haploid pool (X and Y); edit these to resize the simulated screen.
x_strain_num, y_strain_num = 500, 500

In [None]:
# Build the starting haploid pools: one [strain_name, cell_count] entry per
# strain, each starting with a count of 1 (the pools are handed to
# sim.adjust_num in the next step).
# Names are zero-padded ("x_0001" ... ); the earlier "%4d" format padded with
# spaces and produced names with embedded blanks such as "x_   1".
x_hap = [["x_%04d" % idx, 1] for idx in range(1, x_strain_num + 1)]
y_hap = [["y_%04d" % idx, 1] for idx in range(1, y_strain_num + 1)]

In [None]:
# Total cells in each haploid pool: 10 OD units, with 1 OD unit of haploid cells = 3 x 10^7 cells.
hap_size = 10 * 3 * 10**7

# Adjust the cell number of every strain in each pool to the pool size.
# NOTE(review): the 0.3 argument is presumably a CV, by analogy with the CV
# arguments documented for the diploid steps - confirm in func.simulation.
x_hap = sim.adjust_num(x_hap, 0.3, hap_size)
y_hap = sim.adjust_num(y_hap, 0.3, hap_size)

# Coverage table for the two haploid pools (printed in the next cell).
cov = sim.hap_coverage(x_hap, y_hap, run_name)


In [None]:
# Report haploid-pool coverage: a header, the pool size expressed in OD600 units
# (hap_size divided by 3 x 10^7 cells per OD unit), then the coverage table.
# The header previously misspelled "least" as "lesat".
print("The ratio of strains having at least the number of cells above threshold.")
print("Cells per haploid pool: %d OD600nm units" % (hap_size / (3 * 10**7)))
sim.print_LL(cov)

In [None]:
# Simulate the diploid cells obtained from en masse yeast mating followed by diploid selection.

mating_efficacy   = 0.01               # deliberately underestimated to stay on the safe side
diploid_selection = 500 * 3 * 10**7    # amount of cells subjected to diploid selection
est_diploids      = diploid_selection * mating_efficacy

# Form the X-Y diploids, then adjust their numbers to the estimated diploid count.
# The CV of X-Y pair-dependent mating efficiencies (log-normal distribution) is 50%.
dip = sim.adjust_num(sim.mating(x_hap, y_hap), 0.5, est_diploids)
cov = sim.dip_coverage(dip, run_name)


print("The ratio of diploid strains having at least the number of cells above threshold.")
sim.print_LL(cov)

# To estimate the optimal parameter for coverage, wrap this cell in a for loop
# over the variable of interest and draw a line plot of the results.

In [None]:
###############################################################
# Diploid selection and amount of cells spread per plate

# Amount of cells spread onto each selection plate.
# 1 OD(600 nm) unit for diploid yeast is 1 x 10^7 cells ml^-1.
dip_plate = 50 * 1.0 * 10**7
###############################################################################################################
### !!!! After observing CFUs on the plates, update dip_plate to check that the CFU count is enough downstream !!
###############################################################################################################

# The CV of X-Y pair-dependent growth amplitudes (log-normal distribution) in liquid media is 50%.
dip = sim.adjust_num(dip, 0.5, dip_plate)


In [None]:
#pprint(dip)

In [None]:
# Number of cells carried into dox induction.
cells_for_dox_induction = 25 * 1.0 * 10**7

# Non-selectable condition: the CV of X-Y pair-dependent growth amplitudes
# (log-normal distribution) is 10%.
dip_neg = sim.adjust_num(dip, 0.1, cells_for_dox_induction)

# PPI positive rate is 0.1% (overestimation).
pos_rate = 0.001
# Log CV for autoactivity in BFG-PCA is 10% for each of DHFR12 and DHFR3.
# It was 300% for DB and 60% for AD in BFG-Y2H.
CV_autoactivity = 0.1

# BFG-PCA-selectable condition: the CV of X-Y pair-dependent growth amplitudes
# (log-normal distribution) is 10%.
dip_pos = sim.positive_interaction(
    dip, 0.10, CV_autoactivity, pos_rate, cells_for_dox_induction
)



In [None]:
# Coverage of the diploid pools carried over to the next step.
cov_pos = sim.dip_coverage(dip_pos, run_name)
# Restrict to entries whose [0][2] field equals 1; the printout below labels
# these as the strains with positive PPIs.
cov_pos_ppi = sim.dip_coverage(
    list(filter(lambda entry: entry[0][2] == 1, dip_pos)), run_name
)
cov_neg = sim.dip_coverage(dip_neg, run_name)

print("The ratio of diploid strains having at least the number of cells carried over to next step above threshold.")
# Print each coverage table under its heading, in a fixed order.
for heading, table in (
    ("\nNon-selection:", cov_neg),
    ("\nSelection :", cov_pos),
    ("\nCoverage of strains with positives PPIs in selectable condition", cov_pos_ppi),
):
    print(heading)
    sim.print_LL(table)

In [None]:
# You will be able to plot a heatmap of abundance at this stage.

In [None]:
###################################################
# Plasmid prep

# Number of plasmid molecules carried forward (see the estimate below).
plasmid_molecules = 1.4 * 10**7

###################################################
# Molecule estimate when performing 4 PCR reactions after plasmid prep:
#
#   100 pg / µL * 16 µL * 4 reactions = 6400 pg
#   The fraction of Y2H plasmids in a yeast DNA miniprep is 6% of the total DNA mass
#   Barcode fusion efficiency is 20%
#   6400 pg * 0.06 * 0.2 = 153.6 pg   (figure as recorded in the original notes)
#   NOTE(review): 6400 * 0.06 * 0.2 evaluates to 76.8 pg, not 153.6 pg; 153.6 pg
#   would correspond to a 40% fusion efficiency. Confirm which input is intended -
#   plasmid_molecules above follows the 153.6 pg figure.
#   Y2H plasmid sizes are 10 kbp
#   Double-stranded DNA weighs 660 g mol^-1 per bp, so 1 ng of a 10 kbp plasmid = 0.152 fmol
#   153.6 pg * (1/1000 * 0.152) fmol = 153.6 * 10**-3 * 0.152 * 10**-15 * 6.03 * 10**23
#                                    = 153.6 * 6.03 * 0.152 * 10**5
#                                    = 1.4 * 10**7 molecules
######################################################

# Adjust both conditions to the plasmid molecule count (second argument 0.5,
# as in the other sampling steps).
plasmid_neg = sim.adjust_num(dip_neg, 0.5, plasmid_molecules)
plasmid_pos = sim.adjust_num(dip_pos, 0.5, plasmid_molecules)


In [None]:
###############################################################
# PCR re-amplification and Illumina sequencing

# Reads for each condition (add Barcode Fusion type (BC1-BC1 or BC2-BC2) in each condition).
reads_per_condition = 2 * 10**6

# Adjust the plasmid pools of both conditions to the read depth.
read_neg = sim.adjust_num(plasmid_neg, 0.5, reads_per_condition)
read_pos = sim.adjust_num(plasmid_pos, 0.5, reads_per_condition)
# Subset of read_pos whose [0][2] field equals 1, adjusted once more to the
# full read depth.
# NOTE(review): read_pos_PPI is not used in the visible part of the notebook - confirm it is still needed.
read_pos_PPI = sim.adjust_num(
    list(filter(lambda entry: entry[0][2] == 1, read_pos)), 0.5, reads_per_condition
)



In [None]:
# Coverage at the final output (sequencing reads).
cov_pos = sim.dip_coverage(read_pos, run_name)
# Restrict to entries whose [0][2] field equals 1; the printout below labels
# these as the strains with positive PPIs.
cov_pos_ppi = sim.dip_coverage(
    list(filter(lambda entry: entry[0][2] == 1, read_pos)), run_name
)
cov_neg = sim.dip_coverage(read_neg, run_name)

print("The ratio of diploid strains having at least the number of reads sequenced.")
# Print each coverage table under its heading, in a fixed order.
for heading, table in (
    ("\nNon-selection:", cov_neg),
    ("\nSelection :", cov_pos),
    ("\nCoverage of strains with positives PPIs in selectable condition", cov_pos_ppi),
):
    print(heading)
    sim.print_LL(table)

In [None]:
# Data-formatting helpers that prepare the simulated reads for plotting.

# Haploid abundance histogram: marginal X and Y values from the non-selection reads.
marginal_x_hap, marginal_y_hap = sim.marginal(read_neg)

# Diploid abundance histograms, one per condition.
# The two inputs were previously swapped (abundance_pos was computed from
# read_neg and abundance_neg from read_pos); each name now matches its
# condition. sim.abundance is still called on read_neg first, as before.
abundance_neg = sim.abundance(read_neg)
abundance_pos = sim.abundance(read_pos)

# Format the non-selection reads as an array for heatmap representation.
hmap_read_neg = sim.format_array(read_neg)