In [1]:
import copy as cp
import itertools as itt
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import os
import pandas as pd
import pickle
import scipy.optimize as spo
import scipy.special as spsp
import scipy.stats as sps
import seaborn as sns

from matplotlib.ticker import FixedLocator

# Global matplotlib styling applied to every figure drawn in this notebook:
# font sizes for titles/labels/ticks, outward ticks and a semi-transparent
# framed legend.
mpl.rcParams.update({
    'axes.titlesize': 'xx-large',
    'axes.labelsize': 'xx-large',
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
    'xtick.direction': 'out',
    'ytick.direction': 'out',
    'legend.frameon': True,
    'legend.framealpha': 0.5,
    'legend.fontsize': 'large',
})

# Project root: the directory the notebook is run from (with a trailing '/').
path_proj = '{}/'.format(os.getcwd())
print(path_proj)

# Folder holding the single-cell data files acquired after culture.
path_sc = '{}csv/Single_cell/'.format(path_proj)

# Folder holding the sort data files.
path_sort = '{}csv/Sort/'.format(path_proj)


/Users/quanti/Desktop/M_HSC/


In [3]:
#(USER-DEFINED) cell_cols is a dictionary that assigns to each cell type a color for the plots.
#(USER-DEFINED) cell_class_exp_time is a function that determines a cell's class from its markers,
#given the user-defined thresholds from Gating_matrix, which is imported in the next cell.

# Plot color of each cell class; the keys are the class names returned by
# cell_class_exp_time, listed in the same order as their ranks.
cell_cols = {
    'CD48- SLAM-HSC': 'Red',
    'CD48lo SLAM-HSC': '#ff9400',
    'ST-HSC': '#deff19',
    'MPP': 'Limegreen',
    'SLAM+ Flt3+': 'seagreen',
    'GMP': 'aqua',
    'MEP': 'Deepskyblue',
    'SLAM+ MEP': 'Royalblue',
    'MP': 'DarkViolet',
    'CD16/32- ckit-': 'Violet',
}

def cell_class_exp_time(df, thr_cd48, thr_slam, thr_slam_sca1Neg, thr_flt3,
                        thr_sca1, thr_ckitLo, thr_ckitHi, thr_cd1632):
    """Classify one cell from its marker values and a set of gating thresholds.

    Parameters
    ----------
    df : mapping-like (e.g. one DataFrame row)
        Must support ``df['ckit']``, ``df['SCA-1']``, ``df['Flt3']``,
        ``df['SLAM']``, ``df['CD16/32']`` and, unless ``thr_cd48 == -1``,
        ``df['CD48']``. Only the markers on the decision path are read.
    thr_cd48 : number
        CD48 threshold. The value ``-1`` is a sentinel: CD48 is not consulted
        and the cell is labelled ``'SLAM-HSC'`` (drawn with the
        ``'CD48- SLAM-HSC'`` color).
    thr_slam, thr_flt3, thr_sca1, thr_cd1632 : number
        Thresholds for SLAM, Flt3, SCA-1 and CD16/32.
    thr_slam_sca1Neg : number
        SLAM threshold used in the branch where SCA-1 is below ``thr_sca1``.
    thr_ckitLo, thr_ckitHi : number
        ckit thresholds used when SCA-1 is at/above, respectively below,
        ``thr_sca1``.

    Returns
    -------
    dict
        ``{'class': str, 'color': str, 'rank': int}``; the color is looked up
        in the module-level ``cell_cols``.

    Notes
    -----
    Every path through the decision tree returns one of the ten classes, so
    the former trailing ``'NC'`` fallback could never be reached and has been
    removed.
    NOTE(review): a missing (NaN) marker value compares False against any
    threshold, so such a cell silently follows the "else" side of each test
    -- confirm this is the intended handling.
    """
    def label(name, rank, color_key=None):
        # The color is keyed by the class name unless another key is given.
        return {'class': name,
                'color': cell_cols[name if color_key is None else color_key],
                'rank': rank}

    # ckit at/above the low gate and SCA-1 at/above its threshold.
    if df['ckit'] >= thr_ckitLo and df['SCA-1'] >= thr_sca1:
        if df['Flt3'] < thr_flt3:
            if df['SLAM'] >= thr_slam:
                if thr_cd48 == -1:
                    # No CD48 gate: the two SLAM-HSC classes are not split.
                    return label('SLAM-HSC', 0, color_key='CD48- SLAM-HSC')
                if df['CD48'] < thr_cd48:
                    return label('CD48- SLAM-HSC', 0)
                return label('CD48lo SLAM-HSC', 1)
            return label('ST-HSC', 2)
        if df['SLAM'] < thr_slam:
            return label('MPP', 3)
        return label('SLAM+ Flt3+', 4)

    # ckit at/above the high gate and SCA-1 below its threshold.
    if df['ckit'] >= thr_ckitHi and df['SCA-1'] < thr_sca1:
        if df['CD16/32'] >= thr_cd1632:
            return label('GMP', 5)
        if df['SLAM'] < thr_slam_sca1Neg:
            return label('MEP', 6)
        return label('SLAM+ MEP', 7)

    # Everything else is split on CD16/32 only.
    if df['CD16/32'] >= thr_cd1632:
        return label('MP', 8)
    return label('CD16/32- ckit-', 9)


def cell_class(df):
    """Classify one cell with the thresholds of its experiment and culture time.

    The thresholds are looked up in the global ``dct_thr`` under the key
    ``df.Experiment + '_' + df.Culture_time`` and passed positionally to
    ``cell_class_exp_time``. When that key is absent, a placeholder result
    with class ``'Experiment_or_time_not_found'`` is returned instead.
    """
    exp_time = df.Experiment + '_' + df.Culture_time
    if exp_time not in dct_thr:
        return {'class':'Experiment_or_time_not_found', 'color':'Black', 'rank':100}
    thresholds = dct_thr[exp_time]
    return cell_class_exp_time(df, *thresholds)

    
#(EXPERIMENT SPECIFIC) conds and cond_rule are used later to rename the culture conditions
# conds[0] was glossed as 'Proliferation' and conds[1] as
# 'Proliferation+Differentiation' in the original inline comments.
conds = ['P', 'P+ILs']
def cond_rule(cond):
    """Translate a raw condition code into its short display label.

    'SC2' maps to ``conds[0]`` and 'SC3' to ``conds[1]``; any other value
    yields the string 'NA'.
    """
    for raw_code, short_label in (('SC2', conds[0]), ('SC3', conds[1])):
        if cond == raw_code:
            return short_label
    return 'NA'

#DETERMINES AND PRINTS FAMILIES IN df (DataFrame OF DATA) THAT ARE PHYSICALLY IMPOSSIBLE
#GIVEN 2 CELLS ARISE, IN THE NEXT GENERATION, FROM ONE DIVIDING CELL
def find_impossible_families(df):
    """Find the families whose cell counts cannot stem from one dividing cell.

    Each cell in generation ``g`` is weighted ``2**-g``; the weights are
    summed per family (the "cohort number"). Since one dividing cell yields
    two cells in the next generation, a family whose sum exceeds 1 is
    reported as impossible.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``Family`` column (family identifier) and a numeric
        ``Generation`` column, one row per cell.

    Returns
    -------
    list or numpy.ndarray
        An empty list when every cohort number is <= 1; otherwise the array
        of family identifiers (sorted, as produced by ``np.unique``) whose
        cohort number is > 1. The outcome is also printed.
    """
    # Map every row to the index of its family in a single pass, instead of
    # re-filtering the whole DataFrame once per family.
    fams, fam_idx = np.unique(df.Family, return_inverse=True)
    cell_weight = np.power(2., -np.asarray(df.Generation, dtype=float))
    # Sum the per-cell weights family by family.
    cohort_number = np.bincount(np.ravel(fam_idx), weights=cell_weight,
                                minlength=len(fams))
    if np.all(cohort_number <= 1):
        print('No impossible families detected')
        return []
    impossible_families = fams[cohort_number > 1]
    print('Impossible families:', impossible_families)
    return impossible_families

In [4]:
#IMPORT Gating_matrix
# The first sheet column becomes the index; every remaining column holds the
# thresholds of one "<Experiment>_<Culture_time>" key (the key cell_class
# looks up).
gating_file = path_proj + 'Gating_matrix.xlsx'
df_gating = pd.read_excel(io=gating_file, index_col=0)

#GLOBAL VARIABLE FOR cell_class FUNCTION
# Column name -> array of thresholds. cell_class unpacks the array
# positionally into cell_class_exp_time, so the row order of the sheet must
# match that function's threshold parameters.
dct_thr = {exp_time: thresholds.values for exp_time, thresholds in df_gating.items()}

In [6]:
#IMPORT SORT and SINGLE-CELL (AFTER CULTURE) DATA
def _format_cells(df_raw, experiment_name):
    """Return a formatted copy of one imported table (single-cell or sort).

    HERE THE USER SHOULD DECIDE HOW TO FORMAT THE IMPORTED DATA.
    The 'HSC' original cell type is renamed 'SLAM-HSC', and the columns
    Culture_condition, Experiment, Well_experiment and Family are appended
    (in that order). The input frame is left untouched.
    """
    df_fmt = df_raw.copy()
    df_fmt.loc[df_fmt.Original_cell == 'HSC', 'Original_cell'] = 'SLAM-HSC'
    df_fmt = df_fmt.assign(
        Culture_condition=df_fmt.Condition.map(cond_rule),
        Experiment=experiment_name,
    )
    return df_fmt.assign(
        Well_experiment=lambda d: d.Well + '_' + d.Experiment,
        Family=lambda d: d.Well + d.Color + d.Culture_time + d.Original_cell + d.Experiment,
    )


# Only real '.csv' inputs are listed; the pooled export written at the end of
# this cell is skipped. Sorting makes the processing order (and therefore the
# row order of the pooled files) independent of the file system.
lst_sc_files = sorted(file for file in os.listdir(path_sc)
                      if file.endswith('.csv') and file != 'Pooled_data.csv')
lst_sort_files = sorted(file for file in os.listdir(path_sort)
                        if file.endswith('.csv') and file != 'Pooled_data.csv')
print('There a sort file for every sc', all([file in lst_sort_files for file in lst_sc_files]))

df_sc_lst = []
df_sort_lst = []
for file_name in lst_sc_files:
    print(file_name)

    #(EXPERIMENT SPECIFIC) the experiment name is taken as the first 4 characters
    #of the file name (e.g. 'TT23.csv' -> 'TT23')
    experiment_name = file_name[:4]

    #(EXPERIMENT SPECIFIC) sep=';', decimal=',' are unusual
    # The sort file is expected to carry the same name as the single-cell file.
    df = pd.read_csv(path_sc+file_name, sep=';', decimal=',', header=0)
    dfi = pd.read_csv(path_sort+file_name, sep=';', decimal=',', header=0)

    df = _format_cells(df, experiment_name)
    dfi = _format_cells(dfi, experiment_name)

    # Classify every cultured cell exactly once and spread the result over the
    # three derived columns (the sort table is not classified).
    cell_labels = [cell_class(row) for _, row in df.iterrows()]
    df = df.assign(
        Class=[lab['class'] for lab in cell_labels],
        Cell_color=[lab['color'] for lab in cell_labels],
        Cell_rank=[lab['rank'] for lab in cell_labels],
    )

    #DETECT AND REMOVE IMPOSSIBLE FAMILIES
    # Families flagged in the single-cell table are dropped from both tables.
    impossible_fams = find_impossible_families(df)
    df = df[~(df.Family.isin(impossible_fams))]
    dfi = dfi[~(dfi.Family.isin(impossible_fams))]
    df_sc_lst.append(df)
    df_sort_lst.append(dfi)

df_pool = pd.concat(df_sc_lst, ignore_index=True)
dfi_pool = pd.concat(df_sort_lst, ignore_index=True)

#EXPORT PROCESSED AND POOLED DATA
df_pool.to_csv(path_sc+'Pooled_data.csv', sep=';', decimal=',', index=False)
dfi_pool.to_csv(path_sort+'Pooled_data.csv', sep=';', decimal=',', index=False)

There a sort file for every sc True
TT23.csv
Impossible families: ['A7CF24hMPPTT23' 'A7Vi24hMPPTT23' 'C7Vi24hMPPTT23' 'E7VC24hMPPTT23'
 'F1Vi24hMPPTT23' 'G11CV24hSLAM-HSCTT23' 'H12VC24hSLAM-HSCTT23'
 'H2CV24hMPPTT23' 'H8CV24hMPPTT23']
TT22.csv
Impossible families: ['A1Vi24hMPPTT22' 'B4Vi24hST-HSCTT22' 'C10Vi24hST-HSCTT22'
 'E12Vi48hSLAM-HSCTT22' 'E5VC24hSLAM-HSCTT22' 'F4Vi24hST-HSCTT22'
 'G5Vi48hSLAM-HSCTT22' 'G6Vi48hSLAM-HSCTT22']
TT21.csv
Impossible families: ['A4Vi24hST-HSCTT21' 'A8Vi24hMPPTT21' 'B5CV48hSLAM-HSCTT21'
 'C7CF24hMPPTT21' 'D7Vi48hMPPTT21' 'E11Vi24hSLAM-HSCTT21'
 'E3Vi24hST-HSCTT21' 'E5Vi24hSLAM-HSCTT21' 'G11Vi48hSLAM-HSCTT21'
 'H8CF24hMPPTT21']
