# Creating matrices representing the difference between branches A and B

#### done before (s. 01_Trajectories_processing): 
Dataframes (stored as CSV) for all windows of the two branches. Each row holds a transcription factor (TF), a list of genes, and the corresponding list of weights representing the interaction between the TF and each gene. This gives a network of TFs, genes, and connections for each branch in each window. Next we look for interactions that differ strongly between branch A and branch B by taking the absolute difference of their weights.

In [2]:
# Input directory: one CSV per branch and window with the columns TF, Genes, Weights.
PATH_TO_DATAFRAMES='data_files/dataframes/'
# Output directory: |weight_A - weight_B| for every TF-gene pair, zeros included.
PATH_TO_ABS_MATRICES='data_files/abs/'
# Output directory: same as above, but entries with difference 0 are left out.
PATH_TO_ABS_MATRICES_WO_0='data_files/abs_wo0/'
# Alias without the second underscore, so that both spellings of the name resolve.
PATH_TO_ABS_MATRICES_WO0=PATH_TO_ABS_MATRICES_WO_0
# Number of windows to process; check against the value used in 01_Trajectories_processing.
NUM_WINDOWS=5

In [3]:
import pandas as pd
import glob
import os

In [4]:
# for all windows

In [5]:
# Collect the base names of the per-window CSV files of each branch.
# Both patterns are restricted to *.csv so that stray files or directories
# starting with "A_" are not picked up, and the lists are sorted so that the
# file chosen for a window does not depend on the file-system listing order.
branchA_csvs=sorted(os.path.basename(path) for path in glob.glob(f"{PATH_TO_DATAFRAMES}A_*.csv"))

branchB_csvs=sorted(os.path.basename(path) for path in glob.glob(f"{PATH_TO_DATAFRAMES}B_*.csv"))

Create those matrices including the entries whose absolute difference is 0, which makes the result files very large. Below is a version where those entries are ignored.

In [6]:
def _find_window_csv(csv_names, window):
    """Return the first file name whose second '_'-separated token equals *window*, else None."""
    for csv_name in csv_names:
        if int(csv_name.split('_')[1]) == window:
            return csv_name
    return None


def _load_branch(csv_name):
    """Read one branch dataframe; a missing file yields an empty frame with the expected columns."""
    if csv_name is None:
        return pd.DataFrame(columns=['TF', 'Genes', 'Weights'])
    return pd.read_csv(f"{PATH_TO_DATAFRAMES}{csv_name}")


def _parse_gene_list(raw):
    """Extract the gene names from a stringified list such as "['g1', 'g2']"."""
    # Splitting on the quote character leaves the quoted names at the odd
    # positions; brackets and separators sit at the even positions.
    return raw.split("'")[1::2]


def _parse_weight_list(raw):
    """Convert a stringified list such as "[0.1, 0.2]" into a list of floats."""
    # Strip both brackets up front so a one-element list ("[0.5]") parses too.
    inner = raw.strip().lstrip('[').rstrip(']').strip()
    if not inner:
        return []
    return [float(token) for token in inner.split(',')]


def _branch_weights(df):
    """Return ({tf: {gene: weight}}, set of all gene names) for one branch dataframe."""
    weights_by_tf = {}
    all_genes = set()
    for tf, genes_raw, weights_raw in zip(df['TF'], df['Genes'], df['Weights']):
        gene_names = _parse_gene_list(genes_raw)
        all_genes.update(gene_names)
        if tf in weights_by_tf:
            # a TF listed more than once: the first row wins
            continue
        weights = _parse_weight_list(weights_raw)
        if len(weights) < len(gene_names):
            raise ValueError(
                f"TF {tf!r}: {len(gene_names)} genes but only {len(weights)} weights"
            )
        per_gene = {}
        for gene, weight in zip(gene_names, weights):
            # a gene listed more than once: the first weight wins
            per_gene.setdefault(gene, weight)
        weights_by_tf[tf] = per_gene
    return weights_by_tf, all_genes


# make sure the output directory exists before the first file is written
os.makedirs(PATH_TO_ABS_MATRICES, exist_ok=True)

for k in range(NUM_WINDOWS):
    print(f'current window: {k}')

    # get the files for the current window (None if a branch has no file for it)
    current_A = _find_window_csv(branchA_csvs, k)
    current_B = _find_window_csv(branchB_csvs, k)

    # continue with next window if both branches have no entries in this window
    if current_A is None and current_B is None:
        continue

    df_a = _load_branch(current_A)
    df_b = _load_branch(current_B)

    # per-branch lookup tf -> gene -> weight, plus the genes seen in each branch
    a_weights, a_gene_set = _branch_weights(df_a)
    b_weights, b_gene_set = _branch_weights(df_b)

    # all genes of the window in alphabetical order
    genes = sorted(a_gene_set | b_gene_set)
    # all tfs of the window in first-seen order (branch A first), so reruns give identical files
    tfs = list(dict.fromkeys([*a_weights, *b_weights]))

    # one row per (tf, gene) pair; a connection missing in a branch counts as weight 0.0
    abs_rows = []
    for tf in tfs:
        tf_a = a_weights.get(tf, {})
        tf_b = b_weights.get(tf, {})
        for gene in genes:
            value = abs(tf_a.get(gene, 0.0) - tf_b.get(gene, 0.0))
            abs_rows.append([tf, gene, value])

    # save the results; naming the columns on the frame keeps the header valid even without rows
    win_dataframe = pd.DataFrame(abs_rows, columns=['TF', 'GENE', 'abs'])
    win_dataframe.to_csv(f"{PATH_TO_ABS_MATRICES}win{k}_abs.csv", index=False)
    
    


current window: 0
current window: 1
current window: 2
current window: 3
current window: 4


Better version: create those matrices but leave out the entries whose absolute difference is 0.

In [10]:
# Output directory for the per-branch, per-window weight files (A_win{k}_weights.csv / B_win{k}_weights.csv).
PATH_TO_WINDOW_WEIGHTS = 'data_files/windows_weights/'

In [18]:
def _find_window_csv(csv_names, window):
    """Return the first file name whose second '_'-separated token equals *window*, else None."""
    for csv_name in csv_names:
        if int(csv_name.split('_')[1]) == window:
            return csv_name
    return None


def _load_branch(csv_name):
    """Read one branch dataframe; a missing file yields an empty frame with the expected columns."""
    if csv_name is None:
        return pd.DataFrame(columns=['TF', 'Genes', 'Weights'])
    return pd.read_csv(f"{PATH_TO_DATAFRAMES}{csv_name}")


def _parse_gene_list(raw):
    """Extract the gene names from a stringified list such as "['g1', 'g2']"."""
    # Splitting on the quote character leaves the quoted names at the odd
    # positions; brackets and separators sit at the even positions.
    return raw.split("'")[1::2]


def _parse_weight_list(raw):
    """Convert a stringified list such as "[0.1, 0.2]" into a list of floats."""
    # Strip both brackets up front so a one-element list ("[0.5]") parses too.
    inner = raw.strip().lstrip('[').rstrip(']').strip()
    if not inner:
        return []
    return [float(token) for token in inner.split(',')]


def _branch_weights(df):
    """Return {tf: {gene: weight}} for one branch dataframe."""
    weights_by_tf = {}
    for tf, genes_raw, weights_raw in zip(df['TF'], df['Genes'], df['Weights']):
        if tf in weights_by_tf:
            # a TF listed more than once: the first row wins
            continue
        gene_names = _parse_gene_list(genes_raw)
        weights = _parse_weight_list(weights_raw)
        if len(weights) < len(gene_names):
            raise ValueError(
                f"TF {tf!r}: {len(gene_names)} genes but only {len(weights)} weights"
            )
        per_gene = {}
        for gene, weight in zip(gene_names, weights):
            # a gene listed more than once: the first weight wins
            per_gene.setdefault(gene, weight)
        weights_by_tf[tf] = per_gene
    return weights_by_tf


# make sure the output directories exist before the first file is written
os.makedirs(PATH_TO_ABS_MATRICES_WO_0, exist_ok=True)
os.makedirs(PATH_TO_WINDOW_WEIGHTS, exist_ok=True)

for k in range(NUM_WINDOWS):
    print(f'current window: {k}')

    # get the files for the current window (None if a branch has no file for it)
    current_A = _find_window_csv(branchA_csvs, k)
    current_B = _find_window_csv(branchB_csvs, k)

    # continue with next window if both branches have no entries in this window
    if current_A is None and current_B is None:
        continue

    # per-branch lookup tf -> gene -> weight
    a_weights = _branch_weights(_load_branch(current_A))
    b_weights = _branch_weights(_load_branch(current_B))

    # all tfs of the window in first-seen order (branch A first), so reruns give identical files
    tfs = list(dict.fromkeys([*a_weights, *b_weights]))

    abs_rows = []
    win_a_rows = []
    win_b_rows = []
    for tf in tfs:
        tf_a = a_weights.get(tf, {})
        tf_b = b_weights.get(tf, {})
        # Only genes connected to this tf in at least one branch can produce a
        # non-zero entry, so the other genes of the window are not visited.
        for gene in sorted(tf_a.keys() | tf_b.keys()):
            a_weight = tf_a.get(gene, 0.0)
            b_weight = tf_b.get(gene, 0.0)
            value = abs(a_weight - b_weight)

            # here is the difference! Only entries where the abs-value is not 0 are saved
            if value != 0.0:
                abs_rows.append([tf, gene, value])
            if a_weight != 0.0:
                win_a_rows.append([tf, gene, a_weight])
            if b_weight != 0.0:
                win_b_rows.append([tf, gene, b_weight])

    # save the results; naming the columns on the frame keeps the header valid even without rows
    win_dataframe = pd.DataFrame(abs_rows, columns=['TF', 'GENE', 'abs'])
    win_dataframe.to_csv(f"{PATH_TO_ABS_MATRICES_WO_0}win{k}_abs.csv", index=False)

    # save the weights in a different file for better readability (TF, GENE, weight) and not as lists of genes and weights
    a_win_df = pd.DataFrame(win_a_rows, columns=['TF', 'GENE', 'weight'])
    b_win_df = pd.DataFrame(win_b_rows, columns=['TF', 'GENE', 'weight'])
    a_win_df.to_csv(f"{PATH_TO_WINDOW_WEIGHTS}A_win{k}_weights.csv", index=False)
    b_win_df.to_csv(f"{PATH_TO_WINDOW_WEIGHTS}B_win{k}_weights.csv", index=False)
    


current window: 0
current window: 1
current window: 2
current window: 3
current window: 4


#### done: 
Matrices with absolute values for each window, representing the difference between the two branches. The larger the abs value, the larger the difference in the TF–gene interaction weight between branch A and B.