In [1]:
import os.path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances

import re #regular expression matching for removing unwanted columns by name 
import natsort as ns #3rd party package for natural sorting 

In [2]:
def raw_data_cleanup(filename):
    """
    Imports a tab-separated RNAseq counts file and does basic clean up of "FM40"
        -removes all QC columns (name contains "QC") and all non FM40 columns
        -sorts the FM40 columns naturally, i.e. in timecourse order
         (T20m before T150m, which a plain string sort would get wrong)
        -adds the descriptor columns "locus_tag" ... "translation" back in front
        -returns dataframe with locus tag set as index

    Parameters
    ----------
    filename = path of the tab-separated file to import

    Returns
    -------
    The cleaned dataframe indexed by "locus_tag", or None (after printing a
    message) when `filename` does not exist.
    """

    # Guard clause: nothing to do when the file is missing.
    if not os.path.isfile(filename):
        print("{} does not exist in directory. Function was not complete.".format(filename))
        return None

    print("{} was located in the directory".format(filename))

    # import the data
    data0_raw = pd.read_csv(filename, sep="\t")
    print("{} was imported into dataframe".format(filename))

    # removing all QC data
    # (DataFrame.select was removed from pandas, so the columns are picked by
    #  matching their names explicitly)
    no_qc_columns = [name for name in data0_raw.columns if not re.search("QC", name)]
    print("QC columns were removed from dataframe")

    # removing all non FM40 data
    fm40_columns = [name for name in no_qc_columns if re.search("FM40", name)]
    print("All non FM40 data were removed from dataframe")

    # naturally sorting FM40 data by columns
    sorted_fm40_columns = list(ns.natsorted(fm40_columns))
    data3_sorted = data0_raw.loc[:, sorted_fm40_columns]
    print("All FM40 columns were sorted by timecourse sequence")

    # adding the descriptor columns back to FM40
    qualitative = data0_raw.loc[:, "locus_tag":"translation"]
    data4_sorted = pd.concat([qualitative, data3_sorted], axis=1)

    # setting locus tag to be the index
    data5_index = data4_sorted.set_index("locus_tag")

    print("Clean-up of raw data complete")
    return data5_index
    

### Calculating TPM counts 

In [3]:
def TPM_counts(dataframe,
              gene_start,
              gene_stop,
              columns):

    """
    TPM_counts(dataframe, gene_start, gene_stop, columns):

    Returns a new dataframe in which the read counts in `columns` are replaced
    by TPM (transcripts per million). All other columns are carried over
    unchanged, and the input dataframe itself is NOT modified, so the cell
    calling this function can safely be re-run.

    TPM is computed per column as:
        RPK = reads / gene length in kilo base pairs
        TPM = RPK / (sum of RPK in that column / 1,000,000)

    Parameters
    ----------
    dataframe = dataframe object variable
    gene_start = string with column name containing gene start coordinate
    gene_stop = string with column name containing gene stop coordinate
    columns = list of strings of column names to be converted to TPM


    Run the following two lines to properly execute this function:

    columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
           '5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']

    TPM_counts(df,"start_coord","end_coord",columns)
    """

    #gene length in kilo base pairs (+ 1 because both coordinates are counted)
    gene_length_kb = (dataframe[gene_stop] - dataframe[gene_start] + 1) / 1000

    #normalize read counts by gene length in kilo base pairs (row-wise division)
    RPK = dataframe.loc[:, columns].div(gene_length_kb, axis=0)

    #per-column scaling factor: sum of each column / 1,000,000
    per_million = RPK.sum(axis=0) / 1000000

    #dividing by the total transcript counts in each replicate;
    #the Series is aligned on column names (replaces the removed `.ix` lookup)
    TPM = RPK.div(per_million, axis=1)

    #work on a copy so the caller's raw counts are left untouched
    result = dataframe.copy()
    result[columns] = TPM

    return result


### Log2 transform of the TPM data.

In [4]:
def log_2_transform(dataframe,
                    first_data_column,
                    last_data_column):

    """
    log_2_transform(dataframe,
                    first_data_column,
                    last_data_column)

    Return a new dataframe holding only the selected range of data columns,
    log2 transformed.
    *every value below 1 (zeros included) is raised to 1 first, so it
     yields 0 after the transform

    Parameters
    ----------
    dataframe = dataframe object variable
    first_data_column = first column that contains actual data (first non categorical column)
    last_data_column = last column that contains actual data (last non categorical column)

    Run the following to execute the function for Cu transition dataset.

    log_2_transform(df, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1")

    """

    #isolate the data columns (label slice, both ends inclusive)
    counts = dataframe.loc[:, first_data_column:last_data_column]

    #floor everything at 1 so that log2 never sees zero or a sub-unit value
    floored = counts.clip(lower=1)

    return np.log2(floored)

### Mean centering the TPM data! 

In [5]:
def mean_center(df, first_data_column, last_data_column):

    """
    mean_center(df,
                first_data_column,
                last_data_column)

    Return a new dataframe holding only the selected range of data columns,
    mean centered row by row: the mean of each row (taken across the selected
    columns) is subtracted from every value in that row, so each row of the
    result averages to zero. Index and column labels are preserved.

    Parameters
    ----------
    df = dataframe object variable
    first_data_column = first column that contains actual data (first non categorical column)
    last_data_column = last column that contains actual data (last non categorical column)

    Run the following to execute the function for Cu transition dataset.

    mean_center(df, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1")

    """

    #isolating the data values of the dataframe that was passed in
    #(the function must not reach for a notebook-level variable here)
    data_values = df.loc[:, first_data_column:last_data_column]

    #mean of every row across the selected columns
    row_means = data_values.mean(axis=1)

    #subtract each row's mean from that row; labels carry over automatically
    mean_centered = data_values.sub(row_means, axis=0)

    return mean_centered

### Pairwise distance metric table, euclidean distance

In [6]:
def euclidean_distance(dataframe, first_data_column, last_data_column):

    """
    euclidean_distance(dataframe,
                first_data_column,
                last_data_column)

    Return a new dataframe - a square pairwise distance table holding the
    euclidean distance between every pair of rows, computed over the selected
    range of data columns. Both the index and the columns of the result are
    the index of the input dataframe.

    Parameters
    ----------
    dataframe = dataframe object variable
    first_data_column = first column that contains actual data (first non categorical column)
    last_data_column = last column that contains actual data (last non categorical column)

    Run the following to execute the function for Cu transition dataset.

    euclidean_distance(df, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1")

    """

    #isolating the data values (label slice, both ends inclusive)
    data_values = dataframe.loc[:, first_data_column:last_data_column]

    #the same labels are used for both axes of the distance table
    row_labels = dataframe.index

    return pd.DataFrame(euclidean_distances(data_values),
                        index=row_labels,
                        columns=row_labels)

### Calculating the congruency table - Pearson correlation across every pair of rows

In [19]:
def congruency_table(df,
         data_clm_strt,
         data_clm_stop,
         step=None,
         mask_diagonal=False):

    """

    congruency_table(df, data_clm_strt, data_clm_stop, step=None, mask_diagonal=False)

    returns a new dataframe - congruency table - a pairwise pearson correlation matrix for every row pair

    The matrix is computed in column chunks of width `step` and the chunks are
    put side by side; the result does not depend on `step`.

    Correlations that are undefined because a row has zero variance across the
    selected columns are reported as 0 (this includes that row's own diagonal
    entry).

    Parameters
    ----------

    df - dataframe argument - recommended to use TPM counts for RNAseq datasets.
    data_clm_strt = first column that contains data to be processed
    data_clm_stop = last column that contains data to be processed
    step = number of rows of df correlated per chunk; None (default) or any
           value larger than the number of rows processes everything at once
    mask_diagonal = replace the diagonal values (which should come out as 1) with NaN


    Run the following lines to execute the function for my data

    congruency_table(df1, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1")

    """

    #isolating the columns that are relevant to us, then transposing so that
    #every row of df becomes one column (one series) of `observations`
    observations = df.loc[:, data_clm_strt: data_clm_stop].T
    labels = observations.columns

    #n_obs = data points per series, n_series = number of series (rows of df)
    n_obs, n_series = observations.shape

    if n_series == 0:
        return pd.DataFrame(index=labels, columns=labels, dtype=float)

    if step is None:
        step = n_series
    step = max(1, min(int(step), n_series))

    d = np.asarray(observations, dtype=float)
    sums = d.sum(0, keepdims=True)
    stds = d.std(0, keepdims=True)

    chunks = []
    #zero-variance series divide by zero; that is expected and handled below
    with np.errstate(divide="ignore", invalid="ignore"):
        #chunk over the SERIES axis (not over the number of observations),
        #otherwise columns are dropped whenever step < n_series
        for k in range(0, n_series, step):
            l = k + step
            d2 = d.T.dot(d[:, k:l])
            sums2 = sums.T.dot(sums[:, k:l])
            stds2 = stds.T.dot(stds[:, k:l])
            #population covariance divided by the product of population stds
            chunks.append((d2 - sums2 / n_obs) / stds2 / n_obs)

    corr = np.hstack(chunks)

    #undefined correlations (zero-variance series) are filled with zeros
    corr[~np.isfinite(corr)] = 0.0

    if mask_diagonal:
        #mask on the array we own; writing through DataFrame.values may hit a copy
        np.fill_diagonal(corr, np.nan)

    return pd.DataFrame(corr, index=labels, columns=labels)

### Script to tie it all together 

In [8]:
# Load the raw counts and keep only the naturally sorted FM40 time course.
df1_raw_FM40 = raw_data_cleanup("5G_counts.tsv")


# FM40 data columns in timecourse order; the first and last entries delimit
# the data range handed to every transform below.
columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
           '5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']
first_data_column = columns[0]
last_data_column = columns[-1]

df2_TPM = TPM_counts(df1_raw_FM40, "start_coord", "end_coord",columns)  #TPM counts
df2_TPM_log2 = log_2_transform(df2_TPM, first_data_column, last_data_column) #TPM log 2 transformed
df2_TPM_mean = mean_center(df2_TPM, first_data_column, last_data_column) #TPM mean centered

df3_pearson_r = congruency_table(df2_TPM, first_data_column, last_data_column, step = df2_TPM.shape[0])
df3_euclidean_mean = euclidean_distance(df2_TPM_mean, first_data_column, last_data_column)
# distances on the log2 table (this previously re-used the mean centered table)
df3_euclidean_log2 = euclidean_distance(df2_TPM_log2, first_data_column, last_data_column)

print("The shape of the TPM table is ", df2_TPM.shape)
print("The shape of the pearson_r matrix is ", df3_pearson_r.shape)




5G_counts.tsv was located in the directory
5G_counts.tsv was imported into dataframe
QC columns were removed from dataframe
All non FM40 data were removed from dataframe
All FM40 columns were sorted by timecourse sequence
Clean-up of raw data complete
The shape of the TPM table is  (4593, 16)
The shape of the pearson_r matrix is  (4593, 4593)


In [12]:

# Display the pairwise Pearson correlation (congruency) table built by congruency_table.
df3_pearson_r

locus_tag,MBURv2_100001,MBURv2_100002,MBURv2_100003,MBURv2_10001,MBURv2_10002,MBURv2_10003,MBURv2_10004,MBURv2_10005,MBURv2_10006,MBURv2_10007,...,MBURv2_tRNA40,MBURv2_tRNA41,MBURv2_tRNA42,MBURv2_tRNA43,MBURv2_tRNA44,MBURv2_tRNA5,MBURv2_tRNA6,MBURv2_tRNA7,MBURv2_tRNA8,MBURv2_tRNA9
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MBURv2_100001,1.000000,0.217566,0.109381,-0.474350,-0.003526,0.827677,0.169594,0.397942,0.241800,0.176126,...,-0.112140,-0.085500,0.174863,-0.087777,0.040370,-0.081708,-0.387625,0.253373,-0.292786,
MBURv2_100002,0.217566,1.000000,-0.161280,0.338414,0.227931,0.038144,0.145587,0.125226,0.295602,0.389702,...,-0.094513,-0.001884,-0.271298,-0.316040,-0.245158,-0.101177,-0.066556,-0.233483,0.089748,
MBURv2_100003,0.109381,-0.161280,1.000000,0.381790,-0.655566,0.448473,0.255266,-0.329347,-0.616879,-0.637410,...,-0.643614,-0.672892,-0.326025,-0.354793,-0.557736,-0.613665,-0.722687,-0.451046,-0.638001,
MBURv2_10001,-0.474350,0.338414,0.381790,1.000000,-0.521635,-0.433470,0.201369,-0.746353,-0.506753,-0.418380,...,-0.587785,-0.493777,-0.646774,-0.687997,-0.786601,-0.576052,-0.455976,-0.800309,-0.345347,
MBURv2_10002,-0.003526,0.227931,-0.655566,-0.521635,1.000000,-0.205316,-0.127211,0.481309,0.911057,0.914927,...,0.899117,0.824316,0.664629,0.669732,0.759307,0.785206,0.755740,0.517857,0.882926,
MBURv2_10003,0.827677,0.038144,0.448473,-0.433470,-0.205316,1.000000,0.290024,0.479451,-0.081668,-0.155564,...,-0.284851,-0.244719,0.041720,-0.050825,0.045273,-0.151867,-0.361596,0.295548,-0.386629,
MBURv2_10004,0.169594,0.145587,0.255266,0.201369,-0.127211,0.290024,1.000000,-0.243709,0.021500,-0.030006,...,-0.269599,0.008931,-0.110992,-0.486916,-0.013805,0.007565,-0.081912,0.055530,0.023618,
MBURv2_10005,0.397942,0.125226,-0.329347,-0.746353,0.481309,0.479451,-0.243709,1.000000,0.311983,0.288824,...,0.493105,0.480835,0.481567,0.668817,0.681721,0.569885,0.496027,0.788658,0.382163,
MBURv2_10006,0.241800,0.295602,-0.616879,-0.506753,0.911057,-0.081668,0.021500,0.311983,1.000000,0.989565,...,0.718028,0.648873,0.530855,0.397743,0.585765,0.577010,0.518557,0.360313,0.680269,
MBURv2_10007,0.176126,0.389702,-0.637410,-0.418380,0.914927,-0.155564,-0.030006,0.288824,0.989565,1.000000,...,0.696449,0.619194,0.457541,0.366505,0.521468,0.535687,0.519063,0.280290,0.680160,


In [17]:
# Select the FM40 data columns and transpose, so every locus tag becomes a column
# (pandas' DataFrame.corr correlates columns, not rows).
df2_TPM_values = df2_TPM.loc[:,"5GB1_FM40_T0m_TR2":"5GB1_FM40_T180m_TR1"].T

In [18]:
# Cross-check: pandas' built-in pairwise column correlation (Pearson by default).
# The values displayed below match the df3_pearson_r table shown earlier.
df2_TPM_values.corr()

locus_tag,MBURv2_100001,MBURv2_100002,MBURv2_100003,MBURv2_10001,MBURv2_10002,MBURv2_10003,MBURv2_10004,MBURv2_10005,MBURv2_10006,MBURv2_10007,...,MBURv2_tRNA40,MBURv2_tRNA41,MBURv2_tRNA42,MBURv2_tRNA43,MBURv2_tRNA44,MBURv2_tRNA5,MBURv2_tRNA6,MBURv2_tRNA7,MBURv2_tRNA8,MBURv2_tRNA9
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MBURv2_100001,1.000000,0.217566,0.109381,-0.474350,-0.003526,0.827677,0.169594,0.397942,0.241800,0.176126,...,-0.112140,-0.085500,0.174863,-0.087777,0.040370,-0.081708,-0.387625,0.253373,-0.292786,
MBURv2_100002,0.217566,1.000000,-0.161280,0.338414,0.227931,0.038144,0.145587,0.125226,0.295602,0.389702,...,-0.094513,-0.001884,-0.271298,-0.316040,-0.245158,-0.101177,-0.066556,-0.233483,0.089748,
MBURv2_100003,0.109381,-0.161280,1.000000,0.381790,-0.655566,0.448473,0.255266,-0.329347,-0.616879,-0.637410,...,-0.643614,-0.672892,-0.326025,-0.354793,-0.557736,-0.613665,-0.722687,-0.451046,-0.638001,
MBURv2_10001,-0.474350,0.338414,0.381790,1.000000,-0.521635,-0.433470,0.201369,-0.746353,-0.506753,-0.418380,...,-0.587785,-0.493777,-0.646774,-0.687997,-0.786601,-0.576052,-0.455976,-0.800309,-0.345347,
MBURv2_10002,-0.003526,0.227931,-0.655566,-0.521635,1.000000,-0.205316,-0.127211,0.481309,0.911057,0.914927,...,0.899117,0.824316,0.664629,0.669732,0.759307,0.785206,0.755740,0.517857,0.882926,
MBURv2_10003,0.827677,0.038144,0.448473,-0.433470,-0.205316,1.000000,0.290024,0.479451,-0.081668,-0.155564,...,-0.284851,-0.244719,0.041720,-0.050825,0.045273,-0.151867,-0.361596,0.295548,-0.386629,
MBURv2_10004,0.169594,0.145587,0.255266,0.201369,-0.127211,0.290024,1.000000,-0.243709,0.021500,-0.030006,...,-0.269599,0.008931,-0.110992,-0.486916,-0.013805,0.007565,-0.081912,0.055530,0.023618,
MBURv2_10005,0.397942,0.125226,-0.329347,-0.746353,0.481309,0.479451,-0.243709,1.000000,0.311983,0.288824,...,0.493105,0.480835,0.481567,0.668817,0.681721,0.569885,0.496027,0.788658,0.382163,
MBURv2_10006,0.241800,0.295602,-0.616879,-0.506753,0.911057,-0.081668,0.021500,0.311983,1.000000,0.989565,...,0.718028,0.648873,0.530855,0.397743,0.585765,0.577010,0.518557,0.360313,0.680269,
MBURv2_10007,0.176126,0.389702,-0.637410,-0.418380,0.914927,-0.155564,-0.030006,0.288824,0.989565,1.000000,...,0.696449,0.619194,0.457541,0.366505,0.521468,0.535687,0.519063,0.280290,0.680160,
