In [1]:
import os.path
import pandas as pd
import numpy as np

import hdbscan
import time
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances

import re #regular expression matching for removing unwanted columns by name 
import natsort as ns #3rd party package for natural sorting 

In [2]:
def raw_data_cleanup(filename):
    
    """
    Imports RNAseq .csv file and does basic clean up of "FM40" 
        -sorts FM40 timecourse sequence chronologically
        -removes all QC data and non FM40 columns
        -returns dataframe with locus tag set as index
    """

    if os.path.isfile(filename):
        print("{} was located in the directory".format(filename))
                
#import the data
        data0_raw = pd.read_csv(filename, sep = "\t") 
        print("{} was imported into dataframe".format(filename))
        
#removing all QC data
        data1_noQC = data0_raw.select(lambda x: not re.search("QC", x), axis = 1) 
        print("QC columns were removed from dataframe")

#removing all non FM40 data
        data2_FM40only = data1_noQC.select(lambda x: re.search("FM40", x), axis = 1)
        print("All non FM40 data were removed from dataframe")
        
#naturally sorting FM40 data by columns 
        cols = list(ns.natsorted(data2_FM40only.columns))
        data3_sorted=data2_FM40only[cols]
        print("All FM40 columns were sorted by timecourse sequence")
        
#adding the descriptor columns back to FM40
        qualitative = data0_raw.loc[:,"locus_tag":"translation"]
        data4_sorted = pd.concat([qualitative, data3_sorted], axis = 1)

#setting locus tag to be the index
        data5_index = data4_sorted.set_index("locus_tag")
        
    
        print("Clean-up of raw data complete")
        return data5_index
        
    else:
        print("{} does not exist in directory. Function was not complete.".format(filename))
        return
    

In [3]:
def TPM_counts(dataframe, 
              gene_start,
              gene_stop,
              columns):
    
    """
    TPM_counts(dataframe, gene_start, gene_stop, columns):

    returns a dataframe with TPM instead of reads

    Parameters
    ----------
    daraframe = dataframe object variable
    gene_start = string with column name containing gene start coordinate
    gene_stop = string with column name containing gene stop coordinate
    columns = list of strings of column names to be converted to TPM


    Run the following two lines to properly execute this function:

    columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
           '5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']

    TPM_counts(df,"start_coord","end_coord",columns)
    """
    
    #create empty dataframe 
    gene_length = pd.DataFrame()
    
    #gene length in kilo base pairs as new column
    gene_length["gene_length"] = (dataframe[gene_stop]- dataframe[gene_start] + 1)/1000   
    
    #normalize read counts by gene length in kilo base pairs
    RPK = dataframe.loc[:,columns].div(gene_length.gene_length, axis=0) 
    
    #creating a series with the sums of each FM40 column / 1,000,000
    norm_sum = RPK.sum(axis=0)/1000000 
    norm_sum1 = pd.Series.to_frame(norm_sum)
    norm_sum2 = norm_sum1.T
    
    #dividing by the the total transcript counts in each repicate
    TPM = RPK.div(norm_sum2.ix[0]) 
    
    dataframe.loc[:,columns] = TPM
    
    return dataframe 



### Log2 fold transfrom of the TPM data.

In [4]:
def log_2_transform(dataframe,
                    first_data_column,
                    last_data_column):
                  
    """
    log_2_transform(dataframe, 
                    first_data_column, 
                    last_data_column)
    
    Return a new dataframe with the range of data columns log2 transformed. 
    *all zero values are changed to 1 (yield 0 after transform)
    *all values less than 1 are changed to 1 (yield 0 after transform)
    
    Parameters
    ----------
    daraframe = dataframe object variable
    first_data_column = first column that contains actual data (first non categorical)
    last_data_column = last column taht contains actual data (last non categorigal column)

    Run the following to execute the function for Cu transition dataset. 

    log_2_transform(df, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1") 
    
    """
    
    df_data = dataframe.loc[:,first_data_column:last_data_column] #isolate the data
    
    df_data = df_data.replace(0,1) #replace all zeros with 1s
    
    df_data[df_data<1] = 1 #replace all values less than 1 with 1
    
    df_data_log2 = df_data.apply(np.log2)
    
    return df_data_log2

### Mean center the data 

In [5]:
def mean_center(df, first_data_column, last_data_column):       
        
        
    """
    mean_center(dataframe, 
                first_data_column, 
                last_data_column)
    
    Return a new dataframe with the range of data columns log2 transformed. 
    
    Parameters
    ----------
    daraframe = dataframe object variable
    first_data_column = first column that contains actual data (first non categorical)
    last_data_column = last column taht contains actual data (last non categorigal column)

    Run the following to execute the function for Cu transition dataset. 

    mean_center(df, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1") 
    
    """

    df2_TPM_values = df2_TPM.loc[:,first_data_column:last_data_column] #isolating the data values 
    df2_TPM_values_T = df2_TPM_values.T #transposing the data

    standard_scaler = StandardScaler(with_std=False)
    TPM_counts_mean_centered = standard_scaler.fit_transform(df2_TPM_values_T) #mean centering the data 

    TPM_counts_mean_centered = pd.DataFrame(TPM_counts_mean_centered) #back to Dataframe

    #transposing back to original form and reincerting indeces and columns 
    my_index = df2_TPM_values.index
    my_columns = df2_TPM_values.columns

    TPM_counts_mean_centered = TPM_counts_mean_centered.T
    TPM_counts_mean_centered.set_index(my_index, inplace=True)
    TPM_counts_mean_centered.columns = my_columns
    
    return TPM_counts_mean_centered

### Pariwise distance metric table - euclidean distance 

In [6]:
def euclidean_distance(dataframe, first_data_column, last_data_column):
    
    """
    euclidean_distance(dataframe, 
                first_data_column, 
                last_data_column)
    
    Return a new dataframe - pairwise distance metric table, euclidean distance between every pair of rows. 
    
    Parameters
    ----------
    daraframe = dataframe object variable
    first_data_column = first column that contains actual data (first non categorical)
    last_data_column = last column taht contains actual data (last non categorigal column)

    Run the following to execute the function for Cu transition dataset. 

    euclidean_distance(df, "5GB1_FM40_T0m_TR2", "5GB1_FM40_T180m_TR1") 
    
    """
    
    
    df_values = dataframe.loc[:,first_data_column:last_data_column] #isolating the data values 

    df_euclidean_distance = pd.DataFrame(euclidean_distances(df_values))

    my_index = dataframe.index
    
    df_euclidean_distance = df_euclidean_distance.set_index(my_index)
    df_euclidean_distance.columns = my_index
    
    return df_euclidean_distance

In [7]:
def congruency_table(df, 
         data_clm_strt, 
         data_clm_stop, 
         step, 
         mask_diagonal=False):
    
    """
    
    corr(df, data_clm_strt, data_clm_stop, step = len(df.columns), mask_diagonal=False)
    
    returns a new datafram - congruency table - a pairwise pearson correlation matrix for every row pair
    
    Parameters
    ----------
    
    df - dataframe argument - recommended to use TPM counts for RNAseq datasets. 
    data_clm_strt = first column that contains data to be processed 
    data_clm_stop = last column that contains data to be processed
    step = length of dataset 
    mask_diagonal = mask diagonal values which shoud come out as 1
    
    
    Run the following lines to execute the function for my data
    
    congruency_table(df1, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1")
    
    """
    
    df = df.loc[:, data_clm_strt: data_clm_stop] #isolating the rows that are relavent to us. 
    df = df.T 
    
    n = df.shape[0]

    def corr_closure(df):
        d = df.values
        sums = d.sum(0, keepdims=True)
        stds = d.std(0, keepdims=True)

        def corr_(k=0, l=10):
            d2 = d.T.dot(d[:, k:l])
            sums2 = sums.T.dot(sums[:, k:l])
            stds2 = stds.T.dot(stds[:, k:l])

            return pd.DataFrame((d2 - sums2 / n) / stds2 / n,
                                df.columns, df.columns[k:l])

        return corr_

    c = corr_closure(df)

    step = min(step, df.shape[1])

    tups = zip(range(0, n, step), range(step, n + step, step))

    corr_table = pd.concat([c(*t) for t in tups], axis=1)

    corr_table = corr_table.fillna(value = 0) #there are 94 NA values - filling them with zeros 
    
    if mask_diagonal:
        np.fill_diagonal(corr_table.values, np.nan)

    return corr_table

In [8]:
df1_raw_FM40 = raw_data_cleanup("5G_counts.tsv")


columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
           '5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']

df2_TPM = TPM_counts(df1_raw_FM40, "start_coord", "end_coord",columns)  #TPM counts
df2_TPM_log2 = log_2_transform(df2_TPM, "5GB1_FM40_T0m_TR2","5GB1_FM40_T180m_TR1") #TPM log 2 transformed 
df2_TPM_mean = mean_center(df2_TPM, "5GB1_FM40_T0m_TR2","5GB1_FM40_T180m_TR1") #TPM mean centered 

df3_pearson_r = congruency_table(df2_TPM, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1", step = df2_TPM.shape[0])
df3_euclidean_mean = euclidean_distance(df2_TPM_mean, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1")
df3_euclidean_log2 = euclidean_distance(df2_TPM_mean, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1" )

print("The shape of the TPM table is ", df2_TPM.shape)
print("The shape of the pearson_r matrix is ", df3_pearson_r.shape)



5G_counts.tsv was located in the directory
5G_counts.tsv was imported into dataframe
QC columns were removed from dataframe
All non FM40 data were removed from dataframe
All FM40 columns were sorted by timecourse sequence
Clean-up of raw data complete
The shape of the TPM table is  (4593, 16)
The shape of the pearson_r matrix is  (4593, 4593)


   ### Clustering pearsons_r with HDBSCAN

In [9]:
# Clustering the pearsons_R with N/A vlaues removed 

hdb_t1 = time.time()
hdb_pearson_r = hdbscan.HDBSCAN(metric = "precomputed", min_cluster_size=10).fit(df3_pearson_r)
hdb_pearson_r_labels = hdb_pearson_r.labels_
hdb_elapsed_time = time.time() - hdb_t1
print("time to cluster", hdb_elapsed_time)

time to cluster 2.9324939250946045


In [10]:
print(np.unique(hdb_pearson_r_labels)) # unique bins, zero is noise
print(np.bincount(hdb_pearson_r_labels[hdb_pearson_r_labels!=-1])) 

[-1  0  1]
[4532   60]


In [11]:
pearson_clusters = {i: np.where(hdb_pearson_r_labels == i)[0] for i in range(2)}

In [12]:
df2_TPM.iloc[pearson_clusters[1],:]

Unnamed: 0_level_0,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM40_T0m_TR2,5GB1_FM40_T10m_TR3,5GB1_FM40_T20m_TR2,5GB1_FM40_T40m_TR1,5GB1_FM40_T60m_TR1,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
MBURv2_20054,conserved protein of unknown function,CDS,,MBURv2,148660,149958,Evidence 4 : Homologs of previously reported g...,,0,0,0,0,0,0,0,0
MBURv2_20174,fragment of bifunctional protein [Includes: pu...,CDS,,MBURv2,276069,276944,Evidence 2b : Function of strongly homologous ...,,0,0,0,0,0,0,0,0
MBURv2_20175,fragment of bifunctional protein [Includes: pu...,CDS,,MBURv2,276946,278265,Evidence 2b : Function of strongly homologous ...,,0,0,0,0,0,0,0,0
MBURv2_210020,protein of unknown function,CDS,,MBURv2,4117369,4117431,Evidence 5 : No homology to any previously rep...,,0,0,0,0,0,0,0,0
MBURv2_210100,conserved protein of unknown function,CDS,,MBURv2,4207166,4207459,Evidence 4 : Homologs of previously reported g...,,0,0,0,0,0,0,0,0
MBURv2_210106,conserved protein of unknown function,CDS,,MBURv2,4215558,4216046,Evidence 4 : Homologs of previously reported g...,,0,0,0,0,0,0,0,0
MBURv2_210193,conserved protein of unknown function,CDS,,MBURv2,4320868,4321203,Evidence 4 : Homologs of previously reported g...,,0,0,0,0,0,0,0,0
MBURv2_210222,fragment of dTDP-4-dehydrorhamnose reductase s...,CDS,rmlD,MBURv2,4355214,4355459,Evidence 2a : Function of homologous gene expe...,,0,0,0,0,0,0,0,0
MBURv2_210223,fragment of dTDP-4-dehydrorhamnose reductase s...,CDS,rmlD,MBURv2,4355477,4356100,Evidence 2a : Function of homologous gene expe...,,0,0,0,0,0,0,0,0
MBURv2_210226,conserved protein of unknown function,CDS,,MBURv2,4357559,4357930,Evidence 4 : Homologs of previously reported g...,,0,0,0,0,0,0,0,0


Looks like there are two clusters, some expression and zero expression across samples.

   ### Clustering mean centered euclidean distance with with HDBSCAN

In [13]:
# Clustering the mean centered euclidean distance of TPM counts 

hdb_t1 = time.time()
hdb_euclidean_mean = hdbscan.HDBSCAN(metric = "precomputed", min_cluster_size=10).fit(df3_euclidean_mean)
hdb_euclidean_mean_labels = hdb_euclidean_mean.labels_
hdb_elapsed_time = time.time() - hdb_t1
print("time to cluster", hdb_elapsed_time)

time to cluster 2.549302101135254


In [14]:
print(np.unique(hdb_euclidean_mean_labels))
print(np.bincount(hdb_euclidean_mean_labels[hdb_euclidean_mean_labels!=-1]))

[-1  0  1]
[26 67]


In [15]:
euclidean_mean_clusters = {i: np.where(hdb_euclidean_mean_labels == i)[0] for i in range(2)}
df2_TPM.iloc[euclidean_mean_clusters[1],:]

Unnamed: 0_level_0,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM40_T0m_TR2,5GB1_FM40_T10m_TR3,5GB1_FM40_T20m_TR2,5GB1_FM40_T40m_TR1,5GB1_FM40_T60m_TR1,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
MBURv2_16s_rRNA_2,ribosomal RNA 16s_rRNA,rRNA,,MBURv2,3349450,3350972,,,0,0,0,0,0,0,0,0
MBURv2_16s_rRNA_3,ribosomal RNA 16s_rRNA,rRNA,,MBURv2,2890691,2892213,,,0,0,0,0,0,0,0,0
MBURv2_190002,Fap1 adhesin (fragment),CDS,,MBURv2,3939333,3942758,,,0,0,0,0,0,0,0,0
MBURv2_190028,protein of unknown function,CDS,,MBURv2,3972750,3972920,Evidence 5 : No homology to any previously rep...,,0,0,0,0,0,0,0,0
MBURv2_190038,protein of unknown function,CDS,,MBURv2,3978954,3979127,Evidence 5 : No homology to any previously rep...,,0,0,0,0,0,0,0,0
MBURv2_20037,fragment of bifunctional protein [Includes: ri...,CDS,rnhA-dnaQ,MBURv2,126207,126650,Evidence 2a : Function of homologous gene expe...,,0,0,0,0,0,0,0,0
MBURv2_20040,fragment of bifunctional protein [Includes: ri...,CDS,rnhA-dnaQ,MBURv2,130762,131481,Evidence 2a : Function of homologous gene expe...,,0,0,0,0,0,0,0,0
MBURv2_20054,conserved protein of unknown function,CDS,,MBURv2,148660,149958,Evidence 4 : Homologs of previously reported g...,,0,0,0,0,0,0,0,0
MBURv2_20174,fragment of bifunctional protein [Includes: pu...,CDS,,MBURv2,276069,276944,Evidence 2b : Function of strongly homologous ...,,0,0,0,0,0,0,0,0
MBURv2_20175,fragment of bifunctional protein [Includes: pu...,CDS,,MBURv2,276946,278265,Evidence 2b : Function of strongly homologous ...,,0,0,0,0,0,0,0,0


Looks like 2 clusters - both with zero expression. 

looks like wether it is a numpy array or pandas dataframe, the result is the same. lets now try to get index of the clustered points. 

   ### Clustering log transformed euclidean distance with with HDBSCAN

In [16]:
# Clustering the log2 transformed euclidean distance of TPM counts 

hdb_t1 = time.time()
hdb_euclidean_log2 = hdbscan.HDBSCAN(metric = "precomputed", min_cluster_size=10).fit(df3_euclidean_log2)
hdb_euclidean_log2_labels = hdb_euclidean_log2.labels_
hdb_elapsed_time = time.time() - hdb_t1
print("time to cluster", hdb_elapsed_time)

time to cluster 3.291189193725586


In [17]:
print(np.unique(hdb_euclidean_log2_labels))
print(np.bincount(hdb_euclidean_log2_labels[hdb_euclidean_log2_labels!=-1]))

[-1  0  1]
[26 67]


In [18]:
euclidean_log2_clusters = {i: np.where(hdb_euclidean_log2_labels == i)[0] for i in range(2)}
df2_TPM.iloc[euclidean_log2_clusters[1],:]

Unnamed: 0_level_0,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM40_T0m_TR2,5GB1_FM40_T10m_TR3,5GB1_FM40_T20m_TR2,5GB1_FM40_T40m_TR1,5GB1_FM40_T60m_TR1,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
MBURv2_16s_rRNA_2,ribosomal RNA 16s_rRNA,rRNA,,MBURv2,3349450,3350972,,,0,0,0,0,0,0,0,0
MBURv2_16s_rRNA_3,ribosomal RNA 16s_rRNA,rRNA,,MBURv2,2890691,2892213,,,0,0,0,0,0,0,0,0
MBURv2_190002,Fap1 adhesin (fragment),CDS,,MBURv2,3939333,3942758,,,0,0,0,0,0,0,0,0
MBURv2_190028,protein of unknown function,CDS,,MBURv2,3972750,3972920,Evidence 5 : No homology to any previously rep...,,0,0,0,0,0,0,0,0
MBURv2_190038,protein of unknown function,CDS,,MBURv2,3978954,3979127,Evidence 5 : No homology to any previously rep...,,0,0,0,0,0,0,0,0
MBURv2_20037,fragment of bifunctional protein [Includes: ri...,CDS,rnhA-dnaQ,MBURv2,126207,126650,Evidence 2a : Function of homologous gene expe...,,0,0,0,0,0,0,0,0
MBURv2_20040,fragment of bifunctional protein [Includes: ri...,CDS,rnhA-dnaQ,MBURv2,130762,131481,Evidence 2a : Function of homologous gene expe...,,0,0,0,0,0,0,0,0
MBURv2_20054,conserved protein of unknown function,CDS,,MBURv2,148660,149958,Evidence 4 : Homologs of previously reported g...,,0,0,0,0,0,0,0,0
MBURv2_20174,fragment of bifunctional protein [Includes: pu...,CDS,,MBURv2,276069,276944,Evidence 2b : Function of strongly homologous ...,,0,0,0,0,0,0,0,0
MBURv2_20175,fragment of bifunctional protein [Includes: pu...,CDS,,MBURv2,276946,278265,Evidence 2b : Function of strongly homologous ...,,0,0,0,0,0,0,0,0
