In [1]:
import os.path
import pandas as pd

import re #regular expression matching for removing unwanted columns by name 
import natsort as ns #3rd party package for natural sorting 

In [2]:
def raw_data_cleanup(filename):
    """
    Import a tab-separated RNAseq counts table and reduce it to the FM40 samples.

    Steps performed:
        - reads ``filename`` as tab-separated text
        - drops every column whose name contains "QC"
        - keeps only the remaining columns whose name contains "FM40"
        - orders those FM40 columns with a natural sort of their names
        - re-attaches the descriptor columns "locus_tag" through "translation"
        - sets "locus_tag" as the index

    Progress messages are printed after each step.

    Parameters
    ----------
    filename : str
        Path of the tab-separated counts file. It must contain a
        "locus_tag" column and a "translation" column located after it.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned table indexed by locus tag, or ``None`` (after printing
        a message) when ``filename`` is not an existing file.
    """
    if not os.path.isfile(filename):
        print("{} does not exist in directory. Function was not complete.".format(filename))
        return None

    print("{} was located in the directory".format(filename))

    # import the data
    data0_raw = pd.read_csv(filename, sep="\t")
    print("{} was imported into dataframe".format(filename))

    # removing all QC data
    # (DataFrame.select was removed in pandas 1.0; a boolean column mask
    # passed to .loc performs the same label-based selection)
    not_qc = [not re.search("QC", name) for name in data0_raw.columns]
    data1_noQC = data0_raw.loc[:, not_qc]
    print("QC columns were removed from dataframe")

    # removing all non FM40 data
    is_fm40 = [bool(re.search("FM40", name)) for name in data1_noQC.columns]
    data2_FM40only = data1_noQC.loc[:, is_fm40]
    print("All non FM40 data were removed from dataframe")

    # naturally sorting FM40 data by columns
    cols = list(ns.natsorted(data2_FM40only.columns))
    data3_sorted = data2_FM40only[cols]
    print("All FM40 columns were sorted by timecourse sequence")

    # adding the descriptor columns back to FM40
    qualitative = data0_raw.loc[:, "locus_tag":"translation"]
    data4_sorted = pd.concat([qualitative, data3_sorted], axis=1)

    # setting locus tag to be the index
    data5_index = data4_sorted.set_index("locus_tag")

    print("Clean-up of raw data complete")
    return data5_index
    

In [3]:
# Load the raw counts table and reduce it to the sorted FM40 columns plus descriptors.
# NOTE: raw_data_cleanup returns None when the file is not found, so df1 can be None here.
df1 = raw_data_cleanup("5G_counts.tsv")

5G_counts.tsv was located in the directory
5G_counts.tsv was imported into dataframe
QC columns were removed from dataframe
All non FM40 data were removed from dataframe
All FM40 columns were sorted by timecourse sequence
Clean-up of raw data complete


### Calculating TPM counts 

In [4]:
# FM40 count columns that are converted to TPM further down
# (one column for each time point from T0m to T180m).
columns = [
    '5GB1_FM40_T0m_TR2',
    '5GB1_FM40_T10m_TR3',
    '5GB1_FM40_T20m_TR2',
    '5GB1_FM40_T40m_TR1',
    '5GB1_FM40_T60m_TR1',
    '5GB1_FM40_T90m_TR2',
    '5GB1_FM40_T150m_TR1_remake',
    '5GB1_FM40_T180m_TR1',
]

In [5]:
def TPM_counts(dataframe,
               gene_start,
               gene_stop,
               columns):
    """
    Convert read-count columns to transcripts per million (TPM).

    For every column in ``columns`` the counts are first divided by the gene
    length in kilobases (reads per kilobase, RPK) and then by that column's
    RPK total divided by 1,000,000.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding the coordinate columns and the count columns.
    gene_start : str
        Name of the column containing the gene start coordinate.
    gene_stop : str
        Name of the column containing the gene stop coordinate.
    columns : list of str
        Names of the count columns to be converted to TPM.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` in which the ``columns`` hold TPM values.
        The input frame is left unmodified, so calling the function again
        on the same input gives the same result.
    """
    # gene length in kilobase pairs; the +1 counts both end coordinates
    gene_length_kb = (dataframe[gene_stop] - dataframe[gene_start] + 1) / 1000

    # normalize read counts by gene length (row-wise, aligned on the index)
    RPK = dataframe.loc[:, columns].div(gene_length_kb, axis=0)

    # per-column scaling factor: sum of RPK values / 1,000,000
    per_million = RPK.sum(axis=0) / 1000000

    # divide each column by its own scaling factor (aligned on column labels)
    TPM = RPK.div(per_million, axis=1)

    # work on a copy so the caller's raw counts are not overwritten, and
    # replace the columns outright so integer count columns become float
    result = dataframe.copy()
    result[columns] = TPM

    return result


In [6]:
# Convert the selected FM40 count columns of df1 to TPM.
# NOTE: df1 is rebound to the result, so re-running this cell normalises
# values that have already been normalised.
df1 = TPM_counts(df1,"start_coord","end_coord",columns)

In [7]:
df1

Unnamed: 0_level_0,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM40_T0m_TR2,5GB1_FM40_T10m_TR3,5GB1_FM40_T20m_TR2,5GB1_FM40_T40m_TR1,5GB1_FM40_T60m_TR1,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
MBURv2_100001,conserved protein of unknown function,CDS,,MBURv2,1965161,1965952,Evidence 4 : Homologs of previously reported g...,,24.037381,19.388720,19.472578,20.618024,22.190693,20.189929,23.793199,21.197329
MBURv2_100002,conserved protein of unknown function,CDS,,MBURv2,1966190,1966369,Evidence 4 : Homologs of previously reported g...,,7.344755,6.717352,3.700437,5.487055,3.996920,5.438920,4.915027,4.634447
MBURv2_100003,protein of unknown function,CDS,,MBURv2,1966931,1967041,Evidence 5 : No homology to any previously rep...,,5.293517,3.812551,8.308673,2.372780,3.703710,2.939957,5.977735,0.939415
MBURv2_10001,protein of unknown function,CDS,,MBURv2,116,289,Evidence 5 : No homology to any previously rep...,,14.858356,21.889301,17.962332,17.028791,13.191805,11.878101,11.058811,7.491024
MBURv2_10002,KfrB,CDS,kfrB,MBURv2,497,844,,,89.572248,87.557206,79.063709,99.523823,76.788120,104.245961,95.143906,105.923083
MBURv2_10003,Protein traN,CDS,,MBURv2,875,1594,,,113.068430,88.417142,98.559710,86.786918,104.395747,100.544474,111.386799,95.223404
MBURv2_10004,Protein TraM,CDS,traM,MBURv2,1631,2071,,,25.648352,24.401808,24.747034,26.128833,25.869279,27.132933,25.879122,21.871753
MBURv2_10005,Protein TraL,CDS,traL,MBURv2,2071,2796,,,22.297302,20.984785,19.619608,18.773890,21.282302,23.224037,22.635527,23.842506
MBURv2_10006,Protein TraK,CDS,traK,MBURv2,2796,3176,,,36.396059,26.023126,22.323652,44.069260,24.098416,39.400048,36.049981,41.600548
MBURv2_10007,Protein TraJ,CDS,traJ,MBURv2,3508,3876,,,20.143340,14.745406,11.247106,24.267950,11.512615,21.225053,18.401381,22.889647


In [None]:
type(df1)

In [None]:
# NOTE(review): df1 was already rebound to the output of TPM_counts earlier in the
# notebook, so this call normalises the TPM values a second time — confirm that is intended.
strengths = TPM_counts(df1,"start_coord","end_coord",columns)

In [None]:
# Toy frame with a repeated index (three rows labelled 1, three labelled 2).
df_A = pd.DataFrame(
    {
        "A": list(range(1, 7)),
        "B": list(range(10, 70, 10)),
    },
    index=[1] * 3 + [2] * 3,
)

In [None]:
df_A

In [None]:
# Toy single-column frame of divisors, one per index label.
df_B = pd.Series([0.5, 10], index=[1, 2], name="divide_by").to_frame()

In [None]:
df_B

In [None]:
# Divide df_A by df_B with axis=0.
# NOTE(review): df_B's only column is "divide_by" while df_A has "A" and "B";
# DataFrame-by-DataFrame arithmetic aligns on column labels as well as the index —
# confirm whether df_B["divide_by"] (a Series) was the intended divisor.
result = df_A.div(df_B, axis = 0)

In [None]:
result