# Time Course Transcriptomics for Cu-Induced transition in 5GB1

In [1]:
import pandas as pd
import natsort as ns #3rd party package for natural sorting 
import re

In [2]:
# Load the raw count table; the file is tab-separated, so the separator is given explicitly.
data = pd.read_csv("5G_counts.tsv", sep="\t")

In [3]:
# Column positions to keep: 0-8 plus 20-41 (range() excludes its end value, so 42 is not included).
# Everything in between (the old FM runs) is dropped.
columns_list = [*range(9), *range(20, 42)]
data_1 = data.iloc[:, columns_list]

In [4]:
# Goal: put the sample columns (position 9 onward in data_1) into natural order.
# Approach: split the frame in two, sort the second part, and re-join it in a later cell.

first_8 = data_1.iloc[:, :9]        # the first 9 columns (positions 0-8), left untouched
remaining_data = data_1.iloc[:, 9:]  # every remaining column; these are the ones to sort

# natsorted() already returns a list, ordering embedded numbers numerically (T40m before T150m).
cols = ns.natsorted(remaining_data.columns)
newdf = remaining_data[cols]


In [5]:
# Re-attach the naturally sorted sample columns to the right of the first 9 columns.
data_2 = pd.concat([first_8, newdf], axis="columns")

In [6]:
# Many of the remaining columns are QC runs; list the names so they can be filtered out next
# (see the aside on removing columns with "QC" in their name).
data_2.columns.tolist()

['locus_tag',
 'product',
 'type',
 'gene_symbol',
 'locus',
 'start_coord',
 'end_coord',
 'note',
 'translation',
 '5GB1_FM34_T0_TR1_QC',
 '5GB1_FM34_T3_TR3_QC',
 '5GB1_FM34_T4_TR3_QC',
 '5GB1_FM34_T5_TR2_QC',
 '5GB1_FM34_T6_TR3_QC',
 '5GB1_FM34_T7_TR3_QC',
 '5GB1_FM34_T8_TR1_QC',
 '5GB1_FM40_T0_TR1_QC',
 '5GB1_FM40_T0m_TR2',
 '5GB1_FM40_T10m_TR3',
 '5GB1_FM40_T10m_TR3_QC',
 '5GB1_FM40_T20m_TR2',
 '5GB1_FM40_T40m_TR1',
 '5GB1_FM40_T40m_TR1_QC',
 '5GB1_FM40_T60m_TR1',
 '5GB1_FM40_T60m_TR1_QC',
 '5GB1_FM40_T90m_TR2',
 '5GB1_FM40_T90m_TR2_QC',
 '5GB1_FM40_T150m_TR1_QC',
 '5GB1_FM40_T150m_TR1_remake',
 '5GB1_FM40_T180m_TR1',
 '5GB1_FM40_T180m_TR1_QC']

In [7]:
# Keep only the columns whose name does not contain "QC".
# DataFrame.select() was deprecated in pandas 0.21 and removed in 1.0, so a boolean mask over the
# column labels is passed to .loc instead (True = no "QC" in the label, so the column is kept).
# .copy() makes data_3 an independent frame, so adding a column to it later cannot trigger
# SettingWithCopyWarning.
data_3 = data_2.loc[:, [re.search("QC", label) is None for label in data_2.columns]].copy()

In [8]:
# Display the filtered table to confirm that no QC columns remain.
data_3

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM40_T0m_TR2,5GB1_FM40_T10m_TR3,5GB1_FM40_T20m_TR2,5GB1_FM40_T40m_TR1,5GB1_FM40_T60m_TR1,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1
0,MBURv2_100001,conserved protein of unknown function,CDS,,MBURv2,1965161,1965952,Evidence 4 : Homologs of previously reported g...,,648,254,301,248,513,294,852,322
1,MBURv2_100002,conserved protein of unknown function,CDS,,MBURv2,1966190,1966369,Evidence 4 : Homologs of previously reported g...,,45,20,13,15,21,18,40,16
2,MBURv2_100003,protein of unknown function,CDS,,MBURv2,1966931,1967041,Evidence 5 : No homology to any previously rep...,,20,7,18,4,12,6,30,2
3,MBURv2_10001,protein of unknown function,CDS,,MBURv2,116,289,Evidence 5 : No homology to any previously rep...,,88,63,61,45,67,38,87,25
4,MBURv2_10002,KfrB,CDS,kfrB,MBURv2,497,844,,,1061,504,537,526,780,667,1497,707
5,MBURv2_10003,Protein traN,CDS,,MBURv2,875,1594,,,2771,1053,1385,949,2194,1331,3626,1315
6,MBURv2_10004,Protein TraM,CDS,traM,MBURv2,1631,2071,,,385,178,213,175,333,220,516,185
7,MBURv2_10005,Protein TraL,CDS,traL,MBURv2,2071,2796,,,551,252,278,207,451,310,743,332
8,MBURv2_10006,Protein TraK,CDS,traK,MBURv2,2796,3176,,,472,164,166,255,268,276,621,304
9,MBURv2_10007,Protein TraJ,CDS,traJ,MBURv2,3508,3876,,,253,90,81,136,124,144,307,162


In [9]:
# First step towards TPM: compute each gene's length in kilobases.
# Length = end_coord - start_coord + 1 (both coordinates are counted), divided by 1000.
# assign() returns a new frame rather than writing a column into a frame that was itself derived
# from a selection, which avoids pandas' SettingWithCopyWarning and keeps this cell re-runnable.
data_3 = data_3.assign(gene_length=(data_3["end_coord"] - data_3["start_coord"] + 1) / 1000)

In [10]:
# Display the table again; gene_length should now be the last column.
data_3

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM40_T0m_TR2,5GB1_FM40_T10m_TR3,5GB1_FM40_T20m_TR2,5GB1_FM40_T40m_TR1,5GB1_FM40_T60m_TR1,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1,gene_length
0,MBURv2_100001,conserved protein of unknown function,CDS,,MBURv2,1965161,1965952,Evidence 4 : Homologs of previously reported g...,,648,254,301,248,513,294,852,322,0.792
1,MBURv2_100002,conserved protein of unknown function,CDS,,MBURv2,1966190,1966369,Evidence 4 : Homologs of previously reported g...,,45,20,13,15,21,18,40,16,0.180
2,MBURv2_100003,protein of unknown function,CDS,,MBURv2,1966931,1967041,Evidence 5 : No homology to any previously rep...,,20,7,18,4,12,6,30,2,0.111
3,MBURv2_10001,protein of unknown function,CDS,,MBURv2,116,289,Evidence 5 : No homology to any previously rep...,,88,63,61,45,67,38,87,25,0.174
4,MBURv2_10002,KfrB,CDS,kfrB,MBURv2,497,844,,,1061,504,537,526,780,667,1497,707,0.348
5,MBURv2_10003,Protein traN,CDS,,MBURv2,875,1594,,,2771,1053,1385,949,2194,1331,3626,1315,0.720
6,MBURv2_10004,Protein TraM,CDS,traM,MBURv2,1631,2071,,,385,178,213,175,333,220,516,185,0.441
7,MBURv2_10005,Protein TraL,CDS,traL,MBURv2,2071,2796,,,551,252,278,207,451,310,743,332,0.726
8,MBURv2_10006,Protein TraK,CDS,traK,MBURv2,2796,3176,,,472,164,166,255,268,276,621,304,0.381
9,MBURv2_10007,Protein TraJ,CDS,traJ,MBURv2,3508,3876,,,253,90,81,136,124,144,307,162,0.369


In [11]:
# Sanity check before normalising: summary statistics (count, mean, std, min, quartiles, max)
# of the gene_length column.
data_3["gene_length"].describe()

count    4593.000000
mean        0.920235
std         0.777701
min         0.063000
25%         0.390000
50%         0.738000
75%         1.218000
max        10.320000
Name: gene_length, dtype: float64

In [12]:
# Find the positional indices needed for the division in the next cell:
#   gene_length        -> the divisor column
#   5GB1_FM40_T0m_TR2  -> the first count column, i.e. where the range to divide starts
# so the operation is columns [9-16] / column [17].
for column_name in ("gene_length", "5GB1_FM40_T0m_TR2"):
    print(data_3.columns.get_loc(column_name))

17
9


In [13]:
# Reads per kilobase: divide every count column row-wise by that gene's length.
# The slice is 9:17 because the end of a positional slice is exclusive (columns 9-16 are taken).
RPK = data_3.iloc[:, 9:17].divide(data_3["gene_length"], axis="index")

In [14]:
# Put the first 9 columns back in front of the length-normalised counts.
data_4 = pd.concat([first_8, RPK], axis="columns")

In [15]:
# Display the reads-per-kilobase table.
data_4

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM40_T0m_TR2,5GB1_FM40_T10m_TR3,5GB1_FM40_T20m_TR2,5GB1_FM40_T40m_TR1,5GB1_FM40_T60m_TR1,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1
0,MBURv2_100001,conserved protein of unknown function,CDS,,MBURv2,1965161,1965952,Evidence 4 : Homologs of previously reported g...,,818.181818,320.707071,380.050505,313.131313,647.727273,371.212121,1075.757576,406.565657
1,MBURv2_100002,conserved protein of unknown function,CDS,,MBURv2,1966190,1966369,Evidence 4 : Homologs of previously reported g...,,250.000000,111.111111,72.222222,83.333333,116.666667,100.000000,222.222222,88.888889
2,MBURv2_100003,protein of unknown function,CDS,,MBURv2,1966931,1967041,Evidence 5 : No homology to any previously rep...,,180.180180,63.063063,162.162162,36.036036,108.108108,54.054054,270.270270,18.018018
3,MBURv2_10001,protein of unknown function,CDS,,MBURv2,116,289,Evidence 5 : No homology to any previously rep...,,505.747126,362.068966,350.574713,258.620690,385.057471,218.390805,500.000000,143.678161
4,MBURv2_10002,KfrB,CDS,kfrB,MBURv2,497,844,,,3048.850575,1448.275862,1543.103448,1511.494253,2241.379310,1916.666667,4301.724138,2031.609195
5,MBURv2_10003,Protein traN,CDS,,MBURv2,875,1594,,,3848.611111,1462.500000,1923.611111,1318.055556,3047.222222,1848.611111,5036.111111,1826.388889
6,MBURv2_10004,Protein TraM,CDS,traM,MBURv2,1631,2071,,,873.015873,403.628118,482.993197,396.825397,755.102041,498.866213,1170.068027,419.501134
7,MBURv2_10005,Protein TraL,CDS,traL,MBURv2,2071,2796,,,758.953168,347.107438,382.920110,285.123967,621.212121,426.997245,1023.415978,457.300275
8,MBURv2_10006,Protein TraK,CDS,traK,MBURv2,2796,3176,,,1238.845144,430.446194,435.695538,669.291339,703.412073,724.409449,1629.921260,797.900262
9,MBURv2_10007,Protein TraJ,CDS,traJ,MBURv2,3508,3876,,,685.636856,243.902439,219.512195,368.563686,336.043360,390.243902,831.978320,439.024390


In [16]:
# Per-sample totals of the length-normalised reads; they differ between samples,
# which is why a second normalisation step is needed.
data_4.iloc[:, 9:17].sum(axis=0)

5GB1_FM40_T0m_TR2             34037892.982884
5GB1_FM40_T10m_TR3            16540910.063731
5GB1_FM40_T20m_TR2            19517215.620340
5GB1_FM40_T40m_TR1            15187260.790192
5GB1_FM40_T60m_TR1            29189141.471211
5GB1_FM40_T90m_TR2            18386004.046463
5GB1_FM40_T150m_TR1_remake    45212818.536209
5GB1_FM40_T180m_TR1           19180042.089329
dtype: float64

In [17]:
# Per-sample scaling factors: each sample's column total divided by 1,000,000.
# The resulting Series is turned into a one-row DataFrame (to_frame, then transpose) so that
# there is exactly one value per sample column.
norm_sum = (data_4.iloc[:, 9:17].sum(axis=0) / 1000000).to_frame().T

In [18]:
# Display the one-row table of per-sample scaling factors.
norm_sum

Unnamed: 0,5GB1_FM40_T0m_TR2,5GB1_FM40_T10m_TR3,5GB1_FM40_T20m_TR2,5GB1_FM40_T40m_TR1,5GB1_FM40_T60m_TR1,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1
0,34.037893,16.54091,19.517216,15.187261,29.189141,18.386004,45.212819,19.180042


In [19]:
# TPM: divide each sample column by that sample's scaling factor (column total / 1,000,000).
# norm_sum has a single row, so .iloc[0] extracts it as a Series indexed by sample name, and
# div() aligns that Series with the column labels. (.ix, used here originally, was deprecated in
# pandas 0.20 and removed in 1.0.)
TPM = data_4.iloc[:, 9:17].div(norm_sum.iloc[0])

In [20]:
# Final table: the first 9 columns followed by the TPM values.
data_5 = pd.concat([first_8, TPM], axis="columns")

In [21]:
# Check: after TPM normalisation every sample column should sum to the same total,
# which makes the samples comparable for downstream statistics.
data_5.iloc[:, 9:17].sum(axis=0)

5GB1_FM40_T0m_TR2             1000000
5GB1_FM40_T10m_TR3            1000000
5GB1_FM40_T20m_TR2            1000000
5GB1_FM40_T40m_TR1            1000000
5GB1_FM40_T60m_TR1            1000000
5GB1_FM40_T90m_TR2            1000000
5GB1_FM40_T150m_TR1_remake    1000000
5GB1_FM40_T180m_TR1           1000000
dtype: float64

In [22]:
# Display the final TPM table.
data_5

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM40_T0m_TR2,5GB1_FM40_T10m_TR3,5GB1_FM40_T20m_TR2,5GB1_FM40_T40m_TR1,5GB1_FM40_T60m_TR1,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1
0,MBURv2_100001,conserved protein of unknown function,CDS,,MBURv2,1965161,1965952,Evidence 4 : Homologs of previously reported g...,,24.037381,19.388720,19.472578,20.618024,22.190693,20.189929,23.793199,21.197329
1,MBURv2_100002,conserved protein of unknown function,CDS,,MBURv2,1966190,1966369,Evidence 4 : Homologs of previously reported g...,,7.344755,6.717352,3.700437,5.487055,3.996920,5.438920,4.915027,4.634447
2,MBURv2_100003,protein of unknown function,CDS,,MBURv2,1966931,1967041,Evidence 5 : No homology to any previously rep...,,5.293517,3.812551,8.308673,2.372780,3.703710,2.939957,5.977735,0.939415
3,MBURv2_10001,protein of unknown function,CDS,,MBURv2,116,289,Evidence 5 : No homology to any previously rep...,,14.858356,21.889301,17.962332,17.028791,13.191805,11.878101,11.058811,7.491024
4,MBURv2_10002,KfrB,CDS,kfrB,MBURv2,497,844,,,89.572248,87.557206,79.063709,99.523823,76.788120,104.245961,95.143906,105.923083
5,MBURv2_10003,Protein traN,CDS,,MBURv2,875,1594,,,113.068430,88.417142,98.559710,86.786918,104.395747,100.544474,111.386799,95.223404
6,MBURv2_10004,Protein TraM,CDS,traM,MBURv2,1631,2071,,,25.648352,24.401808,24.747034,26.128833,25.869279,27.132933,25.879122,21.871753
7,MBURv2_10005,Protein TraL,CDS,traL,MBURv2,2071,2796,,,22.297302,20.984785,19.619608,18.773890,21.282302,23.224037,22.635527,23.842506
8,MBURv2_10006,Protein TraK,CDS,traK,MBURv2,2796,3176,,,36.396059,26.023126,22.323652,44.069260,24.098416,39.400048,36.049981,41.600548
9,MBURv2_10007,Protein TraJ,CDS,traJ,MBURv2,3508,3876,,,20.143340,14.745406,11.247106,24.267950,11.512615,21.225053,18.401381,22.889647


### Aside: how to keep only the FM40 and FM34 columns

In [None]:
# A single column can be pulled out by name with []. Note that this returns a pandas Series,
# not a DataFrame.
df = data["locus_tag"]
df

In [None]:
# To select several columns, pass a list of names to [] (the __getitem__ syntax);
# the result is a DataFrame.
df1 = data[["locus_tag","type"]]  
df1
# Typing every column name by hand does not scale, so the cells below select by position instead.

### Identifying column indices in order to remove unnecessary columns from data

In [None]:
# List every column name in the raw table.
data.columns.tolist()

In [None]:
# Print the positional index of a few landmark columns to work out which ranges to keep.
for column_name in ("translation", "5GB1_FM40_T0m_TR2", "5GB1_FM40_T90m_TR2_QC"):
    print(data.columns.get_loc(column_name))

In [None]:
# Positions to keep: 0-8 followed by 20-41 (range() excludes its end value).
columns_list = [*range(9), *range(20, 42)]

In [None]:
# Display the list of column positions to check it.
columns_list

In [None]:
# Select all rows and only the column positions listed in columns_list.
data_1 = data.iloc[:, columns_list] #slicing the column index the way I want. 

### Reordering certain columns alphabetically, only after the initial 0-8 columns. 

In [None]:
# Split data_1 by position: the first 9 columns (0-8) stay as they are,
# everything from position 9 onward is the part to be sorted.
first_8 = data_1.iloc[:, :9]
remaining_data = data_1.iloc[:, 9:]
remaining_data

In [None]:
# sorted() is a Python built-in that returns a new sorted list. Strings are compared
# lexicographically, character by character, so digits are not treated as numbers.
sorted(remaining_data.columns)

In [None]:
# Sorting with key=str is still a plain string sort, so it is not "natural":
# T150m ends up before T40m because "1" sorts before "4".
sorted(remaining_data.columns, key=str)

In [None]:
import natsort as ns  # third-party package for natural sorting

# natsorted() returns a list in which embedded numbers are ordered numerically - this works!
ns.natsorted(remaining_data.columns)

In [None]:
# Keep the naturally sorted column names for re-ordering the frame below.
cols = ns.natsorted(remaining_data.columns)

In [None]:
# Indexing with the sorted list of names returns the columns in that order; preview the first rows.
remaining_data[cols].head()

In [None]:
# Store the re-ordered columns as a new frame.
newdf=remaining_data[cols]

In [None]:
# Preview the first rows of the re-ordered frame.
newdf.head()

In [None]:
# Passing the DataFrame itself iterates over its column labels, so this returns a plain Python
# list of sorted names rather than a re-ordered DataFrame.
# NOTE(review): the original remark says the list cannot be used for the dataframe; the other
# cells use such a list via remaining_data[cols] - confirm what was meant.
ns.natsorted(remaining_data)

In [None]:
# .loc is an indexer and must be used with square brackets; calling it with parentheses treats the
# argument as an axis name and raises.
# NOTE(review): the label looks like a truncated column-name prefix, so this shows every column
# whose name contains it (an empty frame if there is none) - confirm this was the intent.
remaining_data.filter(like="5GB1_Cu_transition_tim", axis=1)

In [None]:
# Re-order the columns into natural order. Assigning the sorted names to .columns would only
# rename the columns in their existing positions, leaving the data where it was and attaching
# the wrong sample name to each column. Selecting by the sorted names moves each column
# together with its data.
remaining_data = remaining_data[ns.natsorted(remaining_data.columns)]

In [None]:
# List the column names of remaining_data in their current order.
list(remaining_data.columns)

In [None]:
# Show one sample column from remaining_data, to compare against the same column in the raw data.
remaining_data["5GB1_FM40_T150m_TR1_remake"]

In [None]:
# The same sample column taken from the raw table; the values should match the cell above.
data["5GB1_FM40_T150m_TR1_remake"]

### Aside to remove all columns containing the string "QC" 


In [None]:
# Many of the columns are QC runs and need to be filtered out; list the names first.
data_2.columns.tolist()

In [None]:
# Keep only the columns whose name does not contain "QC" - this is what is needed; the cells
# below break down how re works.
# DataFrame.select() was deprecated in pandas 0.21 and removed in 1.0, so the same test is
# written as a boolean mask over the column labels and passed to .loc.
data_2.loc[:, [re.search("QC", label) is None for label in data_2.columns]]

In [None]:
# re is Python's regular-expression module; it is useful for selecting strings or parts of
# strings. A small example follows.

str_1 = 'an example word:cat12, word:cat!!, word:cattt165'
# The r prefix makes this a raw string, so the backslash in \d reaches the regex engine unchanged.
# re.search returns the first match, or None when nothing matches.
match = re.search(r'word:cat\d+', str_1)

if match is None:
    print("did not find")
else:
    print("found", match.group())


In [None]:
# Now find several matches at once; the previous example only found one.
# Suppose a text contains many email addresses:

str_2 = "purple alice@gmail.com, and many other like bob@yahoo.com and also a dishwasher"
# re.findall returns a list with every non-overlapping match.
emails = re.findall(r"\w+@\w+\.\w+", str_2)
print(*emails, sep="\n")


In [None]:
# Print the whole list of matches at once.
print(emails)

### Aside: understanding iloc, loc, and ix

In [None]:
# .ix supported mixed integer and label based access: primarily label based, falling back to
# integer positions unless the axis was of integer type. It was deprecated in pandas 0.20 and
# removed in 1.0. On this integer-labelled index, .ix[1:3] was label based, so .loc[1:3] is the
# equivalent: the rows labelled 1 to 3, with the end label included.
data.loc[1:3]

In [None]:
data.loc[1] # returns the row whose index *label* is 1. With the default 0-based index that
            # read_csv creates, this is the second row. "1" is interpreted as a label of the
            # index, and **never** as an integer position along the index.
data.loc[0:2,"5GB1_FM34_T0_TR1_QC"] # the integers are interpreted as labels, not positions, and the end label (2) is included.

In [None]:
data.iloc[1:3] # positional slice along axis 0: the rows at positions 1 and 2 (the end, 3, is excluded)
data.iloc[3] # the row at position 3, i.e. the fourth row
data.iloc[[1,2,5]] # the rows at positions 1, 2 and 5


In [None]:
# Check what kind of object selecting a single row by position returns.
type(data.iloc[4])