In [1]:
import pandas as pd
import re #regular expression matching for removing unwanted columns by name
import natsort as ns #3rd party package for natural sorting


In [2]:
# Load the raw per-gene read-count table (tab-separated text).
df0_raw = pd.read_csv(
    "5G_count_all.tsv",
    delimiter='\t',  # `delimiter` is the documented alias of `sep`
)

In [3]:
# Preview only the first rows instead of emitting the whole frame as cell output.
df0_raw.head(10)

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM69_t2_TR1,...,5GB1_FM21_TR1,5GB1_FM21_TR2,5GB1_FM21_TR2_UW,5GB1_FM03_TR1_QC,5GB1_FM03_TR2_QC,5GB1_FM20_TR1_QC,5GB1_FM20_TR2_QC,5GB1_FM20_TR3_QC,5GB1_FM21_TR1_QC,5GB1_FM21_TR2_QC
0,MBURv2_100001,conserved protein of unknown function,CDS,,MBURv2,1965161,1965952,Evidence 4 : Homologs of previously reported g...,,34,...,779,612,658,28,122,19,176,55,80,71
1,MBURv2_100002,conserved protein of unknown function,CDS,,MBURv2,1966190,1966369,Evidence 4 : Homologs of previously reported g...,,1,...,56,59,54,0,4,1,12,4,7,7
2,MBURv2_100003,protein of unknown function,CDS,,MBURv2,1966931,1967041,Evidence 5 : No homology to any previously rep...,,2,...,15,94,24,0,4,0,2,2,1,3
3,MBURv2_10001,protein of unknown function,CDS,,MBURv2,116,289,Evidence 5 : No homology to any previously rep...,,4,...,167,124,155,2,16,4,49,8,13,22
4,MBURv2_10002,KfrB,CDS,kfrB,MBURv2,497,844,,,71,...,1356,1201,1211,57,180,24,227,116,166,136
5,MBURv2_10003,Protein traN,CDS,,MBURv2,875,1594,,,132,...,4493,2562,4006,119,449,69,846,360,421,432
6,MBURv2_10004,Protein TraM,CDS,traM,MBURv2,1631,2071,,,23,...,1315,842,1188,25,87,14,210,101,123,133
7,MBURv2_10005,Protein TraL,CDS,traL,MBURv2,2071,2796,,,36,...,1371,750,1240,29,113,15,270,90,111,139
8,MBURv2_10006,Protein TraK,CDS,traK,MBURv2,2796,3176,,,25,...,509,323,530,12,41,6,138,66,48,62
9,MBURv2_10007,Protein TraJ,CDS,traJ,MBURv2,3508,3876,,,11,...,670,423,655,17,44,9,149,58,61,64


# Removing columns

Need to remove the following columns manually 

[total, ribo0, br1_QC, br2_QC] - these don't exist in the dataframe

[5GB1_ferm_WT_QC, 5GB1_ferm_Ack_QC, 5GB1C_latelog_vial_TR2_QC, 5GB1_FM58_Td20h_TR1_QC, 5GB1_FM58_Td32h_TR3_QC, 5GB1_LTrecycle_TR1_QC, 5GB1_LTrecycle_TR1,
5GB1_vial_wLa_TR3, 5GB1_vial_woLa_TR2]

Need to remove all QC runs except FM34, which we will keep

In [4]:
# Sample columns that are dropped by hand before any further filtering.
to_remove = [
    "5GB1_ferm_WT_QC",
    "5GB1_ferm_Ack_QC",
    "5GB1C_latelog_vial_TR2_QC",
    "5GB1_FM58_Td20h_TR1_QC",
    "5GB1_FM58_Td32h_TR3_QC",
    "5GB1_LTrecycle_TR1_QC",
    "5GB1_LTrecycle_TR1",
    "5GB1_vial_wLa_TR3",
    "5GB1_vial_woLa_TR2",
]

In [5]:
#manually removing some unwanted columns
# `drop` accepts the label list directly, so the intermediate
# `.loc[:, to_remove].columns` lookup (which copied the columns just to read
# their names back) is unnecessary. A label missing from the frame still
# raises, as before.
df1_raw_filtered = df0_raw.drop(to_remove, axis=1)

In [6]:
# Preview only the first rows instead of emitting the whole frame as cell output.
df1_raw_filtered.head(10)

Unnamed: 0,locus_tag,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM69_t2_TR1,...,5GB1_FM21_TR1,5GB1_FM21_TR2,5GB1_FM21_TR2_UW,5GB1_FM03_TR1_QC,5GB1_FM03_TR2_QC,5GB1_FM20_TR1_QC,5GB1_FM20_TR2_QC,5GB1_FM20_TR3_QC,5GB1_FM21_TR1_QC,5GB1_FM21_TR2_QC
0,MBURv2_100001,conserved protein of unknown function,CDS,,MBURv2,1965161,1965952,Evidence 4 : Homologs of previously reported g...,,34,...,779,612,658,28,122,19,176,55,80,71
1,MBURv2_100002,conserved protein of unknown function,CDS,,MBURv2,1966190,1966369,Evidence 4 : Homologs of previously reported g...,,1,...,56,59,54,0,4,1,12,4,7,7
2,MBURv2_100003,protein of unknown function,CDS,,MBURv2,1966931,1967041,Evidence 5 : No homology to any previously rep...,,2,...,15,94,24,0,4,0,2,2,1,3
3,MBURv2_10001,protein of unknown function,CDS,,MBURv2,116,289,Evidence 5 : No homology to any previously rep...,,4,...,167,124,155,2,16,4,49,8,13,22
4,MBURv2_10002,KfrB,CDS,kfrB,MBURv2,497,844,,,71,...,1356,1201,1211,57,180,24,227,116,166,136
5,MBURv2_10003,Protein traN,CDS,,MBURv2,875,1594,,,132,...,4493,2562,4006,119,449,69,846,360,421,432
6,MBURv2_10004,Protein TraM,CDS,traM,MBURv2,1631,2071,,,23,...,1315,842,1188,25,87,14,210,101,123,133
7,MBURv2_10005,Protein TraL,CDS,traL,MBURv2,2071,2796,,,36,...,1371,750,1240,29,113,15,270,90,111,139
8,MBURv2_10006,Protein TraK,CDS,traK,MBURv2,2796,3176,,,25,...,509,323,530,12,41,6,138,66,48,62
9,MBURv2_10007,Protein TraJ,CDS,traJ,MBURv2,3508,3876,,,11,...,670,423,655,17,44,9,149,58,61,64


In [7]:
#Isolating FM34 - Cu transition 3+ hours
# `DataFrame.select` was deprecated in pandas 0.21 and removed in 1.0.
# `filter(regex=...)` keeps every column whose name matches via re.search,
# which is the same test the old lambda performed.
df1a_FM34_only = df1_raw_filtered.filter(regex="FM34", axis=1)


In [8]:
#Removing all QC runs
# `DataFrame.select` was deprecated in pandas 0.21 and removed in 1.0, so a
# boolean mask over the column names is used instead.
# NOTE: the FM34 columns carry the "_QC" suffix and are removed here too, so
# the FM34 isolation cell must have been run before this one.
is_qc_column = df1_raw_filtered.columns.str.contains("QC")
df1_raw_filtered = df1_raw_filtered.loc[:, ~is_qc_column]

In [14]:
#naturally sorting the filtered (sample) columns
to_sort = df1_raw_filtered.loc[:, "5GB1_FM69_t2_TR1":]

# natsorted already returns a list, so no extra list() wrapper is needed
cols = ns.natsorted(to_sort.columns)
cols_sorted = to_sort[cols]

#adding descriptive columns
qualitative = df1_raw_filtered.loc[:, "locus_tag":"translation"]
df1b_filtered_sorted = pd.concat([qualitative, cols_sorted], axis=1)

# `DataFrame.select` was deprecated in pandas 0.21 and removed in 1.0; one
# boolean mask over the column names now serves both the "keep FM40" and the
# "drop FM40" step.
is_fm40_column = df1b_filtered_sorted.columns.str.contains("FM40")

#isolating FM40 (to be added back later for particular order )
df1b_FM40_only = df1b_filtered_sorted.loc[:, is_fm40_column]

#removing FM40 (to be added right back at the end)
df1b_filtered_sorted = df1b_filtered_sorted.loc[:, ~is_fm40_column]

#Adding FM40
cleaned_up = pd.concat([df1b_filtered_sorted, df1b_FM40_only], axis=1)

#adding FM34
df2_cleaned_up = pd.concat([cleaned_up, df1a_FM34_only], axis=1)

#setting locus tag as index
df2_cleaned_up = df2_cleaned_up.set_index("locus_tag")

In [15]:
# Preview only the first rows instead of emitting the whole frame as cell output.
df2_cleaned_up.head(10)

Unnamed: 0_level_0,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM12_TR1,5GB1_FM12_TR2,...,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1,5GB1_FM34_T0_TR1_QC,5GB1_FM34_T3_TR3_QC,5GB1_FM34_T4_TR3_QC,5GB1_FM34_T5_TR2_QC,5GB1_FM34_T6_TR3_QC,5GB1_FM34_T7_TR3_QC,5GB1_FM34_T8_TR1_QC
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MBURv2_100001,conserved protein of unknown function,CDS,,MBURv2,1965161,1965952,Evidence 4 : Homologs of previously reported g...,,1913,1700,...,294,852,322,118,111,100,90,56,117,124
MBURv2_100002,conserved protein of unknown function,CDS,,MBURv2,1966190,1966369,Evidence 4 : Homologs of previously reported g...,,177,161,...,18,40,16,5,3,4,4,1,10,3
MBURv2_100003,protein of unknown function,CDS,,MBURv2,1966931,1967041,Evidence 5 : No homology to any previously rep...,,54,47,...,6,30,2,5,3,3,2,0,2,0
MBURv2_10001,protein of unknown function,CDS,,MBURv2,116,289,Evidence 5 : No homology to any previously rep...,,349,319,...,38,87,25,6,9,23,6,5,9,8
MBURv2_10002,KfrB,CDS,kfrB,MBURv2,497,844,,,1833,1199,...,667,1497,707,290,277,196,173,132,192,241
MBURv2_10003,Protein traN,CDS,,MBURv2,875,1594,,,8721,7714,...,1331,3626,1315,553,511,510,329,277,408,519
MBURv2_10004,Protein TraM,CDS,traM,MBURv2,1631,2071,,,1862,1675,...,220,516,185,72,61,91,75,29,68,74
MBURv2_10005,Protein TraL,CDS,traL,MBURv2,2071,2796,,,2036,1838,...,310,743,332,67,65,68,75,37,66,73
MBURv2_10006,Protein TraK,CDS,traK,MBURv2,2796,3176,,,1869,1171,...,276,621,304,160,69,90,57,45,120,84
MBURv2_10007,Protein TraJ,CDS,traJ,MBURv2,3508,3876,,,1248,866,...,144,307,162,45,25,32,16,12,37,22


# Running TPM function on all the data columns


In [20]:
# Change the kernel's working directory to the helper-script folder (a bare
# `cd` relies on IPython automagic). The directory really is spelled "executible".
# NOTE(review): this cell is presumably not re-runnable, and relative paths used
# elsewhere (e.g. the TSV load) will resolve against the new directory -- confirm.
cd executible/

/Users/alexeygilman/repos/Cu_transition_time_course-/executible


In [21]:
# Execute the helper script so that its top-level names land in the notebook
# namespace. TPM_counts and log_2_transform, called in later cells, are not
# defined in this notebook and presumably come from this script -- confirm.
%run Cu_transition_functionalized.py

In [23]:
# Every column from the first sample onwards is a per-sample count column.
first_sample_column = "5GB1_FM12_TR1"
columns = df2_cleaned_up.loc[:, first_sample_column:].columns

In [25]:
# Normalise the sample columns with TPM_counts. The names of the start/end
# coordinate columns are passed in, presumably to derive gene length -- confirm
# against the helper script.
df3_TPM = TPM_counts(
    df2_cleaned_up,
    "start_coord",
    "end_coord",
    columns,
)

In [26]:
# Preview only the first rows instead of emitting the whole frame as cell output.
df3_TPM.head(10)

Unnamed: 0_level_0,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM12_TR1,5GB1_FM12_TR2,...,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1,5GB1_FM34_T0_TR1_QC,5GB1_FM34_T3_TR3_QC,5GB1_FM34_T4_TR3_QC,5GB1_FM34_T5_TR2_QC,5GB1_FM34_T6_TR3_QC,5GB1_FM34_T7_TR3_QC,5GB1_FM34_T8_TR1_QC
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MBURv2_100001,conserved protein of unknown function,CDS,,MBURv2,1965161,1965952,Evidence 4 : Homologs of previously reported g...,,17.582218,17.547831,...,20.189929,23.793199,21.197329,21.782989,21.127986,18.032260,21.768993,23.933245,25.147654,24.738111
MBURv2_100002,conserved protein of unknown function,CDS,,MBURv2,1966190,1966369,Evidence 4 : Homologs of previously reported g...,,7.157884,7.312284,...,5.438920,4.915027,4.634447,4.061235,2.512517,3.173678,4.257048,1.880469,9.457238,2.633412
MBURv2_10001,protein of unknown function,CDS,,MBURv2,116,289,Evidence 5 : No homology to any previously rep...,,14.600242,14.987912,...,11.878101,11.058811,7.491024,5.041533,7.797467,18.877911,6.605763,9.726565,8.805014,7.264584
MBURv2_10002,KfrB,CDS,kfrB,MBURv2,497,844,,,38.341323,28.166938,...,104.245961,95.143906,105.923083,121.837059,119.994358,80.436316,95.233090,128.390657,93.920152,109.422802
MBURv2_10003,Protein traN,CDS,,MBURv2,875,1594,,,88.169356,87.588448,...,100.544474,111.386799,95.223404,112.293156,106.991360,101.160979,87.535540,130.222494,96.463823,113.895062
MBURv2_10004,Protein TraM,CDS,traM,MBURv2,1631,2071,,,30.734416,31.051023,...,27.132933,25.879122,21.871753,23.870118,20.852184,29.469865,32.579446,22.258615,26.248659,26.513262
MBURv2_10005,Protein TraL,CDS,traL,MBURv2,2071,2796,,,20.413855,20.697056,...,23.224037,22.635527,23.842506,13.492699,13.496994,13.376658,19.789994,17.250585,15.475480,15.887526
MBURv2_10006,Protein TraK,CDS,traK,MBURv2,2796,3176,,,35.708220,25.126478,...,39.400048,36.049981,41.600548,61.398203,27.301369,33.735945,28.659651,39.978479,53.615835,34.835684
MBURv2_10007,Protein TraJ,CDS,traJ,MBURv2,3508,3876,,,24.619096,19.186300,...,21.225053,18.401381,22.889647,17.829813,10.213485,12.385084,8.306434,11.007625,17.069160,9.420335
MBURv2_10008,Protein TraI,CDS,traI,MBURv2,3911,6112,,,7.989934,6.768136,...,11.460757,10.476239,10.181261,13.876809,8.420698,9.988005,10.265632,9.837877,13.528746,11.624325


In [31]:
# Summary statistics for the numeric columns of the TPM table.
df3_TPM.describe()

Unnamed: 0,start_coord,end_coord,translation,5GB1_FM12_TR1,5GB1_FM12_TR2,5GB1_FM14_TR1,5GB1_FM14_TR2,5GB1_FM18_TR2,5GB1_FM18_TR3,5GB1_FM19_TR1,...,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1,5GB1_FM34_T0_TR1_QC,5GB1_FM34_T3_TR3_QC,5GB1_FM34_T4_TR3_QC,5GB1_FM34_T5_TR2_QC,5GB1_FM34_T6_TR3_QC,5GB1_FM34_T7_TR3_QC,5GB1_FM34_T8_TR1_QC
count,4410.0,4410.0,0.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,...,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0
mean,2523912.50771,2524848.262585,,226.699899,226.695346,226.697112,226.704348,226.594462,226.614222,226.653431,...,226.688681,226.691177,226.693393,226.703354,226.700279,226.705513,226.700812,226.707496,226.690055,226.701056
std,1471366.679738,1471365.070314,,2608.104118,2618.586261,2226.26175,2589.462563,1246.618739,1229.04351,1759.62361,...,1219.717871,1122.334667,1333.858585,1243.151858,1371.191334,1304.612055,1417.656243,1345.465894,1377.544663,1278.153592
min,116.0,289.0,,0.128012,0.130699,0.303466,0.201181,0.235129,0.955192,0.425168,...,0.296514,0.495039,0.37509,0.455466,0.23759,0.114161,0.298391,0.263617,0.493421,0.252605
25%,1228352.75,1230090.25,,23.920135,24.64132,25.954707,22.500866,43.295853,42.741254,26.266172,...,25.257989,26.315396,23.273639,25.932829,23.236787,24.509591,24.886934,24.752124,26.24504,25.814103
50%,2563815.0,2564586.5,,51.965185,53.158422,57.974121,50.919943,83.877066,82.371945,57.17522,...,56.834464,58.201296,54.005551,57.905411,53.720495,55.809776,58.072669,59.395058,60.59672,61.800981
75%,3796009.0,3796603.5,,109.742344,112.290835,123.777013,110.262018,162.001032,162.02061,120.513737,...,123.293527,129.071912,116.6561,124.025563,119.158968,125.158461,128.858717,130.436563,131.790463,134.177414
max,5066062.0,5067294.0,,102291.663448,105614.890872,95689.662164,109973.088035,58854.542337,64511.010158,70926.914449,...,45789.636731,34735.533389,55623.888672,34926.035642,57022.3298,52977.343258,61079.897898,57365.249426,58854.913465,52180.163782


# Log 2 transform the data

In [28]:
# Log2-transform the TPM table; the two labels passed are the first and last
# sample columns of df3_TPM.
df4_log2 = log_2_transform(
    df3_TPM,
    '5GB1_FM12_TR1',
    '5GB1_FM34_T8_TR1_QC',
)

In [32]:
# Summary statistics for the log2-transformed sample columns.
df4_log2.describe() 

Unnamed: 0,5GB1_FM12_TR1,5GB1_FM12_TR2,5GB1_FM14_TR1,5GB1_FM14_TR2,5GB1_FM18_TR2,5GB1_FM18_TR3,5GB1_FM19_TR1,5GB1_FM19_TR1_UW,5GB1_FM19_TR3,5GB1_FM20_TR3,...,5GB1_FM40_T90m_TR2,5GB1_FM40_T150m_TR1_remake,5GB1_FM40_T180m_TR1,5GB1_FM34_T0_TR1_QC,5GB1_FM34_T3_TR3_QC,5GB1_FM34_T4_TR3_QC,5GB1_FM34_T5_TR2_QC,5GB1_FM34_T6_TR3_QC,5GB1_FM34_T7_TR3_QC,5GB1_FM34_T8_TR1_QC
count,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,...,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0,4410.0
mean,5.711915,5.741699,5.854974,5.685472,6.408118,6.397127,5.912669,5.895753,5.89965,6.172266,...,5.892627,5.939207,5.822109,5.891138,5.810324,5.867605,5.896775,5.8932,5.951252,5.960253
std,1.89929,1.892151,1.915438,1.948127,1.712715,1.718178,1.865792,1.855358,1.838577,1.877034,...,1.971689,1.973475,2.003664,1.960701,2.025366,2.019438,2.021813,2.034819,1.982845,1.99988
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.580154,4.623007,4.697924,4.491909,5.436157,5.417557,4.715134,4.732986,4.729698,4.98878,...,4.658668,4.717835,4.540624,4.696708,4.538339,4.615274,4.637316,4.62948,4.713973,4.690087
50%,5.699473,5.732226,5.857337,5.670159,6.390204,6.364081,5.837318,5.837747,5.842178,6.140328,...,5.828694,5.862979,5.755035,5.855626,5.747401,5.802446,5.859787,5.89227,5.921168,5.949558
75%,6.777976,6.811096,6.9516,6.78479,7.339859,7.340033,6.913054,6.893761,6.888171,7.209547,...,6.945953,7.012031,6.866118,6.954494,6.896744,6.967612,7.009646,7.027205,7.042102,7.067998
max,16.642329,16.688454,16.546075,16.746791,15.844866,15.977258,16.114046,16.57341,16.635053,15.420885,...,15.482733,15.084125,15.763417,15.092015,15.799239,15.693088,15.89841,15.807889,15.844875,15.671214


# Taking log2 ratio (FM23, column 5GB1_FM23_TR3, is the baseline)

In [35]:
# Subtract the baseline sample (5GB1_FM23_TR3) from every column, aligned on
# the row index. The inputs are log2 values, so each difference is a log2
# ratio relative to the baseline.
# The dangling `df5_log2_ratio = ` line that followed was a SyntaxError and
# has been removed so the cell can run.
df5_log2_ratio = df4_log2.subtract(df4_log2['5GB1_FM23_TR3'], axis = "index")

In [39]:
# Discard the baseline column itself from the ratio table.
baseline_column = '5GB1_FM23_TR3'
df5_log2_ratio = df5_log2_ratio.drop(baseline_column, axis=1)

In [40]:
# List the remaining columns to check that the baseline column is gone.
df5_log2_ratio.columns

Index(['5GB1_FM12_TR1', '5GB1_FM12_TR2', '5GB1_FM14_TR1', '5GB1_FM14_TR2',
       '5GB1_FM18_TR2', '5GB1_FM18_TR3', '5GB1_FM19_TR1', '5GB1_FM19_TR1_UW',
       '5GB1_FM19_TR3', '5GB1_FM20_TR3', '5GB1_FM20_TR3_UW', '5GB1_FM21_TR1',
       '5GB1_FM21_TR2', '5GB1_FM21_TR2_UW', '5GB1_FM22_TR1', '5GB1_FM22_TR3',
       '5GB1_FM22_TR3_UW', '5GB1_FM69_t2_TR1', '5GB1_FM69_t3_TR1',
       '5GB1_FM69_t3_TR1_UW', '5GB1_FM69_t4_TR1', '5GB1_FM69_t4_TR1_UW',
       '5GB1_FM80_t2_TR1', '5GB1_FM80_t4_TR1', '5GB1_FM81_t1_TR3',
       '5GB1_FM81_t2_TR3', '5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3',
       '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1', '5GB1_FM40_T60m_TR1',
       '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake',
       '5GB1_FM40_T180m_TR1', '5GB1_FM34_T0_TR1_QC', '5GB1_FM34_T3_TR3_QC',
       '5GB1_FM34_T4_TR3_QC', '5GB1_FM34_T5_TR2_QC', '5GB1_FM34_T6_TR3_QC',
       '5GB1_FM34_T7_TR3_QC', '5GB1_FM34_T8_TR1_QC'],
      dtype='object')