In [1]:
import os.path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances

import re #regular expression matching for removing unwanted columns by name 
import natsort as ns #3rd party package for natural sorting 

In [47]:
# Sample columns that are dropped outright before any other filtering.
_COLUMNS_TO_DROP = [
    "5GB1_ferm_WT_QC", "5GB1_ferm_Ack_QC", "5GB1C_latelog_vial_TR2_QC",
    "5GB1_FM58_Td20h_TR1_QC", "5GB1_FM58_Td32h_TR3_QC", "5GB1_LTrecycle_TR1_QC",
    "5GB1_LTrecycle_TR1", "5GB1_vial_wLa_TR3", "5GB1_vial_woLa_TR2",
]

# Descriptive names for the sample columns, listed in the positional order the
# columns have right after filtering/sorting (naturally sorted samples, then the
# FM40 columns, then the FM34 columns). FM23 keeps its original name.
_DESCRIPTIVE_SAMPLE_NAMES = [
    'FM12_CH4-lim_3.0/day',
    'FM12_CH4-lim_3.0/day_R1',
    'FM14_CH4-lim_3.0/day',
    'FM14_CH4-lim_3.0/day_R1',
    'FM18_CH3OH_4.1/day',
    'FM18_CH3OH_4.1/day_R1',
    'FM19_O2-lim_3.5.day',
    'FM19_O2-lim_3.5.day_R1',
    'FM19_O2-lim_3.5.day_R2',
    'FM20_no-lim_5.2/day',
    'FM20_no-lim_5.2/day_R1',
    'FM21_no-lim_5.4/day',
    'FM21_no-lim_5.4/day_R1',
    'FM21_no-lim_5.4/day_R2',
    'FM22_O2-lim_4.2/day',
    'FM22_O2-lim_4.2/day_R1',
    'FM22_O2-lim_4.2/day_R2',
    '5GB1_FM23_TR3',
    'FM69_O2-lim+_0.7/day_t2',
    'FM69_?-lim_0.7/day_t3',
    'FM69_?-lim_0.7/day_t3_R1',
    'FM69_?-lim_0.7/day_t4',
    'FM69_?-lim_0.7/day_t4_R2',
    'FM80_O2-lim_0.7/day_t2',
    'FM80_O2-lim_0.7/day_t4',
    'FM81_O2-lim+_0.7/day_t1',
    'FM81_O2-lim_0.7/day_t2',
    'FM40_-Cu_2.9/day_t0m',
    'FM40_+Cu_2.9/day_t10m',
    'FM40_+Cu_2.9/day_t20m',
    'FM40_+Cu_2.9/day_t40m',
    'FM40_+Cu_2.9/day_t60m',
    'FM40_+Cu_2.9/day_t90m',
    'FM40_+Cu_2.9/day_t150m',
    'FM40_+Cu_2.9/day_t180m',
    'FM34_-Cu_2.7/day_t0',
    'FM34_+Cu_2.7/day_t180m',
    'FM34_+Cu_2.7/day_t240m',
    'FM34_+Cu_2.7/day_t300m',
    'FM34_+Cu_2.7/day_t360m',
    'FM34_+Cu_2.7/day_t420m',
    'FM34_+Cu_2.7/day_t480m',
]

# Column order of the returned frame: the eight annotation columns first,
# followed by the renamed sample columns in their presentation order.
_FINAL_COLUMN_ORDER = [
    'product',
    'type',
    'gene_symbol',
    'locus',
    'start_coord',
    'end_coord',
    'note',
    'translation',
    '5GB1_FM23_TR3',
    'FM18_CH3OH_4.1/day',
    'FM18_CH3OH_4.1/day_R1',
    'FM20_no-lim_5.2/day',
    'FM20_no-lim_5.2/day_R1',
    'FM21_no-lim_5.4/day',
    'FM21_no-lim_5.4/day_R1',
    'FM21_no-lim_5.4/day_R2',
    'FM12_CH4-lim_3.0/day',
    'FM12_CH4-lim_3.0/day_R1',
    'FM14_CH4-lim_3.0/day',
    'FM14_CH4-lim_3.0/day_R1',
    'FM19_O2-lim_3.5.day',
    'FM19_O2-lim_3.5.day_R1',
    'FM19_O2-lim_3.5.day_R2',
    'FM22_O2-lim_4.2/day',
    'FM22_O2-lim_4.2/day_R1',
    'FM22_O2-lim_4.2/day_R2',
    'FM34_-Cu_2.7/day_t0',
    'FM40_-Cu_2.9/day_t0m',
    'FM40_+Cu_2.9/day_t10m',
    'FM40_+Cu_2.9/day_t20m',
    'FM40_+Cu_2.9/day_t40m',
    'FM40_+Cu_2.9/day_t60m',
    'FM40_+Cu_2.9/day_t90m',
    'FM40_+Cu_2.9/day_t150m',
    'FM40_+Cu_2.9/day_t180m',
    'FM34_+Cu_2.7/day_t180m',
    'FM34_+Cu_2.7/day_t240m',
    'FM34_+Cu_2.7/day_t300m',
    'FM34_+Cu_2.7/day_t360m',
    'FM34_+Cu_2.7/day_t420m',
    'FM34_+Cu_2.7/day_t480m',
    'FM69_O2-lim+_0.7/day_t2',
    'FM69_?-lim_0.7/day_t3',
    'FM69_?-lim_0.7/day_t3_R1',
    'FM69_?-lim_0.7/day_t4',
    'FM69_?-lim_0.7/day_t4_R2',
    'FM80_O2-lim_0.7/day_t2',
    'FM80_O2-lim_0.7/day_t4',
    'FM81_O2-lim+_0.7/day_t1',
    'FM81_O2-lim_0.7/day_t2',
]


def _columns_matching(frame, pattern, invert=False):
    """Return the columns of ``frame`` whose name matches ``pattern`` (regex search).

    With ``invert=True`` the columns that do NOT match are returned instead.
    Column order is preserved. This replaces ``DataFrame.select``, which no
    longer exists in current pandas releases.
    """
    keep = [bool(re.search(pattern, name)) != invert for name in frame.columns]
    return frame.loc[:, keep]


def raw_data_cleanup(filename):
    """
    Import an RNAseq .tsv file from ./raw_data/ and do basic clean up.

    Steps performed:
        - drops a fixed list of unwanted sample columns and every remaining
          column whose name contains "QC"
        - naturally sorts the sample columns (from "5GB1_FM69_t2_TR1" onward),
          then moves the FM40 columns and the FM34 columns to the end
        - renames the sample columns to descriptive names and reorders them
        - sets "locus_tag" as the index

    Parameters
    ----------
    filename : str
        File name (with extension) of the tab-separated file inside ./raw_data/.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned-up frame indexed by locus tag, or None (after printing a
        message) when the file is not found in the raw_data directory.

    Raises
    ------
    KeyError
        If an expected column is missing from the file.
    ValueError
        If the number of sample columns left after filtering does not match
        the number of descriptive names.
    """
    # identifying path to raw_data directory and checking for data file
    path = os.path.join(".", "raw_data", str(filename))

    if not os.path.isfile(path):
        print("{} does not exist in raw_data directory. Function was not complete.".format(filename))
        return None

    print("{} was located in the raw_data directory".format(filename))

    # import the data
    raw = pd.read_csv(path, sep="\t")
    print("{} was imported into dataframe".format(filename))

    # manually removing some unwanted columns
    filtered = raw.drop(columns=_COLUMNS_TO_DROP)

    # Isolating FM34 (Cu transition 3+ hours) to be reinserted later; this is
    # done before the QC filter and before the sample slice below.
    fm34_only = _columns_matching(filtered, "FM34")

    # Removing all QC runs
    filtered = _columns_matching(filtered, "QC", invert=True)

    # naturally sorting the sample columns; everything from this label to the
    # last column is treated as a sample column
    samples = filtered.loc[:, "5GB1_FM69_t2_TR1":]
    samples = samples[list(ns.natsorted(samples.columns))]

    # adding descriptive (annotation) columns in front of the sorted samples
    annotations = filtered.loc[:, "locus_tag":"translation"]
    combined = pd.concat([annotations, samples], axis=1)

    # FM40 is pulled out and appended after the other sorted samples, then
    # FM34 is appended last, so the positions line up with the new names.
    fm40_only = _columns_matching(combined, "FM40")
    without_fm40 = _columns_matching(combined, "FM40", invert=True)
    cleaned = pd.concat([without_fm40, fm40_only, fm34_only], axis=1)

    # setting locus tag as index
    cleaned = cleaned.set_index("locus_tag")

    # resetting the column names to be more descriptive: the first eight
    # (annotation) names are kept, the rest are replaced positionally
    new_names = list(cleaned.columns[:8]) + _DESCRIPTIVE_SAMPLE_NAMES
    if len(new_names) != len(cleaned.columns):
        raise ValueError(
            "Expected {} columns after filtering but found {}; the input file "
            "does not match the hard-coded sample layout.".format(
                len(new_names), len(cleaned.columns)))
    cleaned.columns = new_names

    return cleaned[_FINAL_COLUMN_ORDER]
        
        

In [49]:
# Run the cleanup on the counts table; the file is looked up under ./raw_data/.
# The result is None if the file is not found there.
new = raw_data_cleanup("5G_counts.tsv")

5G_counts.tsv was located in the raw_data directory
5G_counts.tsv was imported into dataframe


In [53]:
# Preview the first rows of the cleaned frame (indexed by locus_tag).
new.head()

Unnamed: 0_level_0,product,type,gene_symbol,locus,start_coord,end_coord,note,translation,5GB1_FM23_TR3,FM18_CH3OH_4.1/day,...,FM34_+Cu_2.7/day_t480m,FM69_O2-lim+_0.7/day_t2,FM69_?-lim_0.7/day_t3,FM69_?-lim_0.7/day_t3_R1,FM69_?-lim_0.7/day_t4,FM69_?-lim_0.7/day_t4_R2,FM80_O2-lim_0.7/day_t2,FM80_O2-lim_0.7/day_t4,FM81_O2-lim+_0.7/day_t1,FM81_O2-lim_0.7/day_t2
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MBURv2_100001,conserved protein of unknown function,CDS,,MBURv2,1965161,1965952,Evidence 4 : Homologs of previously reported g...,,205,428,...,124,34,707,89,531,58,537,499,469,505
MBURv2_100002,conserved protein of unknown function,CDS,,MBURv2,1966190,1966369,Evidence 4 : Homologs of previously reported g...,,29,40,...,3,1,67,2,55,3,38,35,35,34
MBURv2_100003,protein of unknown function,CDS,,MBURv2,1966931,1967041,Evidence 5 : No homology to any previously rep...,,9,30,...,0,2,132,0,88,0,75,65,62,75
MBURv2_10001,protein of unknown function,CDS,,MBURv2,116,289,Evidence 5 : No homology to any previously rep...,,83,119,...,8,4,82,13,82,9,51,62,39,48
MBURv2_10002,KfrB,CDS,kfrB,MBURv2,497,844,,,311,422,...,241,71,863,88,1140,97,849,893,803,812
