# The conversion of transcriptomic data from txt files to csvs in EDD format
This notebook first analyzes the proper EDD format and the format of the transcriptomics data that Winston sent. <br>
Then it walks through, in detail, how to convert the CPM .txt file into a properly formatted EDD .csv file. <br>
Lastly, it runs the other three .txt files (FPKM, MR, and TMM) through the same pipeline.

#### Imports

In [1]:
import pandas as pd

## Yoneda Data

#### Define functions to help with conversion

In [2]:
def yoneda_row_to_line_name(row):
    """Build the EDD line name for one row of the melted Yoneda table.

    The name is assembled as ``<strain><condition tag><replicate>``, for
    example ``WT-LN-G-R1``.

    Parameters
    ----------
    row : object
        Anything exposing ``Strain``, ``Condition`` and ``Replicate``
        attributes, e.g. a row produced by ``DataFrame.iterrows()``.

    Returns
    -------
    str
        The line name for this row.

    Raises
    ------
    ValueError
        If ``row.Strain`` does not start with 'W', '3' or '4'. Previously this
        case failed with an UnboundLocalError that did not name the offending
        strain.
    """
    # The first character of the strain label selects the strain name
    strain_by_prefix = (('W', 'WT'), ('3', 'EVOL33'), ('4', 'EVOL40'))
    condition_tags = {
        '1g/L_glucose': '-LN-G-R',
        '0.75g/L_phenol': '-LN-LP-R',
        '1.5g/L_phenol': '-LN-HP-R',
    }

    for prefix, strain_name in strain_by_prefix:
        if row.Strain.startswith(prefix):
            line_name = strain_name
            break
    else:
        raise ValueError(
            f'Unrecognized strain {row.Strain!r}: expected a label starting with W, 3 or 4'
        )

    # An unrecognized condition contributes no tag, exactly as before
    line_name += condition_tags.get(row.Condition, '')

    return line_name + str(row.Replicate)

In [3]:
def yoneda_line_name_to_time(line_name):
    """Return the sampling time associated with a Yoneda line name.

    The time is chosen from the condition tag embedded in the name; the
    mapping is based on the methods in the Yoneda paper. Tags are checked in
    the order '-G-', '-LP-', '-HP-' and the first one found wins. ``None`` is
    returned when the name contains none of them.
    """
    time_by_tag = (('-G-', 14), ('-LP-', 24), ('-HP-', 32))
    matches = (time_value for tag, time_value in time_by_tag if tag in line_name)
    return next(matches, None)

In [4]:
def yoneda_txt_file_to_EDD_csv(input_file_name, output_file_name):
    """Convert a melted Yoneda transcriptomics text file into an EDD-format csv.

    Reads the whitespace-delimited table at ``input_file_name``, derives the
    columns 'Line Name', 'Measurement Type', 'Time' and 'Value', and writes
    them together with 'Units' to ``output_file_name`` (no index column).

    Parameters
    ----------
    input_file_name : str
        Path of the text file to read. It must provide the columns 'Strain',
        'Condition', 'Replicate', 'variable' and 'value', plus 'Units' unless
        the path contains 'FPKM'.
    output_file_name : str
        Path of the csv file to write.

    Returns
    -------
    None
        The result is written to disk rather than returned.
    """
    # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True
    yoneda_data = pd.read_table(input_file_name, sep=r'\s+')

    # The line name combines several columns, so it is built row by row
    yoneda_data['Line Name'] = [yoneda_row_to_line_name(row) for _, row in yoneda_data.iterrows()]
    # The underscore form is the gene format used in the genome scale model
    yoneda_data['Measurement Type'] = yoneda_data['variable'].str.replace('.', '_', regex=False)
    # The collection time follows from the condition encoded in the line name
    yoneda_data['Time'] = [yoneda_line_name_to_time(name) for name in yoneda_data['Line Name']]
    # The measured transcript level is carried over unchanged
    yoneda_data['Value'] = yoneda_data['value']

    # For FPKM inputs the unit label read from the file is overwritten with the
    # correct spelling, because the label in the data contains a typo
    if 'FPKM' in input_file_name:
        yoneda_data['Units'] = 'FPKM'

    # Keep only the EDD columns, in the order EDD expects
    EDD_data = yoneda_data[['Line Name', 'Measurement Type', 'Time', 'Value', 'Units']]

    EDD_data.to_csv(output_file_name, index=False)

#### Run the function for all normalization methods

In [5]:
# Convert every normalization of the Yoneda data; the .txt input and the
# .csv output share the same path stem
for normalization in ('CPM', 'FPKM', 'MR', 'TMM'):
    path_stem = '../../winston_data/october_19_2021/yoneda/yoneda_reprocess_' + normalization + '_melted'
    yoneda_txt_file_to_EDD_csv(path_stem + '.txt', path_stem + '.csv')

yes


## Henson Data

#### Define functions to help with conversion

In [6]:
def henson_row_to_line_name(row):
    """Build the EDD line name for one row of the melted Henson table.

    The name is ``<Strain><media tag>-R<Replicate>``, e.g. ``WT-Glu-R1``.
    A medium that is not in the tag table contributes no tag.
    """
    media_tags = {
        'sodium benzoate': '-B',
        'glucose': '-Glu',
        'phenol': '-P',
        'Mixture': '-M',
        'guaiacol': '-Gua',
        'vanillic acid': '-V',
        '4-hydroxybenzoic acid': '-H',
    }
    media_tag = media_tags.get(row.Media, '')
    replicate_tag = '-R' + str(row.Replicate)
    return row.Strain + media_tag + replicate_tag

In [7]:
def henson_row_to_time_value(row):
    """Return the sampling time for one row of the melted Henson table.

    The mapping is based on the methods in the Henson paper. Glucose and
    Mixture rows are resolved by their 'Time.point', phenol rows by their
    'Strain', and the remaining media by the medium alone. When no rule
    applies, a message is printed and ``None`` is returned.
    """
    time_by_media_and_point = {
        ('glucose', 't=1'): 10,
        ('glucose', 't=2'): 13,
        ('Mixture', 't=1'): 20,
        ('Mixture', 't=2'): 32,
    }
    phenol_time_by_strain = {'WT': 24, 'PVHG': 21}
    time_by_media = {
        'guaiacol': 19,
        '4-hydroxybenzoic acid': 11,
        'sodium benzoate': 12,
        'vanillic acid': 24,
    }

    medium = row['Media']
    time_value = None
    # Only read the columns that the matching rule actually needs
    if medium in ('glucose', 'Mixture'):
        time_value = time_by_media_and_point.get((medium, row['Time.point']))
    elif medium == 'phenol':
        time_value = phenol_time_by_strain.get(row['Strain'])
    elif medium in time_by_media:
        time_value = time_by_media[medium]

    if time_value is None:
        print(f'No time data for {row.Strain} {row.Media} {row["Time.point"]}')
    return time_value

In [8]:
def henson_txt_file_to_EDD_csv(input_file_name, output_file_name):
    """Convert a melted Henson transcriptomics text file into an EDD-format csv.

    Reads the whitespace-delimited table at ``input_file_name``, discards rows
    whose 'variable' is 'Test', derives the columns 'Line Name',
    'Measurement Type', 'Time' and 'Value', and writes them together with
    'Units' to ``output_file_name`` (no index column). The row count as read
    and the first rows after filtering are printed as a quick check.

    Parameters
    ----------
    input_file_name : str
        Path of the text file to read. It must provide the columns 'Strain',
        'Media', 'Time.point', 'Replicate', 'variable' and 'value', plus
        'Units' unless the path contains 'FPKM'.
    output_file_name : str
        Path of the csv file to write.

    Returns
    -------
    None
        The result is written to disk rather than returned.
    """
    # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True
    henson_data = pd.read_table(input_file_name, sep=r'\s+')
    print(f'total length of dataframe is {len(henson_data)}')

    # Drop the 'Test' entries and renumber the remaining rows from zero
    henson_data = henson_data[henson_data['variable'] != 'Test'].reset_index(drop=True)
    print(henson_data.head())

    # NOTE(review): an earlier version renumbered a duplicated replicate 1 of
    # WT on 4-hydroxybenzoic acid to replicate 4 at this point. That step was
    # already disabled (commented out) and is not performed here; confirm that
    # the current input files no longer need it.

    # Line name and time both depend on several columns, so they are built
    # together in a single row-wise pass
    line_names = []
    time_values = []
    for _, row in henson_data.iterrows():
        line_names.append(henson_row_to_line_name(row))
        time_values.append(henson_row_to_time_value(row))

    # Define columns needed for EDD
    henson_data['Line Name'] = line_names
    henson_data['Measurement Type'] = henson_data['variable'].str.replace('.', '_', regex=False)
    henson_data['Time'] = time_values
    henson_data['Value'] = henson_data['value']

    # For FPKM inputs the unit label read from the file is overwritten with the
    # correct spelling, because the label in the data contains a typo
    if 'FPKM' in input_file_name:
        henson_data['Units'] = 'FPKM'

    # Keep only the EDD columns, in the order EDD expects
    EDD_data = henson_data[['Line Name', 'Measurement Type', 'Time', 'Value', 'Units']]

    EDD_data.to_csv(output_file_name, index=False)


#### Run the function for all normalization methods

In [9]:
# CPM is not converted here: its call was commented out in the earlier
# version of this cell. The .txt input and the .csv output share a path stem.
for normalization in ('FPKM', 'MR', 'TMM'):
    path_stem = '../../winston_data/october_19_2021/henson/henson_reprocess_' + normalization + '_melted'
    henson_txt_file_to_EDD_csv(path_stem + '.txt', path_stem + '.csv')

  if (await self.run_code(code, result,  async_=asy)):


total length of dataframe is 432432
  Strain Units    Media Time.point        variable  Replicate value
0     WT  FKPM  Mixture        t=1  WP_000104864.1          1     0
1     WT  FKPM  Mixture        t=1  WP_000104864.1          2     0
2     WT  FKPM  Mixture        t=1  WP_000104864.1          3     0
3     WT  FKPM  Mixture        t=2  WP_000104864.1          1     0
4     WT  FKPM  Mixture        t=2  WP_000104864.1          2     0
total length of dataframe is 432432
  Strain Units    Media Time.point        variable  Replicate value
0     WT    MR  Mixture        t=1  WP_000104864.1          1     0
1     WT    MR  Mixture        t=1  WP_000104864.1          2     0
2     WT    MR  Mixture        t=1  WP_000104864.1          3     0
3     WT    MR  Mixture        t=2  WP_000104864.1          1     0
4     WT    MR  Mixture        t=2  WP_000104864.1          2     0
total length of dataframe is 422172
  Strain Units    Media Time.point        variable  Replicate  \
0     WT   