# The conversion of files from Winston's format to EDD format
This notebook first examines the expected EDD format and the format of the transcriptomics data that Winston sent. <br>
Then it walks step by step through converting the CPM .txt file into a properly formatted EDD .csv file. <br>
Finally, it runs all four .txt files (CPM, FPKM, MR, and TMM) through the same pipeline.

In [1]:
import pandas as pd

## Analyze data that is in EDD_Henson_data folder

In [2]:
# Paths of the six Henson transcriptomics CSV files stored in the EDD_Henson_data folder.
henson_files = [
    f'../../EDD_Henson_data/Henson_Transcriptomics_{file_number}.csv'
    for file_number in range(1, 7)
]

In [3]:
# For every file already in EDD format, report its size and the line names it contains.
for henson_file in henson_files:
    old_henson_data = pd.read_csv(henson_file)
    row_count = len(old_henson_data)
    # set() collapses the column to its distinct values (missing entries show up as nan).
    line_names = set(old_henson_data["Line Name"])
    print(f'This Henson data frame in EDD has {row_count} rows')
    print(f'It contains the line names: {line_names}')
    print()

This Henson data frame in EDD has 8283 rows
It contains the line names: {nan, 'WT-P-R1'}

This Henson data frame in EDD has 8283 rows
It contains the line names: {nan, 'WT-P-R2'}

This Henson data frame in EDD has 8283 rows
It contains the line names: {nan, 'WT-P-R3'}

This Henson data frame in EDD has 8283 rows
It contains the line names: {nan, 'PVHG6-P-R1'}

This Henson data frame in EDD has 8283 rows
It contains the line names: {nan, 'PVHG6-P-R2'}

This Henson data frame in EDD has 8283 rows
It contains the line names: {nan, 'PVHG6-P-R3'}



## Analyze Henson Data from Winston

In [4]:
# Load Winston's melted CPM table. The file is whitespace-delimited; sep=r'\s+'
# is the supported equivalent of the deprecated delim_whitespace=True keyword.
henson_data = pd.read_table('../winston_data/henson/henson_reprocess_CPM_melted.txt', sep=r'\s+')
print(f'This data has {len(henson_data)} rows')
henson_data.head()

This data has 440385 rows


Unnamed: 0,Strain,Units,Media,Time.point,variable,Replicate,value
1,WT,CPM,Mixture,t=1,WP_000104864.1,1,0.0
2,WT,CPM,Mixture,t=1,WP_000104864.1,2,0.0
3,WT,CPM,Mixture,t=1,WP_000104864.1,3,0.0
4,WT,CPM,Mixture,t=2,WP_000104864.1,1,0.0
5,WT,CPM,Mixture,t=2,WP_000104864.1,2,0.0


In [5]:
# Collect the distinct values of each experimental-design column, then report them.
strains = set(henson_data['Strain'])
media = set(henson_data['Media'])
time_points = set(henson_data['Time.point'])
replicates = set(henson_data['Replicate'])

print(f'The strains in this data are: {strains}')
print(f'The media in this data are: {media}')
print(f'The time points in this data are: {time_points}')
print(f'The replicates in this data are: {replicates}')

The strains in this data are: {'PVHG', 'WT'}
The media in this data are: {'Mixture', 'glucose', 'vanillic acid', '4-hydroxybenzoic acid', 'sodium benzoate', 'phenol', 'guaiacol'}
The time points in this data are: {'t=1', 't=2'}
The replicates in this data are: {1, 2, 3}


Define a function that takes a row and returns its line name, formatted as `<Strain>-<media code>-T<time point>-R<replicate>` (for example `WT-M-T1-R1`)

In [6]:
# Short code used in the line name for each growth medium.
MEDIA_CODES = {
    'sodium benzoate': 'B',
    'glucose': 'Glu',
    'phenol': 'P',
    'Mixture': 'M',
    'guaiacol': 'Gua',
    'vanillic acid': 'V',
    '4-hydroxybenzoic acid': 'H',
}


def row_to_line_name(row):
    """Build the line name for one row of the melted transcriptomics table.

    The name has the form ``<Strain>-<media code>-T<time point>-R<replicate>``,
    for example ``WT-M-T1-R1``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. a ``pandas.Series`` produced by
        ``DataFrame.iterrows``) with the keys ``'Strain'``, ``'Media'``,
        ``'Time.point'`` (formatted as ``'t=<n>'``) and ``'Replicate'``.

    Returns
    -------
    str
        The line name for the row.

    Raises
    ------
    ValueError
        If the medium has no entry in ``MEDIA_CODES`` or the time point is not
        of the form ``'t=<n>'``. Leaving the unknown part out of the name
        instead would make different conditions share one line name.
    """
    media = row['Media']
    if media not in MEDIA_CODES:
        raise ValueError(f'Unknown medium {media!r}; add a code for it to MEDIA_CODES')

    # 't=1' -> ('t', '=', '1'); any time index is accepted, not only 1 and 2.
    time_label, _, time_index = str(row['Time.point']).partition('=')
    if time_label != 't' or not time_index:
        raise ValueError(f"Unrecognized time point {row['Time.point']!r}; expected 't=<n>'")

    return '-'.join([
        row['Strain'],
        MEDIA_CODES[media],
        'T' + time_index,
        'R' + str(row['Replicate']),
    ])

In [7]:
# Label every measurement with its line name. The name depends on several
# columns of the row, so each row is passed to row_to_line_name in turn.
henson_data['Line Name'] = list(
    map(row_to_line_name, (record for _, record in henson_data.iterrows()))
)
henson_data.head()

Unnamed: 0,Strain,Units,Media,Time.point,variable,Replicate,value,Line Name
1,WT,CPM,Mixture,t=1,WP_000104864.1,1,0.0,WT-M-T1-R1
2,WT,CPM,Mixture,t=1,WP_000104864.1,2,0.0,WT-M-T1-R2
3,WT,CPM,Mixture,t=1,WP_000104864.1,3,0.0,WT-M-T1-R3
4,WT,CPM,Mixture,t=2,WP_000104864.1,1,0.0,WT-M-T2-R1
5,WT,CPM,Mixture,t=2,WP_000104864.1,2,0.0,WT-M-T2-R2


In [8]:
# Replace every '.' in the identifier with '_' (e.g. WP_000104864.1 -> WP_000104864_1).
# The vectorized string method does this for the whole column at once instead of
# looping over every row with iterrows(); regex=False makes '.' a literal dot.
henson_data['Measurement Type'] = henson_data['variable'].str.replace('.', '_', regex=False)
henson_data.head()

Unnamed: 0,Strain,Units,Media,Time.point,variable,Replicate,value,Line Name,Measurement Type
1,WT,CPM,Mixture,t=1,WP_000104864.1,1,0.0,WT-M-T1-R1,WP_000104864_1
2,WT,CPM,Mixture,t=1,WP_000104864.1,2,0.0,WT-M-T1-R2,WP_000104864_1
3,WT,CPM,Mixture,t=1,WP_000104864.1,3,0.0,WT-M-T1-R3,WP_000104864_1
4,WT,CPM,Mixture,t=2,WP_000104864.1,1,0.0,WT-M-T2-R1,WP_000104864_1
5,WT,CPM,Mixture,t=2,WP_000104864.1,2,0.0,WT-M-T2-R2,WP_000104864_1


In [9]:
# Keep only the part after '=' of each time point (e.g. 't=1' -> '1').
# Done with vectorized string methods rather than a row-by-row iterrows() loop.
henson_data['Time'] = henson_data['Time.point'].str.split('=').str[1]
henson_data.head()

Unnamed: 0,Strain,Units,Media,Time.point,variable,Replicate,value,Line Name,Measurement Type,Time
1,WT,CPM,Mixture,t=1,WP_000104864.1,1,0.0,WT-M-T1-R1,WP_000104864_1,1
2,WT,CPM,Mixture,t=1,WP_000104864.1,2,0.0,WT-M-T1-R2,WP_000104864_1,1
3,WT,CPM,Mixture,t=1,WP_000104864.1,3,0.0,WT-M-T1-R3,WP_000104864_1,1
4,WT,CPM,Mixture,t=2,WP_000104864.1,1,0.0,WT-M-T2-R1,WP_000104864_1,2
5,WT,CPM,Mixture,t=2,WP_000104864.1,2,0.0,WT-M-T2-R2,WP_000104864_1,2


In [10]:
# Copy the measurements into a column with the capitalized name 'Value'.
# A direct column assignment replaces the row-by-row iterrows() loop.
henson_data['Value'] = henson_data['value']
henson_data.head()

Unnamed: 0,Strain,Units,Media,Time.point,variable,Replicate,value,Line Name,Measurement Type,Time,Value
1,WT,CPM,Mixture,t=1,WP_000104864.1,1,0.0,WT-M-T1-R1,WP_000104864_1,1,0.0
2,WT,CPM,Mixture,t=1,WP_000104864.1,2,0.0,WT-M-T1-R2,WP_000104864_1,1,0.0
3,WT,CPM,Mixture,t=1,WP_000104864.1,3,0.0,WT-M-T1-R3,WP_000104864_1,1,0.0
4,WT,CPM,Mixture,t=2,WP_000104864.1,1,0.0,WT-M-T2-R1,WP_000104864_1,2,0.0
5,WT,CPM,Mixture,t=2,WP_000104864.1,2,0.0,WT-M-T2-R2,WP_000104864_1,2,0.0


## Keep only the EDD columns, in the required order

In [11]:
# Selecting the five output columns already leaves every other column behind, so
# henson_data is not modified in place. An in-place drop here would raise a
# KeyError if this cell were run a second time.
henson_EDD_data = henson_data[['Line Name', 'Measurement Type', 'Time', 'Value', 'Units']]
henson_EDD_data.head()

Unnamed: 0,Line Name,Measurement Type,Time,Value,Units
1,WT-M-T1-R1,WP_000104864_1,1,0.0,CPM
2,WT-M-T1-R2,WP_000104864_1,1,0.0,CPM
3,WT-M-T1-R3,WP_000104864_1,1,0.0,CPM
4,WT-M-T2-R1,WP_000104864_1,2,0.0,CPM
5,WT-M-T2-R2,WP_000104864_1,2,0.0,CPM


## Define a function to convert all 4 versions of the Henson data to EDD-compatible CSVs

In [12]:
def winston_henson_txt_file_to_EDD_csv(input_file_name, output_file_name):
    """Convert one melted Henson .txt file into an EDD-formatted .csv file.

    Parameters
    ----------
    input_file_name : str or path-like
        Whitespace-delimited text file containing the columns 'Strain',
        'Media', 'Time.point', 'variable', 'Replicate', 'value' and 'Units'.
    output_file_name : str or path-like
        Where the CSV is written. It has the columns 'Line Name',
        'Measurement Type', 'Time', 'Value' and 'Units', and no index column.

    Returns
    -------
    None
    """
    # sep=r'\s+' is the supported equivalent of the deprecated delim_whitespace=True.
    henson_data = pd.read_table(input_file_name, sep=r'\s+')

    EDD_data = pd.DataFrame(index=henson_data.index)
    # The line name combines several columns, so this is the only row-wise pass.
    EDD_data['Line Name'] = [row_to_line_name(row) for _, row in henson_data.iterrows()]
    # The remaining columns are derived with vectorized operations.
    EDD_data['Measurement Type'] = henson_data['variable'].str.replace('.', '_', regex=False)
    EDD_data['Time'] = henson_data['Time.point'].str.split('=').str[1]
    EDD_data['Value'] = henson_data['value']
    EDD_data['Units'] = henson_data['Units']

    EDD_data.to_csv(output_file_name, index=False)


### Run the function 4 times to convert the text files to EDD formatted csv files

In [13]:
# Convert the CPM file.
winston_henson_txt_file_to_EDD_csv(
    input_file_name='../winston_data/henson/henson_reprocess_CPM_melted.txt',
    output_file_name='../winston_data/henson/henson_reprocess_CPM_melted.csv',
)

In [14]:
# Convert the FPKM file.
# NOTE(review): the input name is spelled 'FKPM' while the output uses 'FPKM';
# presumably the source file really is named that way. Confirm before renaming.
winston_henson_txt_file_to_EDD_csv(
    input_file_name='../winston_data/henson/henson_reprocess_FKPM_melted.txt',
    output_file_name='../winston_data/henson/henson_reprocess_FPKM_melted.csv',
)

  if (await self.run_code(code, result,  async_=asy)):


In [15]:
# Convert the MR file.
winston_henson_txt_file_to_EDD_csv(
    input_file_name='../winston_data/henson/henson_reprocess_MR_melted.txt',
    output_file_name='../winston_data/henson/henson_reprocess_MR_melted.csv',
)

In [16]:
# Convert the TMM file.
winston_henson_txt_file_to_EDD_csv(
    input_file_name='../winston_data/henson/henson_reprocess_TMM_melted.txt',
    output_file_name='../winston_data/henson/henson_reprocess_TMM_melted.csv',
)