# Transcriptomic and Proteomic based flux predictions
This notebook integrates Transcriptomic measurements into flux profiles

### Imports

In [1]:
import pandas as pd
import cobra

### Load Transcriptomics Data

In [2]:
yoneda_data = pd.read_csv('../../EDD_Yoneda_data/Yoneda_set3_transcriptomics_data.csv')

# remove blank columns
yoneda_data = yoneda_data[[col for col in yoneda_data.columns if 'Unnamed' not in col]]

yoneda_data.head()

Unnamed: 0,Line Name,Measurement Type,Time,Value,Units
0,WT-G-R1,PD630_LPD06575,14,708,counts
1,WT-G-R1,PD630_LPD06576,14,6513,counts
2,WT-G-R1,PD630_LPD00131,14,1015,counts
3,WT-G-R1,PD630_LPD06740,14,289,counts
4,WT-G-R1,PD630_LPD06741,14,1159,counts


### Make a dictionary where keys are measured gene names and the values are lists of  measured values and  add measured values to the lists

In [14]:
genes = list(set(yoneda_data['Measurement Type']))
gene_dictionary = {gene:[] for gene in genes}

conditions = list(set(yoneda_data['Line Name']))

for condition in conditions:
    print(condition)
    condition_df = yoneda_data[yoneda_data['Line Name'] == condition]
#     print(condition_df.head())
    for gene in genes:
        try:
            gene_value = float(condition_df[condition_df['Measurement Type'] == gene]['Value'])
            gene_dictionary[gene].append(gene_value)
        except:
            gene_dictionary[gene].append('none')

EVOL40-L-R1
EVOL40-G-R1
WT-G-R1
EVOL33-G-R1
EVOL33-L-R1
EVOL40-H-R1
WT-L-R1
EVOL33-H-R1


### Remove genes that do have multiple measurements for a single run

In [5]:
new_gene_dictionary = {}
for gene, values in gene_dictionary.items():
    if len(values) == 8:
        new_gene_dictionary[gene] = values
    else:
        print(len(values), gene, values)

### Convert the gene dictionary to a dataframe

In [6]:
transcriptomic_df = pd.DataFrame.from_dict(new_gene_dictionary, orient='index', columns=conditions)
transcriptomic_df

Unnamed: 0,EVOL40-L-R1,EVOL40-G-R1,WT-G-R1,EVOL33-G-R1,EVOL33-L-R1,EVOL40-H-R1,WT-L-R1,EVOL33-H-R1
PD630_LPD15471,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD07867,56,39,21,25,31,5.39838e+06,44,38
PD630_LPD08755,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD14956,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD13951,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD02317,1816,590,273,348,1321,none,860,884
PD630_LPD10425,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD13694,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD05166,915,997,626,690,553,none,640,669
PD630_LPD02798,145,88,47,41,56,none,60,153


### Load LPD to WP mapping

In [17]:
gene_mapping_df = pd.read_csv('../gene_converter/LPD_RS_WP_gene_mapping.csv')

# remove blank columns
gene_mapping_df = gene_mapping_df[[col for col in gene_mapping_df.columns if 'Unnamed' not in col]]

gene_mapping_df.head()

Unnamed: 0,Gene ID (RS),Gene ID (LPD),Gene ID (WP),Annotation
0,PD630_RS00005,Pd630_LPD00001,none,chromosomal replication initiator protein DnaA
1,PD630_RS00010,Pd630_LPD00002,WP_005569241.1,DNA polymerase III subunit beta
2,PD630_RS00015,Pd630_LPD00003,WP_007296166.1,6-phosphogluconate dehydrogenase
3,PD630_RS00020,Pd630_LPD00004,WP_005237760.1,DNA replication and repair protein RecF
4,PD630_RS00025,Pd630_LPD00005,WP_005237761.1,hypothetical protein


### Add WP annoation to transcriptomic_df

In [28]:
# make dicitonary where keys are LPD IDs and values are WP IDs 
LPD_to_WP_dictionary = {row['Gene ID (LPD)']:row['Gene ID (WP)'] for _, row in gene_mapping_df.iterrows()}

# WP_genes = []
# for index, _ in transcriptomic_df.iterrows():
#     try:
#         WP_genes.append(LPD_to_WP_dictionary[index])
#     except:
#         WP_genes.append('Unknown')
#         print(f'{index} does not have a matching known WP Id')

# transcriptomic_df['WP_gene'] = WP_genes

# transcriptomic_df['WP_gene'] = [LPD_to_WP_dictionary[index] for index, _ in transcriptomic_df.iterrows()]

# transcriptomic_df

index = 'Pd630_LPD00003'
print(LPD_to_WP_dictionary[index])

WP_007296166.1


In [46]:
print(len([gene for gene in list(transcriptomic_df.index) if gene not in list(LPD_to_WP_dictionary.keys())]))

15685


In [48]:
yoneda_ids = list(transcriptomic_df.index)
mapping_ids = list(LPD_to_WP_dictionary.keys())

print(yoneda_ids[:20])
print(mapping_ids[:20])

['PD630_LPD15471', 'PD630_LPD07867', 'PD630_LPD08755', 'PD630_LPD14956', 'PD630_LPD13951', 'PD630_LPD02317', 'PD630_LPD10425', 'PD630_LPD13694', 'PD630_LPD05166', 'PD630_LPD02798', 'PD630_LPD10189', 'PD630_LPD12451', 'PD630_LPD02143', 'PD630_LPD06206', 'PD630_LPD12587', 'PD630_LPD07583', 'PD630_LPD04932', 'PD630_LPD13359', 'PD630_LPD08127', 'PD630_LPD02555']
['Pd630_LPD00001', 'Pd630_LPD00002', 'Pd630_LPD00003', 'Pd630_LPD00004', 'Pd630_LPD00005', 'Pd630_LPD00006', 'Pd630_LPD00007', 'Pd630_LPD00008', 'Pd630_LPD00009', 'Pd630_LPD00011', 'Pd630_LPD00012', 'Pd630_LPD00013', 'Pd630_LPD00014', 'Pd630_LPD00015', 'Pd630_LPD00016', 'Pd630_LPD00017', 'Pd630_LPD00018', 'Pd630_LPD00019', 'Pd630_LPD00020', 'Pd630_LPD00021']


# The problem is the capitalization of PD vs Pd

In [41]:
test = list(LPD_to_WP_dictionary.keys())
test.sort(reverse=True)
test

['none',
 'Pd630_LPD17037',
 'Pd630_LPD17036',
 'Pd630_LPD17035',
 'Pd630_LPD17033',
 'Pd630_LPD17032',
 'Pd630_LPD17031',
 'Pd630_LPD17030',
 'Pd630_LPD17029',
 'Pd630_LPD17028',
 'Pd630_LPD17027',
 'Pd630_LPD17025',
 'Pd630_LPD17024',
 'Pd630_LPD17023',
 'Pd630_LPD17021',
 'Pd630_LPD17019',
 'Pd630_LPD17018',
 'Pd630_LPD17016',
 'Pd630_LPD17014',
 'Pd630_LPD17013',
 'Pd630_LPD17012',
 'Pd630_LPD17011',
 'Pd630_LPD17009',
 'Pd630_LPD17005',
 'Pd630_LPD17003',
 'Pd630_LPD17001',
 'Pd630_LPD16195',
 'Pd630_LPD16194',
 'Pd630_LPD16193',
 'Pd630_LPD16192',
 'Pd630_LPD16191',
 'Pd630_LPD16190',
 'Pd630_LPD16189',
 'Pd630_LPD16188',
 'Pd630_LPD16186',
 'Pd630_LPD16179',
 'Pd630_LPD16178',
 'Pd630_LPD16176',
 'Pd630_LPD16174',
 'Pd630_LPD16173',
 'Pd630_LPD16172',
 'Pd630_LPD16171',
 'Pd630_LPD16170',
 'Pd630_LPD16169',
 'Pd630_LPD16165',
 'Pd630_LPD16161',
 'Pd630_LPD16159',
 'Pd630_LPD16157',
 'Pd630_LPD16155',
 'Pd630_LPD16154',
 'Pd630_LPD16153',
 'Pd630_LPD16150',
 'Pd630_LPD16149',
 'P

In [22]:
transcriptomic_df

Unnamed: 0,EVOL40-L-R1,EVOL40-G-R1,WT-G-R1,EVOL33-G-R1,EVOL33-L-R1,EVOL40-H-R1,WT-L-R1,EVOL33-H-R1
PD630_LPD15471,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD07867,56,39,21,25,31,5.39838e+06,44,38
PD630_LPD08755,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD14956,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD13951,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD02317,1816,590,273,348,1321,none,860,884
PD630_LPD10425,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD13694,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD05166,915,997,626,690,553,none,640,669
PD630_LPD02798,145,88,47,41,56,none,60,153


### This is a rewrite of the above code that is more succinct, but takes a lot longer to run

In [21]:
# gene_dictionary_2 = {}

# for x, gene in enumerate(genes):
#     if x % 100 == 0:
#         print(x)
        
#     values = []
#     for condition in conditions:
        
#         row = yoneda_data[(yoneda_data['Measurement Type'] == gene) & (yoneda_data['Line Name']==condition)]
#         try:
#             values.append(float(row.Value[0]))
#         except:
#             values.append('no measurement')
            
            
            
# #             print(row.Value[0])
# #         print(row)
# #         print(yoneda_data[
# #                 (yoneda_data['Measurement Type'] == gene & yoneda_data['Line Name'] == condition)]['Value'])
        
# #     print()