# Transcriptomic and Proteomic based flux predictions
This notebook integrates Transcriptomic measurements into flux profiles

### Imports

In [1]:
import pandas as pd
import cobra

### Load Transcriptomics Data

In [2]:
yoneda_data = pd.read_csv('../../EDD_Yoneda_data/Yoneda_set3_transcriptomics_data.csv')

# remove blank columns
yoneda_data = yoneda_data[[col for col in yoneda_data.columns if 'Unnamed' not in col]]

yoneda_data.head()

Unnamed: 0,Line Name,Measurement Type,Time,Value,Units
0,WT-G-R1,PD630_LPD06575,14,708,counts
1,WT-G-R1,PD630_LPD06576,14,6513,counts
2,WT-G-R1,PD630_LPD00131,14,1015,counts
3,WT-G-R1,PD630_LPD06740,14,289,counts
4,WT-G-R1,PD630_LPD06741,14,1159,counts


### Make a dictionary where keys are measured gene names and the values are lists of  measured values and  add measured values to the lists

In [3]:
# Make dictionary of blank lists
genes = list(set(yoneda_data['Measurement Type']))
print(f'There are {len(genes)} measured genes')
gene_dictionary = {gene:[] for gene in genes}

# Add measured values to the list
conditions = list(set(yoneda_data['Line Name']))
for condition in conditions:
    print(condition)
    condition_df = yoneda_data[yoneda_data['Line Name'] == condition]
    for gene in genes:
        try:
            gene_value = float(condition_df[condition_df['Measurement Type'] == gene]['Value'])
            gene_dictionary[gene].append(gene_value)
        except:
            gene_dictionary[gene].append('none')

There are 15685 measured genes
WT-L-R1
EVOL33-G-R1
EVOL33-L-R1
EVOL40-G-R1
WT-G-R1
EVOL40-H-R1
EVOL33-H-R1
EVOL40-L-R1


### Remove genes that do have multiple measurements for a single run

In [4]:
new_gene_dictionary = {}
for gene, values in gene_dictionary.items():
    if len(values) == 8:
        new_gene_dictionary[gene] = values
    else:
        print(len(values), gene, values)

### Convert the gene dictionary to a dataframe

In [5]:
transcriptomic_df = pd.DataFrame.from_dict(new_gene_dictionary, orient='index', columns=conditions)
transcriptomic_df.head()

Unnamed: 0,WT-L-R1,EVOL33-G-R1,EVOL33-L-R1,EVOL40-G-R1,WT-G-R1,EVOL40-H-R1,EVOL33-H-R1,EVOL40-L-R1
PD630_LPD07585,55,74,56,363,79,5.39838e+06,92,336
PD630_LPD09981,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD04716,5,8,17,13,9,none,284,15
PD630_LPD12853,none,none,none,none,none,5.39838e+06,none,none
PD630_LPD04787,579,269,411,437,334,none,513,694


### Load LPD to WP mapping

In [13]:
gene_mapping_df = pd.read_csv('../gene_converter/LPD_RS_WP_gene_mapping.csv')
print(f'There are {len(gene_mapping_df)} mapped genes')

# remove blank columns
gene_mapping_df = gene_mapping_df[[col for col in gene_mapping_df.columns if 'Unnamed' not in col]]

gene_mapping_df.head()

There are 8390 mapped genes


Unnamed: 0,Gene ID (RS),Gene ID (LPD),Gene ID (WP),Annotation
0,PD630_RS00005,PD630_LPD00001,none,chromosomal replication initiator protein DnaA
1,PD630_RS00010,PD630_LPD00002,WP_005569241.1,DNA polymerase III subunit beta
2,PD630_RS00015,PD630_LPD00003,WP_007296166.1,6-phosphogluconate dehydrogenase
3,PD630_RS00020,PD630_LPD00004,WP_005237760.1,DNA replication and repair protein RecF
4,PD630_RS00025,PD630_LPD00005,WP_005237761.1,hypothetical protein


# Why are there only  8390 mapped genes when there are 15685 measured genes

### Add WP annoation to transcriptomic data

In [24]:
# make dicitonary where keys are LPD IDs and values are WP IDs 
LPD_to_WP_dictionary = {row['Gene ID (LPD)']:row['Gene ID (WP)'] for _, row in gene_mapping_df.iterrows()}


not_matched = 0
WP_genes = []
for index, _ in transcriptomic_df.iterrows():
    try:
        WP_genes.append(LPD_to_WP_dictionary[index])
    except:
        WP_genes.append('Unknown')
        not_matched += 1

transcriptomic_df['WP_gene'] = WP_genes
print(f'There are {not_matched} genes that have an LPD measurement without a WP mapping')
transcriptomic_df.head()

There are 8002 genes that have an LPD measurement without a WP mapping


Unnamed: 0,WT-L-R1,EVOL33-G-R1,EVOL33-L-R1,EVOL40-G-R1,WT-G-R1,EVOL40-H-R1,EVOL33-H-R1,EVOL40-L-R1,WP_gene
PD630_LPD07585,55,74,56,363,79,5.39838e+06,92,336,WP_009475287.1
PD630_LPD09981,none,none,none,none,none,5.39838e+06,none,none,Unknown
PD630_LPD04716,5,8,17,13,9,none,284,15,WP_007298935.1
PD630_LPD12853,none,none,none,none,none,5.39838e+06,none,none,Unknown
PD630_LPD04787,579,269,411,437,334,none,513,694,WP_011593468.1


### Check measured values for genes without known WP ids

In [23]:
unknown_df = transcriptomic_df[transcriptomic_df['WP_gene'] == 'Unknown']
unknown_df.head()

Unnamed: 0,WT-L-R1,EVOL33-G-R1,EVOL33-L-R1,EVOL40-G-R1,WT-G-R1,EVOL40-H-R1,EVOL33-H-R1,EVOL40-L-R1,WP_gene
PD630_LPD09981,none,none,none,none,none,5398380.0,none,none,Unknown
PD630_LPD12853,none,none,none,none,none,5398380.0,none,none,Unknown
PD630_LPD09197,92,32,53,52,38,5398380.0,111,174,Unknown
PD630_LPD12755,none,none,none,none,none,5398380.0,none,none,Unknown
PD630_LPD11317,none,none,none,none,none,5398380.0,none,none,Unknown


### Determine how many genes are in the genome scale model and how many of them have known mappings to LPD data

### Load model

In [18]:
model = cobra.io.read_sbml_model("../GSMs/Ropacus_annotated_curated_with_phenol_custom_biomass.xml")
model

0,1
Name,ropacus_annotated_curated
Memory address,0x07f8745482510
Number of metabolites,1583
Number of reactions,2385
Number of groups,0
Objective expression,1.0*Growth - 1.0*Growth_reverse_699ae
Compartments,"cytosol, periplasm, extracellular space"


### Get list of genes

In [26]:
model_genes = [g.id for g in model.genes]
print(f'The genome scale model has {len(model_genes)} genes')

The genome scale model has 1576 genes


### Determine how many genes have known LPD numbers

In [36]:
WP_to_LPD_dictionary = {v.replace('.', '_'):k for k,v in LPD_to_WP_dictionary.items()}

for k,v in list(WP_to_LPD_dictionary.items())[:5]:
    print(k, v)

none PD630_LPD17031
WP_005569241_1 PD630_LPD00002
WP_007296166_1 PD630_LPD00003
WP_005237760_1 PD630_LPD00004
WP_005237761_1 PD630_LPD00005


In [38]:
model_genes[:10]

['WP_187300246_1',
 'WP_025432775_1',
 'WP_005248578_1',
 'WP_025433613_1',
 'WP_005249637_1',
 'WP_005248999_1',
 'WP_025433301_1',
 'WP_005246696_1',
 'WP_005244822_1',
 'WP_005250095_1']

In [37]:
# Switch keys and values of the mapping dictionary
WP_to_LPD_dictionary = {v:k for k,v in LPD_to_WP_dictionary.items()}


mapped_model_genes = 0
for g in model_genes:
    try:
        WP_to_LPD_dictionary[g]
    except:
        mapped_model_genes += 1
        
print(mapped_model_genes)
#     print(WP_to_LPD_dictionary[g])
# known_LPD 
# for g in model.genes:
    

1576


In [8]:
print(len([gene for gene in list(transcriptomic_df.index) if gene not in list(LPD_to_WP_dictionary.keys())]))

8002


In [9]:
yoneda_ids = list(transcriptomic_df.index)
mapping_ids = list(LPD_to_WP_dictionary.keys())

print(yoneda_ids[:20])
print(mapping_ids[:20])

['PD630_LPD07585', 'PD630_LPD09981', 'PD630_LPD04716', 'PD630_LPD12853', 'PD630_LPD04787', 'PD630_LPD09197', 'PD630_LPD02409', 'PD630_LPD04348', 'PD630_LPD14036', 'PD630_LPD00157', 'PD630_LPD04430', 'PD630_LPD03302', 'PD630_LPD12755', 'PD630_LPD11317', 'PD630_LPD08485', 'PD630_LPD08255', 'PD630_LPD03078', 'PD630_LPD16010', 'PD630_LPD10214', 'PD630_LPD11102']
['PD630_LPD00001', 'PD630_LPD00002', 'PD630_LPD00003', 'PD630_LPD00004', 'PD630_LPD00005', 'PD630_LPD00006', 'PD630_LPD00007', 'PD630_LPD00008', 'PD630_LPD00009', 'PD630_LPD00011', 'PD630_LPD00012', 'PD630_LPD00013', 'PD630_LPD00014', 'PD630_LPD00015', 'PD630_LPD00016', 'PD630_LPD00017', 'PD630_LPD00018', 'PD630_LPD00019', 'PD630_LPD00020', 'PD630_LPD00021']


In [10]:
test = list(LPD_to_WP_dictionary.keys())
test.sort(reverse=True)
test

['none',
 'PD630_LPD17037',
 'PD630_LPD17036',
 'PD630_LPD17035',
 'PD630_LPD17033',
 'PD630_LPD17032',
 'PD630_LPD17031',
 'PD630_LPD17030',
 'PD630_LPD17029',
 'PD630_LPD17028',
 'PD630_LPD17027',
 'PD630_LPD17025',
 'PD630_LPD17024',
 'PD630_LPD17023',
 'PD630_LPD17021',
 'PD630_LPD17019',
 'PD630_LPD17018',
 'PD630_LPD17016',
 'PD630_LPD17014',
 'PD630_LPD17013',
 'PD630_LPD17012',
 'PD630_LPD17011',
 'PD630_LPD17009',
 'PD630_LPD17005',
 'PD630_LPD17003',
 'PD630_LPD17001',
 'PD630_LPD16195',
 'PD630_LPD16194',
 'PD630_LPD16193',
 'PD630_LPD16192',
 'PD630_LPD16191',
 'PD630_LPD16190',
 'PD630_LPD16189',
 'PD630_LPD16188',
 'PD630_LPD16186',
 'PD630_LPD16179',
 'PD630_LPD16178',
 'PD630_LPD16176',
 'PD630_LPD16174',
 'PD630_LPD16173',
 'PD630_LPD16172',
 'PD630_LPD16171',
 'PD630_LPD16170',
 'PD630_LPD16169',
 'PD630_LPD16165',
 'PD630_LPD16161',
 'PD630_LPD16159',
 'PD630_LPD16157',
 'PD630_LPD16155',
 'PD630_LPD16154',
 'PD630_LPD16153',
 'PD630_LPD16150',
 'PD630_LPD16149',
 'P

In [11]:
transcriptomic_df

Unnamed: 0,WT-L-R1,EVOL33-G-R1,EVOL33-L-R1,EVOL40-G-R1,WT-G-R1,EVOL40-H-R1,EVOL33-H-R1,EVOL40-L-R1,WP_gene
PD630_LPD07585,55,74,56,363,79,5.39838e+06,92,336,WP_009475287.1
PD630_LPD09981,none,none,none,none,none,5.39838e+06,none,none,Unknown
PD630_LPD04716,5,8,17,13,9,none,284,15,WP_007298935.1
PD630_LPD12853,none,none,none,none,none,5.39838e+06,none,none,Unknown
PD630_LPD04787,579,269,411,437,334,none,513,694,WP_011593468.1
PD630_LPD09197,92,32,53,52,38,5.39838e+06,111,174,Unknown
PD630_LPD02409,11,11,8,17,8,none,11,14,WP_009479238.1
PD630_LPD04348,13,48,49,79,32,none,81,41,WP_016879889.1
PD630_LPD14036,102,136,85,196,119,5.39838e+06,108,188,WP_009474407.1
PD630_LPD00157,1374,88,143,179,167,none,321,386,WP_005237927.1


### This is a rewrite of the above code that is more succinct, but takes a lot longer to run

In [12]:
# gene_dictionary_2 = {}

# for x, gene in enumerate(genes):
#     if x % 100 == 0:
#         print(x)
        
#     values = []
#     for condition in conditions:
        
#         row = yoneda_data[(yoneda_data['Measurement Type'] == gene) & (yoneda_data['Line Name']==condition)]
#         try:
#             values.append(float(row.Value[0]))
#         except:
#             values.append('no measurement')
            
            
            
# #             print(row.Value[0])
# #         print(row)
# #         print(yoneda_data[
# #                 (yoneda_data['Measurement Type'] == gene & yoneda_data['Line Name'] == condition)]['Value'])
        
# #     print()