# Transcriptomic and Proteomic based flux predictions
This notebook integrates Transcriptomic measurements into flux profiles

### Imports

In [1]:
import pandas as pd
import cobra

### Load Transcriptomics Data

In [2]:
yoneda_data = pd.read_csv('../../EDD_Yoneda_data/Yoneda_set3_transcriptomics_data.csv')

# remove blank columns
yoneda_data = yoneda_data[[col for col in yoneda_data.columns if 'Unnamed' not in col]]

yoneda_data.head()

Unnamed: 0,Line Name,Measurement Type,Time,Value,Units
0,WT-G-R1,PD630_LPD06575,14,708,counts
1,WT-G-R1,PD630_LPD06576,14,6513,counts
2,WT-G-R1,PD630_LPD00131,14,1015,counts
3,WT-G-R1,PD630_LPD06740,14,289,counts
4,WT-G-R1,PD630_LPD06741,14,1159,counts


In [3]:
transcriptomics_genes = list(set(yoneda_data['Measurement Type']))
print(f'There are {len(transcriptomics_genes)} genes in the transcriptomics data set (LPD)')
transcriptomics_genes[:5]

There are 15685 genes in the transcriptomics data set (LPD)


['PD630_LPD06236',
 'PD630_LPD08532',
 'PD630_LPD11098',
 'PD630_LPD02859',
 'PD630_LPD01602']

### Load Model Data

In [4]:
model = cobra.io.read_sbml_model("../GSMs/Ropacus_annotated_curated_with_phenol_custom_biomass.xml")
model_genes = [g.id for g in model.genes]
print(f'There are {len(model_genes)} genes in the model (WP)')
model_genes[:5]

There are 1576 genes in the model (WP)


['WP_187300246_1',
 'WP_025432775_1',
 'WP_005248578_1',
 'WP_025433613_1',
 'WP_005249637_1']

### Load gene mapping data frame

In [5]:
gene_mapping_df = pd.read_csv('../gene_converter/LPD_RS_WP_gene_mapping.csv')
print(f'There are {len(gene_mapping_df)} mapped genes')
gene_mapping_df.head()

There are 7092 mapped genes


Unnamed: 0.1,Unnamed: 0,Gene ID (RS),Gene ID (LPD),Gene ID (WP),Annotation
0,1,PD630_RS00010,PD630_LPD00002,WP_005569241_1,DNA polymerase III subunit beta
1,2,PD630_RS00015,PD630_LPD00003,WP_007296166_1,6-phosphogluconate dehydrogenase
2,3,PD630_RS00020,PD630_LPD00004,WP_005237760_1,DNA replication and repair protein RecF
3,4,PD630_RS00025,PD630_LPD00005,WP_005237761_1,hypothetical protein
4,5,PD630_RS00030,PD630_LPD00006,WP_005256534_1,metal-dependent hydrolase


#### Remove rows that have unknown values for LPD or WP IDs

### Make gene mapping dictionary

In [7]:
WP_to_LPD_dictionary = {row['Gene ID (WP)']:row['Gene ID (LPD)'] for _, row in gene_mapping_df.iterrows()}

print(f'There are {len(WP_to_LPD_dictionary)} mapped genes')

for k in list(WP_to_LPD_dictionary.keys())[:5]:
    print(f'{k} → {WP_to_LPD_dictionary[k]}')

There are 6893 mapped genes
WP_005569241_1 → PD630_LPD00002
WP_007296166_1 → PD630_LPD00003
WP_005237760_1 → PD630_LPD00004
WP_005237761_1 → PD630_LPD00005
WP_005256534_1 → PD630_LPD00006


### Determine how many WP genes in the model have mappings to LPD

In [8]:
mapped_model_genes = [g for g in model_genes if g in WP_to_LPD_dictionary.keys()]
unmapped_model_genes = [g for g in model_genes if g not in WP_to_LPD_dictionary.keys()]

print(f'The model has {len(mapped_model_genes)} genes that map to LPD genes')
print(f'The model has {len(unmapped_model_genes)} genes that do NOT map to LPD genes')

The model has 196 genes that map to LPD genes
The model has 1380 genes that do NOT map to LPD genes


In [13]:
transcriptomic_mapped_model_genes = [g for g in mapped_model_genes if g in WP_to_LPD_dictionary.keys()]
print(f'The model has {len(transcriptomic_mapped_model_genes)} genes that map to LPD genes')

The model has 196 genes that map to LPD genes


In [12]:
# transcriptomic_mapped_model_genes

In [16]:
model_mapped_genes = [g for g in model_genes if g in mapped_model_genes]
print(len(model_mapped_genes))

196


In [21]:
transcriptomic_mapped_genes = [g for g in transcriptomics_genes if g in WP_to_LPD_dictionary.values()]
len(transcriptomic_mapped_genes)

6850

In [22]:
7092-6850

242

In [23]:
15685-6850

8835

### Make a dictionary where keys are measured gene names and the values are lists of  measured values and  add measured values to the lists

In [None]:
# Make dictionary of blank lists
genes = list(set(yoneda_data['Measurement Type']))
print(f'There are {len(genes)} measured genes')
gene_dictionary = {gene:[] for gene in genes}

# Add measured values to the list
conditions = list(set(yoneda_data['Line Name']))
for condition in conditions:
    print(condition)
    condition_df = yoneda_data[yoneda_data['Line Name'] == condition]
    for gene in genes:
        try:
            gene_value = float(condition_df[condition_df['Measurement Type'] == gene]['Value'])
            gene_dictionary[gene].append(gene_value)
        except:
            gene_dictionary[gene].append('none')

### Remove genes that do have multiple measurements for a single run

In [None]:
new_gene_dictionary = {}
for gene, values in gene_dictionary.items():
    if len(values) == 8:
        new_gene_dictionary[gene] = values
    else:
        print(len(values), gene, values)

### Convert the gene dictionary to a dataframe

In [None]:
transcriptomic_df = pd.DataFrame.from_dict(new_gene_dictionary, orient='index', columns=conditions)
transcriptomic_df.head()

### Load LPD to WP mapping

In [None]:
gene_mapping_df = pd.read_csv('../gene_converter/LPD_RS_WP_gene_mapping.csv')
print(f'There are {len(gene_mapping_df)} mapped genes')

# remove blank columns
gene_mapping_df = gene_mapping_df[[col for col in gene_mapping_df.columns if 'Unnamed' not in col]]

gene_mapping_df.head()

### Add WP annoation to transcriptomic data

In [None]:
# make dicitonary where keys are LPD IDs and values are WP IDs 
LPD_to_WP_dictionary = {row['Gene ID (LPD)']:row['Gene ID (WP)'] for _, row in gene_mapping_df.iterrows()}


not_matched = 0
WP_genes = []
for index, _ in transcriptomic_df.iterrows():
    try:
        WP_genes.append(LPD_to_WP_dictionary[index])
    except:
        WP_genes.append('Unknown')
        not_matched += 1

transcriptomic_df['WP_gene'] = WP_genes
print(f'There are {not_matched} genes that have an LPD measurement without a WP mapping')
transcriptomic_df.head()

### Check measured values for genes without known WP ids

In [None]:
unknown_df = transcriptomic_df[transcriptomic_df['WP_gene'] == 'Unknown']
unknown_df.head()

### Determine how many genes are in the genome scale model and how many of them have known mappings to LPD data

### Load model

In [None]:
model = cobra.io.read_sbml_model("../GSMs/Ropacus_annotated_curated_with_phenol_custom_biomass.xml")
model

### Get list of genes

In [None]:
model_genes = [g.id for g in model.genes]
print(f'The genome scale model has {len(model_genes)} genes')

### Determine how many genes have known LPD numbers

In [None]:
WP_to_LPD_dictionary = {v.replace('.', '_'):k for k,v in LPD_to_WP_dictionary.items()}

for k,v in list(WP_to_LPD_dictionary.items())[:5]:
    print(k, v)

In [None]:
model_genes[:10]

In [None]:
# Switch keys and values of the mapping dictionary
WP_to_LPD_dictionary = {v:k for k,v in LPD_to_WP_dictionary.items()}


mapped_model_genes = 0
for g in model_genes:
    try:
        WP_to_LPD_dictionary[g]
    except:
        mapped_model_genes += 1
        
print(mapped_model_genes)
#     print(WP_to_LPD_dictionary[g])
# known_LPD 
# for g in model.genes:
    

In [None]:
print(len([gene for gene in list(transcriptomic_df.index) if gene not in list(LPD_to_WP_dictionary.keys())]))

In [None]:
yoneda_ids = list(transcriptomic_df.index)
mapping_ids = list(LPD_to_WP_dictionary.keys())

print(yoneda_ids[:20])
print(mapping_ids[:20])

In [None]:
test = list(LPD_to_WP_dictionary.keys())
test.sort(reverse=True)
test

In [None]:
transcriptomic_df

### This is a rewrite of the above code that is more succinct, but takes a lot longer to run

In [None]:
# gene_dictionary_2 = {}

# for x, gene in enumerate(genes):
#     if x % 100 == 0:
#         print(x)
        
#     values = []
#     for condition in conditions:
        
#         row = yoneda_data[(yoneda_data['Measurement Type'] == gene) & (yoneda_data['Line Name']==condition)]
#         try:
#             values.append(float(row.Value[0]))
#         except:
#             values.append('no measurement')
            
            
            
# #             print(row.Value[0])
# #         print(row)
# #         print(yoneda_data[
# #                 (yoneda_data['Measurement Type'] == gene & yoneda_data['Line Name'] == condition)]['Value'])
        
# #     print()