In [None]:
!pip install mygene

Collecting mygene
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Downloading biothings_client-0.4.1-py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.7/46.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biothings-client, mygene
Successfully installed biothings-client-0.4.1 mygene-3.2.2


In [None]:
import pandas as pd
import pickle

# Gene/phenotype table; this cell reads its 'Gene' and 'Phenotype' columns.
df = pd.read_csv('update_df.csv')

# Phenotype code -> gene symbols annotated with that code. Only these three
# codes are collected; any other code found in 'Phenotype' is ignored.
# A gene that occurs on several rows is appended once per row, so the lists
# may contain duplicates.
genes = {'E': [], 'S': [], 'P': []}

for gene, phenotypes in zip(df['Gene'], df['Phenotype']):
    # An empty CSV cell is read as NaN (a float), which has no .split();
    # skip such rows instead of crashing or collecting a NaN "gene".
    if pd.isna(gene) or pd.isna(phenotypes):
        continue

    # 'Phenotype' is a comma-separated list of codes; whitespace around each
    # code is stripped before it is matched against the keys of `genes`.
    for phenotype in str(phenotypes).split(','):
        phenotype = phenotype.strip()
        if phenotype in genes:
            genes[phenotype].append(gene)

print(genes)
with open('gene_symbols_wshared.pkl', 'wb') as file:
    pickle.dump(genes, file)

# Build each set once and reuse it in the set algebra below.
endurance_set = set(genes["E"])
power_set = set(genes["P"])
strength_set = set(genes["S"])

# NOTE(review): "shared" means present in ALL three phenotypes. A gene that is
# annotated with exactly two phenotypes is removed from both "only" groups and
# is not in `shared_genes` either, so it appears in none of the four groups.
# Confirm that this is intended.
endurance_only = endurance_set - power_set - strength_set
power_only = power_set - endurance_set - strength_set
strength_only = strength_set - endurance_set - power_set
shared_genes = endurance_set & power_set & strength_set

gene_symbols_dict = {
    'end only': endurance_only,
    "pow only": power_only,
    "str only": strength_only,
    "shared": shared_genes}
print(gene_symbols_dict)
with open('gene_symbols_dict.pkl', 'wb') as file:
    pickle.dump(gene_symbols_dict, file)

{'E': ['ACE', 'ACTN3', 'ADRB2', 'ADRB2', 'ADRB3', 'AGTR2', 'AQP1', 'AMPD1', 'BDKRB2', 'CDKN1A', 'CKM', 'COL5A1', 'FTO', 'GABPB1', 'GABPB1', 'GALNTL6', 'GSTP1', 'HFE', 'HIF1A', 'MCT1', 'MYBPC3', 'NFATC4', 'NOS3', 'PPARA', 'PPARGC1A', 'PPARGC1B', 'PPP3R1', 'PRDM1', 'RBFOX1', 'SIRT1', 'SPEG', 'TFAM', 'TRPM2', 'TSHR', 'UCP2', 'UCP3', 'VEGFA', 'VEGFR2'], 'S': ['ACE', 'ACTN3', 'CKM', 'GALNTL6', 'HIF1A', 'PPARA', 'PPARGC1A', 'AGT', 'CNTFR', 'CRTAC1', 'IGF1', 'IL6', 'MTHFR', 'PPARG', 'SLC39A8', 'ABHD17C', 'ACTG1', 'ADCY3', 'ADPGK', 'ALDH2', 'ANGPT2', 'AR', 'ARPP21', 'BCDIN3D', 'DHODH', 'GBE1', 'GBF1', 'GLIS3', 'ITPR1', 'KIF1B', 'LRPPRC', 'MLN', 'MMS22L', 'NPIPB6', 'PHACTR1', 'PLEKHB1', 'R3HDM1', 'RASGRF1', 'RMC1', 'TFAP2D', 'ZKSCAN5', 'ZNF608'], 'P': ['ACE', 'ACTN3', 'ADRB2', 'ADRB2', 'AGTR2', 'AMPD1', 'CDKN1A', 'CKM', 'GALNTL6', 'HIF1A', 'NOS3', 'PPARA', 'ACVR1B', 'ADAM15', 'AGRN', 'AGT', 'AKAP6', 'AUTS2', 'BDNF', 'CCT3', 'CNTFR', 'CPNE5', 'CRTAC1', 'CRTC1', 'E2F3', 'FHL2', 'GDF5', 'HSD17B14'

In [None]:
def get_entrez_ids(gene_symbols):
    """Map human gene symbols to Entrez gene IDs via MyGene.info.

    Parameters
    ----------
    gene_symbols : iterable of str
        Gene symbols to look up. A ``set``/``frozenset`` is sorted first so the
        order of the returned list is the same on every run; any other
        iterable keeps its own order (duplicates included).

    Returns
    -------
    list of str
        One Entrez ID per input symbol that could be resolved, in input order.
        Every ID is a string, including the manually mapped ones. Symbols that
        cannot be resolved are reported on stdout and left out of the result.
    """
    # Function-scope import: mygene is only required when the lookup is run.
    import mygene

    # Set iteration order depends on string hashing and can differ between
    # kernel sessions, so give sets a stable order. Materialising the input
    # also makes it safe to iterate more than once below.
    if isinstance(gene_symbols, (set, frozenset)):
        symbols = sorted(gene_symbols, key=str)
    else:
        symbols = list(gene_symbols)

    if symbols:
        mg = mygene.MyGeneInfo()
        result = mg.querymany(symbols, scopes="symbol", fields="entrezgene", species="human")
    else:
        # Nothing to look up; skip the web request entirely.
        result = []

    # Hits without an 'entrezgene' field (e.g. not-found entries) are dropped.
    # IDs are normalised to str so the result never mixes ints and strings.
    symbol_to_entrez = {r['query']: str(r['entrezgene']) for r in result if 'entrezgene' in r}

    print("Symbol to Entrez mapping:")
    print(symbol_to_entrez)

    # Symbols that the symbol-scope query does not resolve are mapped manually.
    # Stored as strings to match the IDs obtained from MyGene above.
    manual_mappings = {
        "MCT1": "6566",
        "VEGFR2": "3791",
        'NRXN3A': "9369"
    }
    symbol_to_entrez.update(manual_mappings)

    missing_symbols = [gene for gene in symbols if gene not in symbol_to_entrez]
    print(f"Remaining missing symbols: {missing_symbols}")

    mapped_genes = [symbol_to_entrez[gene] for gene in symbols if gene in symbol_to_entrez]
    print(f"Final mapped genes: {mapped_genes}")
    return mapped_genes


In [None]:

# Translate each phenotype-specific symbol set (and the shared set) into
# Entrez IDs, show the result, and persist it for later use.
symbol_sets_by_label = {
    "endurance": endurance_only,
    "power": power_only,
    "strength": strength_only,
    "shared": shared_genes,
}
entrez_lists = {
    label: get_entrez_ids(symbols)
    for label, symbols in symbol_sets_by_label.items()
}
print(entrez_lists)
with open('entrez_lists.pkl', 'wb') as out_handle:
    pickle.dump(entrez_lists, out_handle)

INFO:biothings.client:querying 1-24 ...
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.
INFO:biothings.client:querying 1-23 ...


Symbol to Entrez mapping:
{'SIRT1': '23411', 'GABPB1': '2553', 'TFAM': '7019', 'NFATC4': '4776', 'TSHR': '7253', 'GSTP1': '2950', 'RBFOX1': '54715', 'BDKRB2': '624', 'UCP3': '7352', 'ADRB3': '155', 'PPP3R1': '5534', 'TRPM2': '7226', 'UCP2': '7351', 'PRDM1': '639', 'FTO': '79068', 'MYBPC3': '4607', 'COL5A1': '1289', 'HFE': '3077', 'VEGFA': '7422', 'SPEG': '10290', 'AQP1': '358', 'PPARGC1B': '133522'}
Remaining missing symbols: []
Final mapped genes: ['23411', '2553', '7019', '4776', '7253', '2950', '54715', '624', '7352', '155', '5534', '7226', '7351', '639', '79068', 3791, '4607', '1289', '3077', '7422', 6566, '10290', '358', '133522']


INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.
INFO:biothings.client:querying 1-27 ...


Symbol to Entrez mapping:
{'ZNF568': '374900', 'CCT3': '7203', 'FHL2': '2274', 'UBR5': '51366', 'CPNE5': '57699', 'HSD17B14': '51171', 'PIEZO1': '9780', 'NUP210': '23225', 'SOD2': '6648', 'ILRUN': '64771', 'TRHR': '7201', 'AUTS2': '26053', 'E2F3': '1871', 'ACVR1B': '91', 'GDF5': '8200', 'AGRN': '375790', 'IGF2': '3481', 'AKAP6': '9472', 'ADAM15': '8751', 'BDNF': '627', 'IGSF3': '3321', 'CRTC1': '23373'}
Remaining missing symbols: []
Final mapped genes: [9369, '374900', '7203', '2274', '51366', '57699', '51171', '9780', '23225', '6648', '64771', '7201', '26053', '1871', '91', '8200', '375790', '3481', '9472', '8751', '627', '3321', '23373']


INFO:biothings.client:Finished.
INFO:biothings.client:querying 1-6 ...


Symbol to Entrez mapping:
{'AR': '367', 'ANGPT2': '285', 'ALDH2': '217', 'PHACTR1': '221692', 'GBF1': '8729', 'ACTG1': '71', 'ITPR1': '3708', 'NPIPB6': '728741', 'ARPP21': '10777', 'PLEKHB1': '58473', 'ABHD17C': '58489', 'ADCY3': '109', 'ZNF608': '57507', 'LRPPRC': '10128', 'TFAP2D': '83741', 'ZKSCAN5': '23660', 'MMS22L': '253714', 'DHODH': '1723', 'KIF1B': '23095', 'RASGRF1': '5923', 'R3HDM1': '23518', 'MLN': '4295', 'GLIS3': '169792', 'BCDIN3D': '144233', 'ADPGK': '83440', 'GBE1': '2632', 'RMC1': '29919'}
Remaining missing symbols: []
Final mapped genes: ['367', '285', '217', '221692', '8729', '71', '3708', '728741', '10777', '58473', '58489', '109', '57507', '10128', '83741', '23660', '253714', '1723', '23095', '5923', '23518', '4295', '169792', '144233', '83440', '2632', '29919']


INFO:biothings.client:Finished.


Symbol to Entrez mapping:
{'HIF1A': '3091', 'ACE': '1636', 'ACTN3': '89', 'GALNTL6': '442117', 'CKM': '1158', 'PPARA': '5465'}
Remaining missing symbols: []
Final mapped genes: ['3091', '1636', '89', '442117', '1158', '5465']
{'endurance': ['23411', '2553', '7019', '4776', '7253', '2950', '54715', '624', '7352', '155', '5534', '7226', '7351', '639', '79068', 3791, '4607', '1289', '3077', '7422', 6566, '10290', '358', '133522'], 'power': [9369, '374900', '7203', '2274', '51366', '57699', '51171', '9780', '23225', '6648', '64771', '7201', '26053', '1871', '91', '8200', '375790', '3481', '9472', '8751', '627', '3321', '23373'], 'strength': ['367', '285', '217', '221692', '8729', '71', '3708', '728741', '10777', '58473', '58489', '109', '57507', '10128', '83741', '23660', '253714', '1723', '23095', '5923', '23518', '4295', '169792', '144233', '83440', '2632', '29919'], 'shared': ['3091', '1636', '89', '442117', '1158', '5465']}


In [None]:
# Same lookup as for the exclusive groups, but on the full per-phenotype
# symbol lists in `genes` (keys 'E', 'S', 'P').
new_entrez_lists = {
    code: get_entrez_ids(symbols)
    for code, symbols in genes.items()
}
print(new_entrez_lists)
with open('entrez_lists_wshared.pkl', 'wb') as out_handle:
    pickle.dump(new_entrez_lists, out_handle)

INFO:biothings.client:querying 1-38 ...
INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.
INFO:biothings.client:querying 1-42 ...


Symbol to Entrez mapping:
{'ACE': '1636', 'ACTN3': '89', 'ADRB2': '154', 'ADRB3': '155', 'AGTR2': '186', 'AQP1': '358', 'AMPD1': '270', 'BDKRB2': '624', 'CDKN1A': '1026', 'CKM': '1158', 'COL5A1': '1289', 'FTO': '79068', 'GABPB1': '2553', 'GALNTL6': '442117', 'GSTP1': '2950', 'HFE': '3077', 'HIF1A': '3091', 'MYBPC3': '4607', 'NFATC4': '4776', 'NOS3': '4846', 'PPARA': '5465', 'PPARGC1A': '10891', 'PPARGC1B': '133522', 'PPP3R1': '5534', 'PRDM1': '639', 'RBFOX1': '54715', 'SIRT1': '23411', 'SPEG': '10290', 'TFAM': '7019', 'TRPM2': '7226', 'TSHR': '7253', 'UCP2': '7351', 'UCP3': '7352', 'VEGFA': '7422'}
Remaining missing symbols: []
Final mapped genes: ['1636', '89', '154', '154', '155', '186', '358', '270', '624', '1026', '1158', '1289', '79068', '2553', '2553', '442117', '2950', '3077', '3091', 6566, '4607', '4776', '4846', '5465', '10891', '133522', '5534', '639', '54715', '23411', '10290', '7019', '7226', '7253', '7351', '7352', '7422', 3791]


INFO:biothings.client:Finished.
INFO:biothings.client:querying 1-45 ...


Symbol to Entrez mapping:
{'ACE': '1636', 'ACTN3': '89', 'CKM': '1158', 'GALNTL6': '442117', 'HIF1A': '3091', 'PPARA': '5465', 'PPARGC1A': '10891', 'AGT': '183', 'CNTFR': '1271', 'CRTAC1': '55118', 'IGF1': '3479', 'IL6': '3569', 'MTHFR': '4524', 'PPARG': '5468', 'SLC39A8': '64116', 'ABHD17C': '58489', 'ACTG1': '71', 'ADCY3': '109', 'ADPGK': '83440', 'ALDH2': '217', 'ANGPT2': '285', 'AR': '367', 'ARPP21': '10777', 'BCDIN3D': '144233', 'DHODH': '1723', 'GBE1': '2632', 'GBF1': '8729', 'GLIS3': '169792', 'ITPR1': '3708', 'KIF1B': '23095', 'LRPPRC': '10128', 'MLN': '4295', 'MMS22L': '253714', 'NPIPB6': '728741', 'PHACTR1': '221692', 'PLEKHB1': '58473', 'R3HDM1': '23518', 'RASGRF1': '5923', 'RMC1': '29919', 'TFAP2D': '83741', 'ZKSCAN5': '23660', 'ZNF608': '57507'}
Remaining missing symbols: []
Final mapped genes: ['1636', '89', '1158', '442117', '3091', '5465', '10891', '183', '1271', '55118', '3479', '3569', '4524', '5468', '64116', '58489', '71', '109', '83440', '217', '285', '367', '10777

INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.


Symbol to Entrez mapping:
{'ACE': '1636', 'ACTN3': '89', 'ADRB2': '154', 'AGTR2': '186', 'AMPD1': '270', 'CDKN1A': '1026', 'CKM': '1158', 'GALNTL6': '442117', 'HIF1A': '3091', 'NOS3': '4846', 'PPARA': '5465', 'ACVR1B': '91', 'ADAM15': '8751', 'AGRN': '375790', 'AGT': '183', 'AKAP6': '9472', 'AUTS2': '26053', 'BDNF': '627', 'CCT3': '7203', 'CNTFR': '1271', 'CPNE5': '57699', 'CRTAC1': '55118', 'CRTC1': '23373', 'E2F3': '1871', 'FHL2': '2274', 'GDF5': '8200', 'HSD17B14': '51171', 'IGF1': '3479', 'IGF2': '3481', 'IGSF3': '3321', 'IL6': '3569', 'ILRUN': '64771', 'MTHFR': '4524', 'NUP210': '23225', 'PIEZO1': '9780', 'PPARG': '5468', 'SLC39A8': '64116', 'SOD2': '6648', 'TRHR': '7201', 'UBR5': '51366', 'ZNF568': '374900'}
Remaining missing symbols: []
Final mapped genes: ['1636', '89', '154', '154', '186', '270', '1026', '1158', '442117', '3091', '4846', '5465', '91', '8751', '375790', '183', '9472', '26053', '627', '7203', '1271', '57699', '55118', '23373', '1871', '2274', '8200', '51171', '3

In [None]:
# For each mapping, print the whole dict followed by the length of every list
# in it: first the per-phenotype lists, then the exclusive/shared groups.
for id_lists in (new_entrez_lists, entrez_lists):
    print(id_lists)
    for ids in id_lists.values():
        print(len(ids))

{'E': ['1636', '89', '154', '154', '155', '186', '358', '270', '624', '1026', '1158', '1289', '79068', '2553', '2553', '442117', '2950', '3077', '3091', 6566, '4607', '4776', '4846', '5465', '10891', '133522', '5534', '639', '54715', '23411', '10290', '7019', '7226', '7253', '7351', '7352', '7422', 3791], 'S': ['1636', '89', '1158', '442117', '3091', '5465', '10891', '183', '1271', '55118', '3479', '3569', '4524', '5468', '64116', '58489', '71', '109', '83440', '217', '285', '367', '10777', '144233', '1723', '2632', '8729', '169792', '3708', '23095', '10128', '4295', '253714', '728741', '221692', '58473', '23518', '5923', '29919', '83741', '23660', '57507'], 'P': ['1636', '89', '154', '154', '186', '270', '1026', '1158', '442117', '3091', '4846', '5465', '91', '8751', '375790', '183', '9472', '26053', '627', '7203', '1271', '57699', '55118', '23373', '1871', '2274', '8200', '51171', '3479', '3481', '3321', '3569', '64771', '4524', 9369, '23225', '9780', '5468', '5468', '64116', '6648',