In [14]:
import sys
import pandas as pd
import numpy as np
from multiprocess import Pool


class Hugoify:
    """Fill in missing Hugo gene symbols of a table using its Entrez gene ids.

    The table must have a ``Hugo_Symbol`` column (the one that gets filled in)
    and an ``Entrez_Gene_Id`` column (the id sent to the lookup function).

    Parameters
    ----------
    data : str or pandas.DataFrame
        Path to an Excel file (loaded with ``pandas.read_excel``) or an
        already-loaded DataFrame. A DataFrame is stored as-is, not copied, so
        ``get_missing_ids`` updates the caller's frame.
    num_workers : int, default 7
        Number of worker processes used to run the lookups.

    Raises
    ------
    TypeError
        If ``data`` is neither a string nor a DataFrame. ``TypeError`` is a
        subclass of ``Exception``, so existing ``except Exception`` handlers
        keep working.
    """

    # Column names the class reads from / writes to.
    SYMBOL_COLUMN = 'Hugo_Symbol'
    ENTREZ_COLUMN = 'Entrez_Gene_Id'

    def __init__(self, data, num_workers=7):
        # Honour the caller's value instead of a hard-coded worker count.
        self.num_workers = num_workers
        if isinstance(data, str):
            # A string is treated as the path of an Excel workbook.
            self.df = pd.read_excel(data)
        elif isinstance(data, pd.DataFrame):
            self.df = data
        else:
            raise TypeError("Must be a DataFrame object or a path to an excel file")

    def get_missing_ids(self):
        """Look up a symbol for every row whose ``Hugo_Symbol`` is missing.

        Each missing row is described by a dict ``{'index': <row position>,
        'enterez': <Entrez gene id>}`` and handed to the module-level
        ``get_id`` function in a pool of ``num_workers`` processes. Whatever
        ``get_id`` returns is written into ``Hugo_Symbol`` for that row; only
        ``str`` results are counted as "found" in the printed summary.

        Returns
        -------
        pandas.DataFrame
            ``self.df`` (the same object), with the symbol column updated.
        """
        symbols = self.df[self.SYMBOL_COLUMN]
        entrez_ids = self.df[self.ENTREZ_COLUMN]

        # Row *positions* (not index labels) are recorded so that the write-back
        # below is correct for any index, not only the default RangeIndex.
        missing = [
            {'index': position, 'enterez': entrez}
            for position, (is_missing, entrez)
            in enumerate(zip(symbols.isna(), entrez_ids))
            if is_missing
        ]

        if not missing:
            # Nothing to look up: do not spawn worker processes at all.
            print("Found ", 0, "/", 0, "of the missing values")
            return self.df

        pool = Pool(self.num_workers)
        try:
            hugo_ids = pool.map(get_id, missing)
        finally:
            # Always release the worker processes, even if a lookup raised.
            pool.terminate()
            pool.join()

        ## Change Hugo Ids to new ids
        updated = symbols.copy()
        if pd.api.types.is_float_dtype(updated):
            # An all-missing symbol column is float64; make it able to hold strings.
            updated = updated.astype(object)
        for entry, hugo in zip(missing, hugo_ids):
            updated.iloc[entry['index']] = hugo
        self.df[self.SYMBOL_COLUMN] = updated

        number_found = sum(isinstance(hugo, str) for hugo in hugo_ids)
        print("Found ", number_found, "/", len(missing), "of the missing values")
        return self.df

        
def get_id(missing):
    """Fetch the gene symbol shown by NCBI Gene for one Entrez gene id.

    Parameters
    ----------
    missing : dict
        Record with an ``'enterez'`` key (spelling kept for compatibility with
        callers) holding the Entrez gene id. Other keys are ignored.

    Returns
    -------
    str or float
        The first child of the first ``<dd class="noline">`` element of the
        NCBI Gene search page, stripped of surrounding whitespace. ``nan`` is
        returned when the id is missing, the request fails or does not answer
        with status 200, or no non-empty symbol is present in the page.
    """
    # Imports are function-local in the original; presumably so the function is
    # self-contained when shipped to worker processes -- kept that way.
    import numpy as np

    URL = 'https://www.ncbi.nlm.nih.gov/gene/'
    REQUEST_TIMEOUT_SECONDS = 30

    entrez = missing['enterez']
    if entrez is None or (isinstance(entrez, float) and np.isnan(entrez)):
        # No id to search for; avoid querying the server for "nan"/"None".
        return np.nan
    if isinstance(entrez, float) and entrez.is_integer():
        # Ids read from a float column look like 10357.0; search for 10357.
        entrez = int(entrez)
    term = str(entrez).strip()
    if not term:
        return np.nan

    import requests
    from bs4 import BeautifulSoup

    def get_text(page):
        """Extract the symbol from a fetched page, or nan if it is absent."""
        soup = BeautifulSoup(page.content, 'html.parser')

        # The symbol is taken from the first <dd class="noline"> element.
        node = soup.find("dd", {"class": "noline"})
        if node is None or not node.contents:
            return np.nan
        hugo = str(node.contents[0]).strip()
        if not hugo:
            return np.nan

        return hugo

    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.google.com/search?q=genecards+729&rlz=1C1CHBF_enUS776US776&oq=genecards+729&aqs=chrome..69i57j69i64j69i60l3.9462j0j7&sourceid=chrome&ie=UTF-8'
    })

    try:
        # `params` builds (and URL-encodes) the "?term=<id>" query string.
        page = requests.get(
            URL,
            params={'term': term},
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException:
        # A single failed request must not abort the whole batch of lookups;
        # report it the same way as a non-200 answer.
        return np.nan

    if page.status_code != 200:
        return np.nan
    return get_text(page)

In [15]:
# Build a Hugoify instance from the test Excel workbook; no worker count is
# passed, so the constructor's default applies.
# NOTE(review): this is an absolute, machine-specific path -- consider deriving
# it from a configurable data directory so the notebook runs elsewhere.
h = Hugoify("C:/Users/NathanGrant/Programs/TripodsREU/data/uvm_tcga_pan_can_atlas_2018/data_RNA_Seq_v2_expression_median_hugoify_test.xlsx")

In [16]:
# Show up to the first 35 rows of the frame held by the Hugoify instance.
h.df.head(35)

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-RZ-AB0B-01,TCGA-V3-A9ZX-01,TCGA-V3-A9ZY-01,TCGA-V4-A9E5-01,TCGA-V4-A9E7-01,TCGA-V4-A9E8-01,TCGA-V4-A9E9-01,TCGA-V4-A9EA-01,...,TCGA-WC-A885-01,TCGA-WC-A888-01,TCGA-WC-A88A-01,TCGA-WC-AA9A-01,TCGA-WC-AA9E-01,TCGA-YZ-A980-01,TCGA-YZ-A982-01,TCGA-YZ-A983-01,TCGA-YZ-A984-01,TCGA-YZ-A985-01
0,,100130426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,100133144,7.9418,0.0219,0.0,2.9398,1.1899,0.0,0.7273,0.0,...,0.0,0.0,0.6219,1.3984,15.3168,4.1389,1.7699,3.2451,0.0,0.0
2,,100134869,4.8264,2.1699,0.7299,2.607,4.7598,1.9773,4.3636,4.5249,...,4.2702,5.3313,2.4876,9.8639,4.6212,7.7805,0.0,10.9175,0.9279,3.403
3,,10357,91.0674,91.2986,29.1022,20.6022,55.0171,34.4834,32.2545,8.8462,...,39.3498,76.8241,70.2985,90.9479,86.296,58.9273,39.6018,59.2057,31.3802,30.298
4,,10431,964.76,869.041,640.146,780.507,1287.52,1088.48,879.273,558.823,...,763.296,1105.1,970.149,1436.41,951.706,801.069,687.611,717.365,806.309,637.494
5,,136542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,,155060,174.668,102.466,93.4307,33.2805,93.4107,68.2155,21.0909,54.2986,...,35.229,117.289,56.592,165.65,192.734,174.27,54.8673,183.498,33.4029,46.5075
7,,26823,0.5107,0.0,0.0,0.0,0.595,0.0,0.0,0.0,...,0.0,0.7616,0.0,0.4693,1.7723,0.411,0.0,0.0,0.0,0.0
8,,280660,0.5107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.8557,0.0
9,,317712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Run the symbol lookups. get_missing_ids() returns h.df itself, so `df` and
# `h.df` refer to the same, now-updated, DataFrame.
df = h.get_missing_ids()

Found  17 / 29 of the missing values


In [18]:
# Show up to the first 35 rows after the lookups, to compare with the preview above.
df.head(35)

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-RZ-AB0B-01,TCGA-V3-A9ZX-01,TCGA-V3-A9ZY-01,TCGA-V4-A9E5-01,TCGA-V4-A9E7-01,TCGA-V4-A9E8-01,TCGA-V4-A9E9-01,TCGA-V4-A9EA-01,...,TCGA-WC-A885-01,TCGA-WC-A888-01,TCGA-WC-A88A-01,TCGA-WC-AA9A-01,TCGA-WC-AA9E-01,TCGA-YZ-A980-01,TCGA-YZ-A982-01,TCGA-YZ-A983-01,TCGA-YZ-A984-01,TCGA-YZ-A985-01
0,,100130426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,100133144,7.9418,0.0219,0.0,2.9398,1.1899,0.0,0.7273,0.0,...,0.0,0.0,0.6219,1.3984,15.3168,4.1389,1.7699,3.2451,0.0,0.0
2,UBE2Q2P2,100134869,4.8264,2.1699,0.7299,2.607,4.7598,1.9773,4.3636,4.5249,...,4.2702,5.3313,2.4876,9.8639,4.6212,7.7805,0.0,10.9175,0.9279,3.403
3,HMGB1P1,10357,91.0674,91.2986,29.1022,20.6022,55.0171,34.4834,32.2545,8.8462,...,39.3498,76.8241,70.2985,90.9479,86.296,58.9273,39.6018,59.2057,31.3802,30.298
4,,10431,964.76,869.041,640.146,780.507,1287.52,1088.48,879.273,558.823,...,763.296,1105.1,970.149,1436.41,951.706,801.069,687.611,717.365,806.309,637.494
5,,136542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,LOC155060,155060,174.668,102.466,93.4307,33.2805,93.4107,68.2155,21.0909,54.2986,...,35.229,117.289,56.592,165.65,192.734,174.27,54.8673,183.498,33.4029,46.5075
7,RNU12-2P,26823,0.5107,0.0,0.0,0.0,0.595,0.0,0.0,0.0,...,0.0,0.7616,0.0,0.4693,1.7723,0.411,0.0,0.0,0.0,0.0
8,SSX9P,280660,0.5107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.8557,0.0
9,,317712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
if __name__ == '__main__':
    # Placeholder entry point: the guard had no body, which is a syntax error.
    pass

done1


In [3]:
from hugoify import Hugoify

In [4]:
# Same workbook as before, this time with the Hugoify class imported from the
# `hugoify` module and a single worker requested.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
h = Hugoify("C:/Users/NathanGrant/Programs/TripodsREU/data/uvm_tcga_pan_can_atlas_2018/data_RNA_Seq_v2_expression_median_hugoify_test.xlsx",num_workers=1)

In [5]:
# Run the lookups and display up to the first 35 rows of the returned frame.
df = h.get_missing_ids()
df.head(35)

Found  17 / 29 of the missing values


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-RZ-AB0B-01,TCGA-V3-A9ZX-01,TCGA-V3-A9ZY-01,TCGA-V4-A9E5-01,TCGA-V4-A9E7-01,TCGA-V4-A9E8-01,TCGA-V4-A9E9-01,TCGA-V4-A9EA-01,...,TCGA-WC-A885-01,TCGA-WC-A888-01,TCGA-WC-A88A-01,TCGA-WC-AA9A-01,TCGA-WC-AA9E-01,TCGA-YZ-A980-01,TCGA-YZ-A982-01,TCGA-YZ-A983-01,TCGA-YZ-A984-01,TCGA-YZ-A985-01
0,,100130426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,100133144,7.9418,0.0219,0.0,2.9398,1.1899,0.0,0.7273,0.0,...,0.0,0.0,0.6219,1.3984,15.3168,4.1389,1.7699,3.2451,0.0,0.0
2,UBE2Q2P2,100134869,4.8264,2.1699,0.7299,2.607,4.7598,1.9773,4.3636,4.5249,...,4.2702,5.3313,2.4876,9.8639,4.6212,7.7805,0.0,10.9175,0.9279,3.403
3,HMGB1P1,10357,91.0674,91.2986,29.1022,20.6022,55.0171,34.4834,32.2545,8.8462,...,39.3498,76.8241,70.2985,90.9479,86.296,58.9273,39.6018,59.2057,31.3802,30.298
4,,10431,964.76,869.041,640.146,780.507,1287.52,1088.48,879.273,558.823,...,763.296,1105.1,970.149,1436.41,951.706,801.069,687.611,717.365,806.309,637.494
5,,136542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,LOC155060,155060,174.668,102.466,93.4307,33.2805,93.4107,68.2155,21.0909,54.2986,...,35.229,117.289,56.592,165.65,192.734,174.27,54.8673,183.498,33.4029,46.5075
7,RNU12-2P,26823,0.5107,0.0,0.0,0.0,0.595,0.0,0.0,0.0,...,0.0,0.7616,0.0,0.4693,1.7723,0.411,0.0,0.0,0.0,0.0
8,SSX9P,280660,0.5107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.8557,0.0
9,,317712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
