In [None]:
# pip install selenium webdriver-manager

In [1]:
import pandas as pd
from scipy.stats import pearsonr



In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

import os

## Get SCLC and EMT genes

In [3]:
# SCLC gene set.
# Every name must be followed by a comma: Python silently concatenates adjacent
# string literals, so a missing comma merges two genes into one bogus name
# (e.g. 'TCF4' 'TCF3' becomes 'TCF4TCF3').
SCLC_genes = {
    'AGER', 'SFTPC', 'GSTA1', 'CYP2B7P', 'AQP4', 'CLDN18', 'C4BPA', 'ADH1B', 'LRRK2', 'MSMB',
    'TMSB15B', 'BEX1', 'DLK1', 'TOP2A', 'ASCL1', 'HIST1H3C', 'HIST1H3B', 'DCX', 'TUBB2B', 'HIST1H3I',
    'FLI1', 'TCF4', 'TCF3', 'ELF3', 'NR0B1', 'NEUROD2', 'SOX11', 'TEAD4', 'REST', 'MITF', 'SIX5',
    'ZNF217', 'FOXA1', 'FOXA2', 'SMAD4', 'OLIG2', 'GATA4', 'ISL1', 'SYP', 'CHGA',
    'INSM1', 'NEUROD1', 'POU2F3', 'YAP1', 'MYC', 'MYCL', 'MYCN',
}

# EMT genes that are differentially expressed across the 5 subtypes,
# read from the ANOVA-filtered expression table and indexed by gene name
EMT_gene_expr = pd.read_csv('Data/ANOVA_filtered_EMT_gene_expr.csv').set_index('Gene')

EMT_genes = set(EMT_gene_expr.index)

# Combined set of genes whose associations will be downloaded
SCLC_EMT_genes = SCLC_genes | EMT_genes

## Function to parse through Harmonizome and get associations

In [9]:
def get_targets(get, pause):
    """Download the Harmonizome association file for every gene in ``get``.

    For each gene a Chrome session is opened on the gene's Harmonizome page,
    the "Download Associations" button is clicked, and the function waits
    ``pause`` seconds before closing the browser.

    Parameters
    ----------
    get : iterable of str
        Gene symbols to look up.
    pause : int or float
        Seconds to wait after clicking the download button.

    Returns
    -------
    tuple (added, no_data)
        ``added``   - genes for which the download button was found and clicked.
        ``no_data`` - genes for which any step raised (page could not be opened,
        button not found, driver could not start, ...).
    """
    # SCLC/EMT gene with associations identified
    added = []

    # SCLC/EMT gene with no associations identified
    no_data = []

    # Change </Path/Data/EMT_SCLC_Targets> to desired file download location
    prefs = {'download.default_directory' : '</Path/Data/EMT_SCLC_Targets>'}

    # Resolved lazily on first use and then reused, so the driver lookup is not
    # repeated for every gene and an empty ``get`` does no work at all
    driver_path = None

    for gene in get:

        # Get url
        url = 'https://maayanlab.cloud/Harmonizome/gene/' + gene

        driver = None
        try:
            if driver_path is None:
                driver_path = ChromeDriverManager().install()

            options = Options()
            options.add_experimental_option('prefs', prefs)

            driver = webdriver.Chrome(service=Service(driver_path), options=options)
            driver.get(url)

            # Identify Download Associations button
            download = driver.find_element(By.CSS_SELECTOR, "button.glyphicon-download-alt")

            # Click and download file
            download.click()

            # Allow time for file to download
            time.sleep(pause)

            added.append(gene)

        # ``Exception`` (not a bare except) so Ctrl-C / kernel interrupt still stops the run
        except Exception:
            no_data.append(gene)

        finally:
            # Always close the browser, including when a step above failed;
            # otherwise every failed gene leaves a Chrome process behind
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    # A browser that refuses to close must not abort the remaining genes
                    pass

    return (added, no_data)

In [10]:
# Download associations for all SCLC/EMT genes with 8 seconds of pause for file to download.
# Genes that failed are retried, but the loop gives up after several consecutive
# passes in which nothing new could be downloaded; a gene that can never be
# fetched would otherwise keep this cell running forever.
MAX_STALLED_PASSES = 3
stalled_passes = 0
no_data = SCLC_EMT_genes

while len(no_data) > 0 and stalled_passes < MAX_STALLED_PASSES:
    added, no_data = get_targets(no_data, 8)
    print('\nAdded regulations of', len(added), 'SCLC/EMT genes')
    print('No regulation info of', len(no_data), 'SCLC/EMT genes')

    # Reset the counter whenever a pass made progress
    stalled_passes = 0 if len(added) > 0 else stalled_passes + 1

if len(no_data) > 0:
    print('Giving up on', len(no_data), 'SCLC/EMT genes after', MAX_STALLED_PASSES,
          'passes without progress:', sorted(no_data))




[WDM] - Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/livtoft/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache





[WDM] - Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/livtoft/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache





[WDM] - Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/livtoft/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache





[WDM] - Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/livtoft/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache


Added regulations of 4 SCLC/EMT genes
No regulation info of 0 SCLC/EMT genes


## Function to retrieve associations that failed to download

In [11]:
def get_failed_downloads(directory='Data/EMT_SCLC_Targets'):
    """Find, report and delete interrupted Chrome downloads in ``directory``.

    Parameters
    ----------
    directory : str, optional
        Folder the association files are downloaded to.

    Returns
    -------
    list of str
        For every ``.crdownload`` file found, the part of its file name that
        precedes ``'associations'``, with surrounding whitespace removed.
        For a file named ``'<gene> associations.tsv.crdownload'`` this is the
        gene symbol.

    Side effects
    ------------
    Every ``.crdownload`` file found is removed from ``directory``.
    """
    failed = []

    # Iterate through all files
    for filename in os.listdir(directory):

        # .crdownload files = files which were not given enough time to download
        if filename.endswith(".crdownload"):
            # Get the name of the gene whose associations failed to download.
            # strip() drops the space that separates the gene from 'associations',
            # so the returned name can be used directly as a gene symbol.
            failed.append(filename.split('associations')[0].strip())

            # Remove failed download file
            os.remove(os.path.join(directory, filename))

    return failed

In [12]:
# Retry every download that was cut short, allowing 5 more seconds on each pass

pause = 15
failed = get_failed_downloads()

while failed:
    print('failed to download regulations of', len(failed), 'SCLC/EMT genes')
    print('increasing download pause to', pause, 'seconds')

    added, no_data = get_targets(failed, pause)

    # Collect whatever is still incomplete and lengthen the wait for the next pass
    failed = get_failed_downloads()
    pause = pause + 5

failed to download regulations of 2 SCLC/EMT genes
increasing download pause to 10 seconds



[WDM] - Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/livtoft/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache





[WDM] - Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - Driver [/Users/livtoft/.wdm/drivers/chromedriver/mac64/103.0.5060.53/chromedriver] found in cache


In [13]:
# Number of associations
# (should be 260 if all SCLC/EMT genes have association data)
# The folder name uses the same capitalisation as every other cell
# ('EMT_SCLC_Targets'), so the lookup also works on case-sensitive file systems.
len(os.listdir('Data/EMT_SCLC_Targets'))

260

# Determine repression/activation

In [4]:
# Load the gene expression table and use the column pandas labels
# 'Unnamed: 0' as the row index
gene_expr = pd.read_csv('Data/SCLC_combined_Minna_CCLE.csv').set_index('Unnamed: 0')

In [5]:
# Start from an empty master dataframe; the per-gene association tables
# are appended to it further down
SCLC_EMT_ENCODE_associations = pd.DataFrame(data=None)

In [6]:
# UNCOMMENT IF: some SCLC/EMT genes do not have expression data

# SCLC_EMT_genes = []
# for file in os.listdir(os.fsencode('Data/EMT_SCLC_Targets')):
#    filename = os.fsdecode(file)
#    gene = filename.split(' ')[0]
#    SCLC_EMT_genes.append(gene)

In [6]:
# For every SCLC/EMT gene with expression data, keep the ENCODE associations
# whose expression is significantly correlated (Pearson, p <= 0.05) with the
# gene and label each one as activation or repression.
#
# Rows are collected in plain lists and turned into dataframes once per gene,
# and the master dataframe is extended with a single concat at the end.
# Dropping rows one at a time and concatenating inside the loop re-copies the
# data on every step and made this cell very slow.
ENCODE_DATASET = 'ENCODE Transcription Factor Targets'
per_gene_associations = []

for gene in SCLC_EMT_genes:

    # Genes without expression data cannot be correlated
    if gene not in gene_expr.index:
        continue

    # Get associations from folder
    file = 'Data/EMT_SCLC_Targets/' + gene + ' associations.tsv'
    if not os.path.exists(file):
        # A gene whose download never succeeded should not abort the whole run
        print('No association file for', gene, '- skipping')
        continue
    assoc = pd.read_csv(file, sep='\t')

    # Get columns of interest (the first two columns: 'association' and 'dataset')
    assoc = assoc.iloc[:, :2]

    # Get associations from ENCODE TF Targets dataset
    assoc = assoc[assoc['dataset'] == ENCODE_DATASET].dropna()

    # Expression of the SCLC/EMT gene is the same for every association: look it up once
    target_expr = gene_expr.loc[gene, :]

    kept = []
    for a in assoc['association']:

        # Don't include associations with no gene expression data
        if a not in gene_expr.index:
            continue

        # Pearson correlation and p value
        corr, p_val = pearsonr(gene_expr.loc[a, :], target_expr)

        # Keep association only if expressions are significantly correlated
        # (a NaN p value, e.g. from constant expression, fails this test and is dropped)
        if p_val <= 0.05:
            kept.append({
                'association': a,
                'dataset': ENCODE_DATASET,
                # Add associated gene's SCLC/EMT gene target
                'target': gene,
                # +ve correlation = activation, -ve correlation = repression
                'repress': bool(corr < 0),
            })

    if kept:
        # Each per-gene table gets its own 0..n-1 index
        per_gene_associations.append(pd.DataFrame(kept))

# Add associations to master dataframe
SCLC_EMT_ENCODE_associations = pd.concat([SCLC_EMT_ENCODE_associations] + per_gene_associations)

KeyboardInterrupt: 

In [35]:
# Replace the repeated per-gene row labels with one running index; the old
# labels are kept as an 'index' column
SCLC_EMT_ENCODE_associations = SCLC_EMT_ENCODE_associations.reset_index(drop=False)

In [36]:
# Write the master association table to disk
SCLC_EMT_ENCODE_associations.to_csv(path_or_buf='Data/SCLC_EMT_ENCODE_associations.csv')