In [4]:
import pandas as pd
# Load the `_sym` pairing table. Later cells derive `geo_gse` from its `rummageo`
# column and `pmc_id` from its `rummagene` column.
df = pd.read_csv("data/rummagenexrummageo_sym.csv")

# Alternative input file kept for reference (presumably the non-symmetric export -- confirm before switching):
# df = pd.read_csv("data/rummagenexrummageo.csv")
df

AttributeError: partially initialized module 'pandas' has no attribute '_pandas_datetime_CAPI' (most likely due to a circular import)

In [None]:
import requests
import xmltodict
import time
import pandas as pd

def fetch_pmid_from_pmc(pmc_ids, max_retries=10):
    """
    Convert PMC identifiers to PubMed IDs with the NCBI ID converter endpoint.

    Parameters:
    pmc_ids (sized iterable of str): Identifiers to convert; they are comma-joined
        into a single request.
    max_retries (int): Maximum number of HTTP attempts before giving up.

    Returns:
    dict or None: Maps each ``@requested-id`` to its ``@pmid``, or to the string
        "Empty" when the record carries no ``@pmid``. The dict is empty when the
        response holds no ``record`` entry. ``None`` is returned when every
        attempt failed with a request error.

    Raises:
    TypeError, KeyError: Re-raised, after printing diagnostics, when the parsed
        response does not have the expected structure.
    """
    rar = {}
    ids = ','.join(pmc_ids)
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={ids}&tool=my_tool&email=my_email@example.com"

    for attempt in range(max_retries):
        # Bind both names up front: the diagnostics below print them, and an
        # unbound local there would replace the real error with UnboundLocalError.
        data = None
        pmc_data = None
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            # Parse the response
            data = xmltodict.parse(response.content)

            # Extract PubMed IDs
            if 'record' in data["pmcids"]:
                pmc_data = data["pmcids"]['record']
                # A lone record arrives as a mapping rather than a list; normalise
                # so both shapes share one code path.
                records = pmc_data if isinstance(pmc_data, list) else [pmc_data]
                for record in records:
                    pmid = record["@pmid"] if "@pmid" in record else "Empty"
                    rar[record["@requested-id"]] = pmid

            return rar
        except (TypeError, KeyError) as e:
            # Unexpected response structure: dump the context, then let the
            # original exception propagate unchanged.
            print(pmc_ids)
            print(len(pmc_ids))
            print(f"{type(e).__name__}: {data}")
            print(f"{type(e).__name__}: {pmc_data}")

            raise

        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}. Attempt {attempt + 1} of {max_retries}")
            if attempt < max_retries - 1:
                # Exponential backoff: 1, 2, 4, ... seconds between attempts
                wait_time = 2 ** attempt
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Max retries reached. Unable to fetch data. {pmc_ids}")
                return None


In [None]:
from tqdm import tqdm
import json
import os
def process_pmc_lst(tuples_list):
    """
    Resolve PMC IDs to PubMed IDs in batches, checkpointing to data/pmids_dict.json.

    Parameters:
    tuples_list (iterable of str): PMC IDs to resolve. The argument is copied,
        never modified, so any iterable (set, list, Series) is accepted.

    Returns:
    dict: Every cached and newly fetched mapping of PMC ID to PubMed ID.

    IDs already present in the cache file are not requested again. A batch for
    which the fetcher returns None is reported and skipped, so those IDs stay
    absent from the result and are retried on the next run.
    """
    batch_size = 200
    if os.path.exists("data/pmids_dict.json"):
        with open("data/pmids_dict.json", 'r') as f:
            all_results = json.load(f)
    else:
        all_results = {}

    # Build a private collection instead of calling difference_update on the
    # caller's object, which altered it and required it to be a set.
    pending = list(set(tuples_list).difference(all_results.keys()))

    total_batches = (len(pending) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(pending), batch_size), total=total_batches, desc="Processing Batches"):
        pmc_ids = set(pending[i:i + batch_size])
        # Drop anything an earlier batch already resolved.
        pmc_ids.difference_update(all_results.keys())
        if len(pmc_ids) > 0:
            r = fetch_pmid_from_pmc(pmc_ids)
            if r is None:
                # The fetcher signals "gave up" with None; dict.update(None) would raise.
                print(f"Skipping batch starting at index {i}: no result after retries.")
            else:
                all_results.update(r)
        # Checkpoint after every batch so an interrupted run can resume.
        with open("data/pmids_dict.json", 'w') as f:
            json.dump(all_results, f, indent=4)
        time.sleep(1)  # Delay to avoid hitting rate limits
    return all_results



In [None]:
import shutil

def empty_directory(directory_path):
    """
    Delete everything inside a directory while keeping the directory itself.

    Parameters:
    directory_path (str): The path to the directory whose contents are removed.

    Files are removed and reported on stdout; sub-directories are removed
    recursively. A missing path or a path that is not a directory is reported
    and left untouched. A failure on one entry is printed and does not stop
    the remaining entries from being processed.
    """
    if os.path.isdir(directory_path):
        for entry_name in os.listdir(directory_path):
            entry_path = os.path.join(directory_path, entry_name)

            try:
                if os.path.isfile(entry_path):
                    os.remove(entry_path)
                    print(f"Removed file: {entry_path}")
                elif os.path.isdir(entry_path):
                    shutil.rmtree(entry_path)
            except Exception as err:
                print(f"Failed to remove {entry_path}: {err}")
    elif os.path.exists(directory_path):
        # The path exists but is something other than a directory.
        print(f"{directory_path} is not a directory.")
    else:
        print(f"The directory {directory_path} does not exist.")



In [None]:
import GEOparse


def get_metadata_from_geo(gse_id, max_attempts=10):
    """
    Download a GEO series with GEOparse and return its metadata.

    Parameters:
    gse_id (str): Accession passed to ``GEOparse.get_GEO``.
    max_attempts (int): Number of download attempts before giving up. The
        default of 10 is the value that used to be hard-coded.

    Returns:
    The ``metadata`` attribute of the object returned by ``GEOparse.get_GEO``,
    or ``None`` when no attempt succeeded.

    Side effects:
    Downloads into the "GEO" directory and empties it after a successful
    attempt. When the final attempt fails with an IOError, the accession is
    appended to data/GSEs_to_delete.txt.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            metadata = GEOparse.get_GEO(geo=gse_id, silent=True, destdir="GEO")
            metadata = metadata.metadata

            if isinstance(metadata, str) and "Error: Download failed due to " in metadata:
                # Carry the reported text so the log line below is not empty.
                raise IOError(metadata)

            empty_directory("GEO")
            return metadata

        except IOError as e:
            print(f"Attempt {attempt}/{max_attempts}: Error downloading GEO data: {e}")

            if attempt == max_attempts:  # On the last attempt, write to the file
                with open("data/GSEs_to_delete.txt", 'a') as file:
                    file.write(f"{gse_id}\n")

        except Exception as e:
            # Any other failure is logged and the next attempt proceeds.
            print(f"An unexpected error occurred: {e}")

    # Every attempt failed.
    return None

In [None]:
import pandas as pd

def filter_by_dict(df, col1, col2, lookup_dict):
    """
    Drop rows whose `col1` value is already listed in `lookup_dict` under the
    row's `col2` value. Used to remove redundant studies.

    Parameters:
    df (pd.DataFrame): Frame to filter.
    col1 (str): Column holding the value searched for.
    col2 (str): Column holding the dictionary key.
    lookup_dict (dict): Maps `col2` values to containers of `col1` values.

    Returns:
    pd.DataFrame: The rows that were kept.
    """
    tqdm.pandas()  # registers DataFrame.progress_apply

    def _is_new(record):
        group_key, candidate = record[col2], record[col1]
        if group_key not in lookup_dict:
            # Nothing recorded for this key, so the row cannot be redundant.
            return True
        return candidate not in lookup_dict[group_key]

    keep_mask = df.progress_apply(_is_new, axis=1)
    return df[keep_mask]



In [None]:
# Derive the join keys. `geo_gse` is the text before the first "-" of `rummageo`,
# cut again at the first ","; `pmc_id` is the text before the first "-" of `rummagene`.
df["geo_gse"] = df["rummageo"].str.split("-").str[0].str.split(",").str[0]
df["pmc_id"] = df["rummagene"].str.split("-").str[0]
df

In [None]:
import json
from tqdm import tqdm
def process_geos_meta(lst):
    """
    Fetch GEO metadata for every accession not yet cached in data/gse_results.json.

    Parameters:
    lst (iterable of str): GEO accessions; duplicates are ignored.

    Returns:
    None. Results are written to data/gse_results.json, which is rewritten
    after each accession so progress survives an interrupted run.
    """
    cache_path = "data/gse_results.json"

    all_results = {}
    if os.path.exists(cache_path):
        with open(cache_path, 'r') as f:
            all_results = json.load(f)

    todo = set(lst)
    todo.difference_update(all_results.keys())
    if not todo:
        return

    for accession in tqdm(list(todo), desc="Processing items"):
        all_results[accession] = get_metadata_from_geo(accession)
        with open(cache_path, 'w') as f:
            json.dump(all_results, f, indent=4)
    return


def process_geos_pmids(lst):
    """
    Extract the PubMed IDs of the given GEO accessions from the cached metadata.

    Parameters:
    lst (iterable of str): GEO accessions to look up in data/gse_results.json.

    Returns:
    None. The mapping of accession to its "pubmed_id" entry (or ["Empty"] when
    that key is absent) is written to data/pmids_dict_gse.json. Accessions whose
    cached value is not a dict are left out. A missing cache file or an error
    while building or writing the mapping is printed, not raised.
    """
    wanted = set(lst)
    source_path = "data/gse_results.json"

    if not os.path.exists(source_path):
        print("Error: File 'data/gse_results.json' does not exist.")
        return

    with open(source_path, 'r') as f:
        all_res = json.load(f)

    try:
        pmids = {}
        for accession in wanted:
            meta = all_res.get(accession)
            if isinstance(meta, dict):
                pmids[accession] = meta.get("pubmed_id", ["Empty"])

        with open("data/pmids_dict_gse.json", 'w') as f:
            json.dump(pmids, f, indent=4)

        return

    except Exception as e:
        print(f"An error occurred: {e}")

   

In [None]:

# Fetch and cache GEO metadata for every accession in df["geo_gse"].
# process_geos_meta de-duplicates the list and skips accessions already cached.
process_geos_meta(list(df["geo_gse"]))


In [None]:
# Drop pairs whose GEO series is listed in data/GSEs_to_delete.txt, the file
# get_metadata_from_geo appends to when a download fails on its final attempt.
# Only a missing file is tolerated. Any other error (e.g. a missing column) now
# surfaces instead of being silently swallowed by a bare `except: pass`.
try:
    with open("data/GSEs_to_delete.txt", 'r') as file:
        gse_to_delete = {line.strip() for line in file}
except FileNotFoundError:
    # No failures recorded yet; keep the name defined for later cells.
    gse_to_delete = set()
else:
    print(gse_to_delete)

df = df[~df["geo_gse"].isin(gse_to_delete)]
df


In [None]:
# Collect the distinct GEO accessions still present in df and write their
# PubMed IDs to data/pmids_dict_gse.json (done inside process_geos_pmids).
lst = set(list(df["geo_gse"]))
retrieved_list = list(lst)
process_geos_pmids(retrieved_list)

In [None]:
# Resolve every distinct PMC ID in df to a PubMed ID; results are also cached
# in data/pmids_dict.json by process_pmc_lst.
pmc_pmids = process_pmc_lst(set(df["pmc_id"]))

In [None]:
# Reload the PMC -> PubMed ID cache from disk and attach it as the `pmid` column.
# PMC IDs absent from the cache map to NaN.
with open("data/pmids_dict.json", 'r') as f:
        all_results = json.load(f)
df['pmid'] = df['pmc_id'].map(all_results)
df

In [None]:
# Rebind `all_results` to the GSE -> PubMed ID lists written by process_geos_pmids.
# The filter_by_dict cell below relies on this binding, so run the cells in order.
with open("data/pmids_dict_gse.json", 'r') as f:
        all_results = json.load(f)
all_results


In [None]:
# Remove pairs where the article's PubMed ID is among the PubMed IDs recorded
# for the same GEO series. `all_results` must hold the GSE mapping at this point.
df = filter_by_dict(df, "pmid", "geo_gse", all_results)
df

In [None]:
# Attach each row's GEO-linked PubMed IDs as one comma-separated string (`geo_pmid`).
with open("data/pmids_dict_gse.json") as f:
    gses = json.load(f)
gses = {accession: ','.join(pmid_list) for accession, pmid_list in gses.items()}
df["geo_pmid"] = df["geo_gse"].map(gses)
df


In [None]:
# Keep the first occurrence of each (rummagene, rummageo) pair.
df  = df.drop_duplicates(subset=["rummagene","rummageo"])

In [None]:
from bs4 import BeautifulSoup

def get_pubmed_abstract(pmid):
    """
    Scrape the abstract text from a PubMed article page.

    Parameters:
    pmid (str): PubMed ID inserted into the article URL.

    Returns:
    str: The abstract text; "Abstract not found" when the page has no
        `div.abstract-content` element; "Failed to fetch the page" when the
        HTTP status is not 200.
    """
    response = requests.get(f'https://pubmed.ncbi.nlm.nih.gov/{pmid}/')
    if response.status_code != 200:
        return "Failed to fetch the page"

    soup = BeautifulSoup(response.text, 'html.parser')

    # The abstract lives in the element PubMed marks with class "abstract-content".
    abstract_section = soup.find('div', class_='abstract-content')
    if not abstract_section:
        return "Abstract not found"
    return abstract_section.get_text(separator=" ", strip=True)

In [None]:
# Resume from the checkpoint written by the fetch loop further down. Starting
# from an empty dict made that loop overwrite data/title_abs.json with only the
# newly fetched entries, discarding titles and abstracts from earlier runs.
if os.path.exists("data/title_abs.json"):
    with open("data/title_abs.json", 'r') as f:
        abs_dict = json.load(f)
else:
    abs_dict = {}

# NOTE(review): an earlier version first took df.head(1000000) ("restrict to top
# 1 million since those will be served to user"), but that result was immediately
# overwritten by the selection below, so the restriction never applied. The dead
# statements are removed; re-introduce the head() here if the cap is wanted.
df_filt = df[["pmc_id", "pmid"]]
df_filt = df_filt.drop_duplicates(subset="pmc_id")

In [None]:
# One (pmc_id -> pmid) entry per article, minus the articles already in abs_dict.
df_filt = df[["pmc_id", "pmid"]].drop_duplicates(subset="pmc_id")
result_dicte = dict(zip(df_filt['pmc_id'], df_filt['pmid']))
res_dict = {
    pmc: pmid_value
    for pmc, pmid_value in result_dicte.items()
    if pmc not in abs_dict
}


In [None]:
# Progress files used by the fetch loop below, one PMC ID per line:
# file_path1 receives IDs whose title/abstract were fetched,
# file_path2 receives IDs whose PMC page fetch failed.
file_path1 = "data/output1.txt"
file_path2 = "data/PMCs_to_delete.txt"

In [None]:
# Restore the progress sets from a previous run. Both names are always defined,
# since the fetch loop reads them even on a first run when neither file exists.
pmcs_found = set()
gses_to_be_deleted = set()  # despite the name, this holds PMC IDs (see file_path2)

if os.path.exists(file_path1):
    with open(file_path1, 'r') as f:
        # Iterate the handle opened here (`f`), not the unrelated name `file`.
        pmcs_found = {line.strip() for line in f}
if os.path.exists(file_path2):
    with open(file_path2, 'r') as f:
        gses_to_be_deleted = {line.strip() for line in f}

In [None]:
from bs4 import BeautifulSoup

def get_pubmed_title_and_abstract(pmid):
    """
    Scrape the title and abstract from a PubMed article page.

    Parameters:
    pmid (str): PubMed ID inserted into the article URL.

    Returns:
    tuple: (title, abstract). A part missing from the page is replaced with
        "Title not found" or "Abstract not found" respectively.

    Raises:
    ValueError: When the HTTP status is not 200; the message names the PMID
        and the status code.
    """
    url = f'https://pubmed.ncbi.nlm.nih.gov/{pmid}/'
    response = requests.get(url)

    if response.status_code != 200:
        # Say which article failed and why, so a crashed run can be diagnosed.
        raise ValueError(
            f"PubMed request for PMID {pmid} failed with HTTP status {response.status_code}"
        )

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the title
    title_section = soup.find('h1', class_='heading-title')
    if title_section:
        title = title_section.get_text(separator=" ", strip=True)
    else:
        title = "Title not found"

    # Find the abstract
    abstract_section = soup.find('div', class_='abstract-content')
    if abstract_section:
        abstract = abstract_section.get_text(separator=" ", strip=True)
    else:
        abstract = "Abstract not found"

    return title, abstract



In [None]:
def get_pmc_title_and_abstract(pmcid):
    """
    Scrape the title and abstract from a PMC article page.

    Parameters:
    pmcid (str): PMC ID inserted into the article URL.

    Returns:
    tuple: (title, abstract). A part missing from the page is replaced with
        "Title not found" or "Abstract not found" respectively.

    Raises:
    ValueError: When the HTTP status is not 200 (the status is printed first).
    """
    # The request is sent with a browser-style User-Agent header.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(
        f'https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/',
        headers=browser_headers,
    )

    if response.status_code != 200:
        print(response.status_code)
        raise ValueError

    soup = BeautifulSoup(response.text, 'html.parser')

    title_node = soup.find('h1', class_='content-title')
    title = title_node.get_text(separator=" ", strip=True) if title_node else "Title not found"

    abstract_node = soup.find('div', class_='tsec sec')
    abstract = abstract_node.get_text(separator=" ", strip=True) if abstract_node else "Abstract not found"

    return title, abstract


In [None]:
# Fetch the title and abstract for every article not handled yet.
# Articles with a PubMed ID are scraped from PubMed; the rest from PMC.
# Progress is appended to file_path1 / file_path2 and abs_dict is checkpointed
# to data/title_abs.json after every new entry, so the loop can be resumed.
curr_count = len(abs_dict)
for pmcid, pmid in res_dict.items():
    if pmcid in pmcs_found or pmcid in gses_to_be_deleted:
        continue

    if pmid != "Empty":
        title, abstract = get_pubmed_title_and_abstract(pmid)
    else:
        try:
            title, abstract = get_pmc_title_and_abstract(pmcid)
        except Exception:
            # `except Exception` rather than a bare `except`, so that interrupting
            # the kernel stops the loop instead of flagging the article for deletion.
            print(pmcid)
            gses_to_be_deleted.add(pmcid)
            with open(file_path2, "a") as file:
                file.write(f"{pmcid}\n")
            continue

    abs_dict[pmcid] = {"title": title, "abstract": abstract}
    pmcs_found.add(pmcid)
    with open(file_path1, "a") as file:
        file.write(f"{pmcid}\n")

    if len(abs_dict) > curr_count:
        with open("data/title_abs.json", 'w') as f:
            json.dump(abs_dict, f, indent=2)
        curr_count = len(abs_dict)
            


In [None]:
# Drop pairs whose PMC article could not be fetched. The failed PMC IDs are in
# `gses_to_be_deleted` (loaded from / appended to data/PMCs_to_delete.txt).
# `gse_to_delete` holds GEO accessions, so testing `pmc_id` against it removed nothing.
df = df[~df['pmc_id'].isin(gses_to_be_deleted)]

In [None]:
# Write the filtered, annotated pairs to CSV without the index column.
df.to_csv('data/rummagenexrummageo_cleaned.csv', index=False)
