## Get data from PubMed

# UniProt

In [1]:
import requests
from bs4 import BeautifulSoup

# Fetch the RDF/XML record of a single UniProt disease entry.
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get("https://rest.uniprot.org/diseases/DI-02060.rdf", timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were RDF
response.raise_for_status()
soup = BeautifulSoup(response.text, 'xml')
# Find the rdfs:comment element, which carries the disease definition
comment = soup.find('rdfs:comment')

# Extract and print the disease definition
if comment is not None:
    disease_definition = comment.get_text()
    print("Disease Definition:")
    print(disease_definition)
else:
    print("Disease definition not found.")

Disease Definition:
A multifactorial disorder of glucose homeostasis caused by a lack of sensitivity to insulin. Affected individuals usually have an obese body habitus and manifestations of a metabolic syndrome characterized by diabetes, insulin resistance, hypertension and hypertriglyceridemia. The disease results in long-term complications that affect the eyes, kidneys, nerves, and blood vessels.


In [7]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

# Base URL of the UniProt REST endpoint for disease entries
base_url = "https://rest.uniprot.org/diseases/"

# Inclusive range of numeric disease IDs to scrape (DI-02000 .. DI-03000)
start_id = 2000
end_id = 3000 # Example range, adjust as needed

# One {"Disease Name", "Disease Definition"} record per successfully parsed entry
disease_data = []

for i in range(start_id, end_id + 1):
    disease_id = f"DI-0{i:04d}.rdf"
    url = f"{base_url}{disease_id}"

    try:
        # Timeout so a single stalled request cannot hang the whole scrape
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report by ID: no disease name is known when the request itself failed
        print(f"Failed to retrieve data for {disease_id}: {exc}")
        time.sleep(0.01)
        continue

    if response.status_code == 200:
        # XML instead of HTML
        soup = BeautifulSoup(response.content, 'xml')
        label = soup.find('skos:prefLabel')
        # Find the rdfs:comment element
        comment = soup.find('rdfs:comment')

        if label is None:
            # Without a preferred label there is no name to store the record under
            print(f"Disease name for {disease_id} not found.")
        else:
            disease_name = label.get_text()
            if comment is not None:
                disease_definition = comment.get_text()
                disease_data.append({
                    "Disease Name": disease_name,
                    "Disease Definition": disease_definition
                })
                print(f"Gathered data for {disease_name}")
            else:
                print(f"Disease definition for {disease_name} not found.")
    else:
        # Use the requested ID here; disease_name would be a leftover from an
        # earlier iteration (or undefined on the very first request).
        print(f"Failed to retrieve data for {disease_id} (HTTP {response.status_code})")

    # Just so not to overwhelm the server -- applied after every request,
    # whatever its outcome
    time.sleep(0.01)

# Convert the list to a pandas DataFrame
df = pd.DataFrame(disease_data)

Gathered data for Muir-Torre syndrome
Gathered data for Mulibrey nanism
Gathered data for Congenital myopathy 1B, autosomal recessive
Gathered data for Hereditary leiomyomatosis and renal cell cancer
Gathered data for Multiple endocrine neoplasia 4
Gathered data for Tricho-rhino-phalangeal syndrome 2
Gathered data for Potocki-Shaffer syndrome
Gathered data for Multiple familial trichoepithelioma 1
Gathered data for Multiple neoplasia 2A
Gathered data for Multiple neoplasia 2B
Gathered data for Multiple synostoses syndrome 1
Gathered data for Multiple synostoses syndrome 2
Gathered data for Muscle glycogen storage disease 0
Gathered data for Glycogen storage disease 13
Failed to retrieve data for Glycogen storage disease 13
Gathered data for Immunodeficiency 68
Gathered data for Myeloperoxidase deficiency
Gathered data for Myocardial infarction 1
Gathered data for Mitochondrial DNA depletion syndrome 2
Gathered data for Myopathy with exercise intolerance Swedish type
Gathered data for M

In [10]:
import os
import pandas as pd

# Directory where one text file per disease is written
output_dir = './data/uniprot/'
# exist_ok=True lets this cell be re-run when the directory already exists
os.makedirs(output_dir, exist_ok=True)

# Function to sanitize filenames
def sanitize_filename(name):
    """
    Return `name` with every character that is unsafe for a filename replaced.

    Alphanumeric characters, spaces, hyphens and underscores are kept as-is;
    every other character becomes a single underscore.

    :param name: Raw string to turn into a filename stem
    :return: Sanitized string of the same length as `name`
    """
    allowed_punctuation = (' ', '-', '_')
    safe_chars = []
    for ch in name:
        if ch.isalnum() or ch in allowed_punctuation:
            safe_chars.append(ch)
        else:
            safe_chars.append("_")
    return "".join(safe_chars)

# Function to save each row as a text file
def save_row_as_text(row, output_dir):
    """
    Write one DataFrame row to a text file as "column: value" lines.

    The file name is derived from the row's 'Disease Name' entry: spaces are
    turned into underscores, the result is passed through sanitize_filename,
    and ".txt" is appended. An existing file with the same name is overwritten.

    :param row: Mapping-like row (e.g. a pandas Series) with a 'Disease Name' key
    :param output_dir: Directory the text file is written into
    """
    file_name = sanitize_filename(row['Disease Name'].replace(' ', '_')) + ".txt"
    file_path = os.path.join(output_dir, file_name)
    # Explicit UTF-8 so non-ASCII characters in names/definitions are written
    # identically on every platform instead of depending on the locale default.
    with open(file_path, 'w', encoding='utf-8') as file:
        for column, value in row.items():
            file.write(f"{column}: {value}\n")

# Save each row in the dataframe as a text file
# Write every DataFrame row to its own text file; output_dir is forwarded
# to save_row_as_text as an extra positional argument.
df.apply(save_row_as_text, axis=1, args=(output_dir,))

# Show what ended up in the output directory as a quick sanity check
created_files = os.listdir(output_dir)
print(created_files)


['Cardiomyopathy__dilated__1V.txt', 'Osteoarthritis_5.txt', 'Peroxisome_biogenesis_disorder_complementation_group_13.txt', 'Lynch_syndrome_8.txt', 'Renal_cell_carcinoma.txt', 'Neuropathy__hereditary_sensory_and_autonomic__1C.txt', 'Parietal_foramina_with_cleidocranial_dysplasia.txt', 'Hemolytic_uremic_syndrome__atypical__5.txt', 'Type_2_diabetes_mellitus_1.txt', 'Seizures__sensorineural_deafness__ataxia__impaired_intellectual_development__and_electrolyte_imbalance.txt', 'Spermatogenic_failure_4.txt', 'Niemann-Pick_disease_C2.txt', 'Celiac_disease_4.txt', 'Cardiomyopathy__dilated__1FF.txt', 'Impaired_intellectual_development__anterior_maxillary_protrusion__and_strabismus.txt', 'Osteoglophonic_dysplasia.txt', 'Dominant_optic_atrophy_plus_syndrome.txt', 'Spondyloepimetaphyseal_dysplasia__short_limb-hand_type.txt', 'Spondyloepiphyseal_dysplasia_congenital_type.txt', 'Pierson_syndrome.txt', 'Pontocerebellar_hypoplasia_2A.txt', 'Klippel-Feil_syndrome_3__autosomal_dominant.txt', 'Spondyloepip

In [12]:
import boto3
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment
load_dotenv()
# os.getenv returns None for any variable that is not set
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
region_name = os.getenv('REGION_NAME')

def upload_to_s3(file_name, bucket, object_name=None, aws_access_key_id=None, aws_secret_access_key=None, region_name=None):
    """
    Upload a file to an S3 bucket using put_object

    A new S3 client is created on every call from the given credentials.

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified, file_name is used.
        Spaces in the object name are replaced with underscores.
    :param aws_access_key_id: AWS access key ID
    :param aws_secret_access_key: AWS secret access key
    :param region_name: AWS region name
    :return: True if file was uploaded, False if S3 rejected the request.
        Errors opening the local file still propagate to the caller.
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Remove spaces from the file name
    object_name = object_name.replace(' ', '_')

    # Create an S3 client
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=region_name
    )

    # Upload the file; a service-side error is reported and turned into the
    # documented False return instead of escaping as an exception.
    try:
        with open(file_name, 'rb') as file_data:
            s3_client.put_object(Bucket=bucket, Key=object_name, Body=file_data)
    except s3_client.exceptions.ClientError as error:
        print(f"Failed to upload {file_name} to {bucket}/{object_name}: {error}")
        return False

    print(f"File {file_name} uploaded to {bucket}/{object_name}")
    return True

def upload_directory_to_s3(directory, bucket, aws_access_key_id, aws_secret_access_key, region_name):
    """
    Walk a directory tree and upload every file found to an S3 bucket.

    Each file is uploaded through upload_to_s3 under the key
    "raw/<file name>", with spaces in the file name replaced by underscores.
    Only the bare file name is used for the key, so files from nested
    subdirectories all land directly under "raw/".

    :param directory: Directory containing files to upload
    :param bucket: Bucket to upload to
    :param aws_access_key_id: AWS access key ID
    :param aws_secret_access_key: AWS secret access key
    :param region_name: AWS region name
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            local_path = os.path.join(current_dir, filename)
            # S3 object name without spaces
            s3_key = "raw/" + filename.replace(' ', '_')
            upload_to_s3(local_path, bucket, s3_key, aws_access_key_id, aws_secret_access_key, region_name)

# Example usage: upload every file under the local UniProt output directory
directory = "./data/uniprot/"  # Local directory path
bucket_name = "data-chunking-us"  # Name of the target S3 bucket

upload_directory_to_s3(directory, bucket_name, aws_access_key_id, aws_secret_access_key, region_name)


File ./data/uniprot/Cardiomyopathy__dilated__1V.txt uploaded to data-chunking-us/raw/Cardiomyopathy__dilated__1V.txt
File ./data/uniprot/Osteoarthritis_5.txt uploaded to data-chunking-us/raw/Osteoarthritis_5.txt
File ./data/uniprot/Peroxisome_biogenesis_disorder_complementation_group_13.txt uploaded to data-chunking-us/raw/Peroxisome_biogenesis_disorder_complementation_group_13.txt
File ./data/uniprot/Lynch_syndrome_8.txt uploaded to data-chunking-us/raw/Lynch_syndrome_8.txt
File ./data/uniprot/Renal_cell_carcinoma.txt uploaded to data-chunking-us/raw/Renal_cell_carcinoma.txt
File ./data/uniprot/Neuropathy__hereditary_sensory_and_autonomic__1C.txt uploaded to data-chunking-us/raw/Neuropathy__hereditary_sensory_and_autonomic__1C.txt
File ./data/uniprot/Parietal_foramina_with_cleidocranial_dysplasia.txt uploaded to data-chunking-us/raw/Parietal_foramina_with_cleidocranial_dysplasia.txt
File ./data/uniprot/Hemolytic_uremic_syndrome__atypical__5.txt uploaded to data-chunking-us/raw/Hemolyt