In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from striprtf.striprtf import rtf_to_text
import re

# Path to your RTF file
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run as-is on the machine where this file exists.
file_path = '/Users/noahnicol/Desktop/Desktop/School/2024-25/Lu Lab/GSE36054.rtf'

# Step 1: Read and parse the RTF file.
# `text_content` (the plain text produced by rtf_to_text) is what the later
# cells search for sample records.
print("Reading and parsing the RTF file...")
try:
    with open(file_path, 'r') as file:
        rtf_content = file.read()
    text_content = rtf_to_text(rtf_content)
except Exception as e:
    print(f"Error reading the RTF file: {e}")
    # Re-raise rather than calling exit(): inside a notebook, exit() asks the
    # kernel to shut down, which discards all state. Raising stops this cell,
    # shows the traceback, and keeps the kernel alive.
    raise
else:
    print("RTF file successfully parsed.")

Reading and parsing the RTF file...
RTF file successfully parsed.


In [33]:
# Step 2: Extract GSM IDs, URLs, and sample titles
print("Extracting GSM IDs, URLs, and sample titles from the RTF content...")

# One record looks like: GSM<digits>("<url>")|| | <title>|
# The three capture groups are the accession, the quoted URL, and a title made
# of letters/whitespace that ends in uppercase letters followed by digits.
sample_pattern = re.compile(
    r'(GSM\d+)\("([^"]+)"\)\|\|\s*\|\s*([A-Za-z\s]+[A-Z]+\d+)\|'
)
sample_data = sample_pattern.findall(text_content)

if sample_data:
    print(f"Found {len(sample_data)} samples.")
    # Only the first five matches are echoed, as a quick sanity check.
    preview = sample_data[:5]
    for gsm_id, url, sample_title in preview:
        print(f"GSM ID: {gsm_id}, URL: {url}, Sample Title: {sample_title}")
else:
    # Nothing matched: dump the parsed text line by line so the pattern can
    # be compared against the real content.
    print("No GSM IDs, URLs, and sample titles found in the RTF file. Here’s the content for debugging:")
    for line in text_content.splitlines():
        print(line)

# Prefix to which a GSM accession is appended to form a sample page URL
base_url = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc='

# Accumulator for the per-sample dictionaries
data = []

# Function to extract demographic information from each individual's page
def extract_info(soup, gsm_id, sample_title):
    """Collect demographic fields from the 'Characteristics' cell of a sample page.

    The function looks for a ``<td>`` whose string is exactly
    ``'Characteristics'`` and reads the text of the next sibling ``<td>``.
    That text is split into pieces (one per text node) and each piece is
    interpreted as ``label: value``. Both label and value are lower-cased.

    Args:
        soup: Parsed page; must provide ``find(name, string=...)`` returning an
            element with ``find_next_sibling(name)`` (a BeautifulSoup object
            in this notebook).
        gsm_id: Sample accession, stored unchanged under ``'GSM_ID'``.
        sample_title: Sample title, stored unchanged under ``'Sample_Title'``.

    Returns:
        dict with keys ``'GSM_ID'``, ``'Sample_Title'``, ``'Age'``,
        ``'Gender'``, ``'Ethnicity'`` and ``'cell type'``. Fields that are not
        present on the page stay ``None``. If a label occurs more than once,
        the last occurrence wins.
    """
    info = {'GSM_ID': gsm_id, 'Sample_Title': sample_title, 'Age': None,
            'Gender': None, 'Ethnicity': None, 'cell type': None}

    # `string=` replaces the deprecated `text=` keyword of find().
    characteristics_section = soup.find('td', string='Characteristics')
    details_td = None
    if characteristics_section is not None:
        details_td = characteristics_section.find_next_sibling('td')

    if details_td is None:
        print("No 'Characteristics' section found for this sample.")
        return info

    for detail in details_td.get_text(separator="|").split('|'):
        # Split on the FIRST colon only, so values that themselves contain a
        # colon are kept whole; pieces without a colon carry no label.
        label, colon, value = detail.lower().partition(':')
        if not colon:
            continue
        label = label.strip()
        value = value.strip()

        # Match on the label only. 'age' must stand alone as a word so labels
        # such as 'disease stage' or 'passage' are not mistaken for an age.
        if re.search(r'(?<![a-z])age(?![a-z])', label):
            info['Age'] = value
        elif 'gender' in label:
            info['Gender'] = value
        elif 'ethnicity' in label:
            info['Ethnicity'] = value
        elif 'tissue' in label or 'cell type' in label:
            # Accept either label for the 'cell type' field.
            info['cell type'] = value

    return info


Extracting GSM IDs, URLs, and sample titles from the RTF content...
Found 143 samples.
GSM ID: GSM879995, URL: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM879995, Sample Title: Harvard Sib female CHB1000005838
GSM ID: GSM879996, URL: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM879996, Sample Title: Harvard Sib CHB1000006589
GSM ID: GSM879997, URL: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM879997, Sample Title: Harvard Sib female CHB1000005934
GSM ID: GSM879998, URL: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM879998, Sample Title: Harvard Sib CHB1000005930
GSM ID: GSM879999, URL: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM879999, Sample Title: Harvard Sib CHB1000005850


In [34]:
base_url = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc='

# Start from an empty list every time this cell runs; otherwise re-running the
# cell would append every sample a second time and duplicate rows in the CSV.
data = []

# Step 3: Loop through each GSM ID and sample title, and construct the URL
for gsm_id, _, sample_title in sample_data:
    # The URL captured from the RTF is ignored; the page address is rebuilt
    # from the accession.
    url = base_url + gsm_id

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        # Network-level failure (DNS, connection reset, timeout, ...):
        # report it and move on to the next sample.
        print(f"Failed to retrieve {url}: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve {url} (status code: {response.status_code})")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    try:
        individual_info = extract_info(soup, gsm_id, sample_title)
        data.append(individual_info)
        print(f"Data extracted for {gsm_id}: {individual_info}")
    except Exception as e:
        # Best effort: a page that cannot be parsed is reported and skipped.
        print(f"Error extracting data for {gsm_id}: {e}")

  characteristics_section = soup.find('td', text='Characteristics')


Data extracted for GSM879995: {'GSM_ID': 'GSM879995', 'Sample_Title': 'Harvard Sib female CHB1000005838', 'Age': '29', 'Gender': 'f', 'Ethnicity': 'black', 'cell type': None}
Data extracted for GSM879996: {'GSM_ID': 'GSM879996', 'Sample_Title': 'Harvard Sib CHB1000006589', 'Age': '58', 'Gender': 'm', 'Ethnicity': 'white', 'cell type': None}
Data extracted for GSM879997: {'GSM_ID': 'GSM879997', 'Sample_Title': 'Harvard Sib female CHB1000005934', 'Age': '55', 'Gender': 'f', 'Ethnicity': 'black', 'cell type': None}
Data extracted for GSM879998: {'GSM_ID': 'GSM879998', 'Sample_Title': 'Harvard Sib CHB1000005930', 'Age': '37', 'Gender': 'm', 'Ethnicity': 'other', 'cell type': None}
Data extracted for GSM879999: {'GSM_ID': 'GSM879999', 'Sample_Title': 'Harvard Sib CHB1000005850', 'Age': '193', 'Gender': 'm', 'Ethnicity': 'other', 'cell type': None}
Data extracted for GSM880000: {'GSM_ID': 'GSM880000', 'Sample_Title': 'Harvard Sib CHB1000006609', 'Age': '42', 'Gender': 'm', 'Ethnicity': 'blac

In [36]:

# Step 4: Convert the data to a DataFrame and save to CSV
print("\nConverting data to DataFrame and saving to CSV...")
output_file_name = 'GEO_data_GSE36054.csv'
try:
    # One row per scraped sample dictionary; the index is left out of the file.
    df = pd.DataFrame(data)
    df.to_csv(output_file_name, index=False)
except Exception as e:
    print(f"Error saving data to CSV: {e}")
else:
    print("Data successfully saved to", output_file_name)


Converting data to DataFrame and saving to CSV...
Data successfully saved to GEO_data_GSE36054.csv
