# KG-Alzheimers Processing Notebook

This notebook:
1. Downloads the KG-Alzheimers dataset (the `20250317` build) from kghub.io
2. Extracts the tar.gz file
3. Prunes the nodes and edges TSV files to include only specific columns
4. Repackages the pruned data into a new tar.gz file

In [None]:
# Import necessary libraries
import os
import requests
import pandas as pd
import tarfile
import shutil
from tqdm.notebook import tqdm

In [None]:
# Set up constants
# Source archive; the "20250317" path segment presumably identifies the
# KG-Alzheimers build being fetched -- confirm before bumping it.
URL = "https://kghub.io/kg-alzheimers/20250317/kg-alzheimers.tar.gz"
# Local filename the archive is downloaded to (relative to the working directory).
DOWNLOAD_PATH = "kg-alzheimers.tar.gz"
# Working directory the downloaded archive is unpacked into.
EXTRACT_DIR = "kg-alzheimers-extracted"
# Working directory that receives the column-pruned TSV files.
PRUNED_DIR = "kg-alzheimers-pruned"
# Final archive built from the contents of PRUNED_DIR.
OUTPUT_FILE = "kg-alzheimers-pruned.tar.gz"

In [None]:
# Download the dataset
def download_file(url, save_path, timeout=60, chunk_size=1024 * 1024):
    """Download ``url`` to ``save_path``, showing a progress bar.

    If ``save_path`` already exists the download is skipped. The body is
    streamed into ``<save_path>.part`` and only renamed to ``save_path`` once
    the transfer has finished, so an interrupted download never leaves a
    truncated file that a later run would mistake for a complete one.

    Args:
        url: HTTP(S) address of the file to fetch.
        save_path: Destination path on disk.
        timeout: Seconds to wait for the connection and between received
            chunks before giving up (passed to ``requests.get``).
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the file cannot be written or renamed.
    """
    print(f"Downloading {url} to {save_path}...")
    if os.path.exists(save_path):
        print(f"File already exists at {save_path}. Skipping download.")
        return

    partial_path = f"{save_path}.part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for HTTP errors

            # Without a Content-Length header tqdm gets None (unknown total)
            # instead of a misleading total of zero.
            total_size = int(response.headers.get('content-length', 0))

            with open(partial_path, 'wb') as file, tqdm(
                desc=save_path,
                total=total_size or None,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in response.iter_content(chunk_size):
                    file.write(data)
                    bar.update(len(data))

        # Publish the file only after every byte has been written.
        os.replace(partial_path, save_path)
    except BaseException:
        # Also covers KeyboardInterrupt (e.g. interrupting the notebook cell):
        # remove the incomplete file, then let the error propagate unchanged.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    print("Download complete!")

In [None]:
# Fetch the archive at URL into DOWNLOAD_PATH; download_file returns early
# without contacting the server when DOWNLOAD_PATH already exists.
download_file(URL, DOWNLOAD_PATH)

In [None]:
# Extract the tar.gz file
def extract_tarfile(tarfile_path, extract_dir):
    """Unpack a gzip-compressed tar archive into a fresh ``extract_dir``.

    Any existing ``extract_dir`` is deleted first so that stale files from a
    previous run cannot leak into the result. Because the archive comes from
    the network, members are not allowed to land outside ``extract_dir``.

    Args:
        tarfile_path: Path to the ``.tar.gz`` archive.
        extract_dir: Directory to (re)create and extract into.

    Raises:
        tarfile.TarError: If the archive is invalid or a member would be
            written outside ``extract_dir``.
        OSError: If the archive cannot be read or the directory cannot be
            prepared.
    """
    print(f"Extracting {tarfile_path} to {extract_dir}...")
    if os.path.exists(extract_dir):
        print(f"Directory {extract_dir} already exists. Removing it.")
        shutil.rmtree(extract_dir)

    os.makedirs(extract_dir, exist_ok=True)

    with tarfile.open(tarfile_path, 'r:gz') as tar:
        if hasattr(tarfile, 'data_filter'):
            # The 'data' filter rejects absolute paths, '..' escapes and
            # links that point outside the destination.
            tar.extractall(path=extract_dir, filter='data')
        else:
            # Interpreters without extraction filters: at least refuse member
            # names that resolve outside the destination. This fallback does
            # not inspect link targets.
            root = os.path.realpath(extract_dir)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, target]) != root:
                    raise tarfile.TarError(
                        f"Refusing to extract {member.name!r} outside {extract_dir}"
                    )
            tar.extractall(path=extract_dir)

    print("Extraction complete!")

In [None]:
# Unpack DOWNLOAD_PATH into EXTRACT_DIR; extract_tarfile deletes any existing
# EXTRACT_DIR first, so the directory only holds this archive's contents.
extract_tarfile(DOWNLOAD_PATH, EXTRACT_DIR)

In [None]:
# List the top-level entries of the extracted directory (not recursive).
# Useful for checking the names of the nodes/edges TSV files before pruning.
os.listdir(EXTRACT_DIR)

In [None]:
# Columns retained when pruning the nodes and edges TSV files.
# Edit these lists to match your requirements; the order given here is the
# column order of the pruned output.
NODE_COLUMNS_TO_KEEP = [
    'id',
    'name',
    'category',
    'xref',
    'description',
    'synonym',
    'full_name',
    'in_taxon_label',
]
EDGE_COLUMNS_TO_KEEP = [
    'subject',
    'predicate',
    'object',
    'category',
]

In [None]:
# Function to prune TSV files
def prune_tsv_file(input_file, output_file, columns_to_keep):
    """Write a copy of a TSV file that contains only selected columns.

    Requested columns that are absent from the input are skipped (and
    reported). Output columns follow the order of ``columns_to_keep``.

    Values are read as plain strings with NA detection disabled, so cell
    contents are not reinterpreted: identifiers such as ``001`` keep their
    leading zeros and literal text such as ``NA`` or ``null`` is not blanked.
    Only the kept columns are loaded, which keeps memory use down on wide files.

    Args:
        input_file: Path of the tab-separated source file (with header row).
        output_file: Path of the pruned file to write; its parent directory
            is created when needed.
        columns_to_keep: Column names to retain, in output order.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        pandas.errors.EmptyDataError: If the input file has no header row.
    """
    print(f"Pruning {input_file} to {output_file}...")

    # Read just the header to learn which requested columns actually exist.
    available = set(pd.read_csv(input_file, sep='\t', nrows=0).columns)
    present = [col for col in columns_to_keep if col in available]
    missing = [col for col in columns_to_keep if col not in available]
    if missing:
        print(f"Columns not found in {input_file} (skipped): {missing}")

    if present:
        df = pd.read_csv(
            input_file,
            sep='\t',
            usecols=present,
            dtype=str,
            keep_default_na=False,
        )
        # usecols yields columns in file order; restore the requested order.
        df_pruned = df[present]
    else:
        df_pruned = pd.DataFrame()

    # dirname is '' for a bare filename, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the pruned dataframe to a TSV file
    df_pruned.to_csv(output_file, sep='\t', index=False)

    print(f"Pruned file saved to {output_file}")

In [None]:
# Find and prune node and edge TSV files
# Collect every top-level .tsv whose name mentions "nodes" or "edges".
# The listing is sorted so the choice does not depend on the arbitrary order
# in which os.listdir returns entries.
node_candidates = []
edge_candidates = []

for entry in sorted(os.listdir(EXTRACT_DIR)):
    if not entry.endswith('.tsv'):
        continue
    lowered = entry.lower()
    # A name containing both words is treated as a nodes file.
    if 'nodes' in lowered:
        node_candidates.append(os.path.join(EXTRACT_DIR, entry))
    elif 'edges' in lowered:
        edge_candidates.append(os.path.join(EXTRACT_DIR, entry))

# Make an ambiguous match visible instead of silently picking one file.
for label, candidates in (("nodes", node_candidates), ("edges", edge_candidates)):
    if len(candidates) > 1:
        print(f"Warning: multiple {label} files matched: {candidates}. Using {candidates[-1]}")

# None signals "not found" to the pruning step.
nodes_file = node_candidates[-1] if node_candidates else None
edges_file = edge_candidates[-1] if edge_candidates else None

print(f"Found nodes file: {nodes_file}")
print(f"Found edges file: {edges_file}")

In [None]:
# Create the pruned directory
# Start from an empty output directory so files from an earlier run cannot
# end up in the final archive.
if os.path.exists(PRUNED_DIR):
    print(f"Directory {PRUNED_DIR} already exists. Removing it.")
    shutil.rmtree(PRUNED_DIR)

os.makedirs(PRUNED_DIR, exist_ok=True)

# Prune each file that was found, keeping its original file name. A missing
# file is reported rather than skipped silently, because the resulting
# archive would otherwise be incomplete without any visible sign.
if nodes_file:
    pruned_nodes_file = os.path.join(PRUNED_DIR, os.path.basename(nodes_file))
    prune_tsv_file(nodes_file, pruned_nodes_file, NODE_COLUMNS_TO_KEEP)
else:
    print(f"Warning: no nodes TSV file was found in {EXTRACT_DIR}; nothing to prune.")

if edges_file:
    pruned_edges_file = os.path.join(PRUNED_DIR, os.path.basename(edges_file))
    prune_tsv_file(edges_file, pruned_edges_file, EDGE_COLUMNS_TO_KEEP)
else:
    print(f"Warning: no edges TSV file was found in {EXTRACT_DIR}; nothing to prune.")

In [None]:
# Create a new tar.gz file with the pruned data
def create_tarfile(source_dir, output_file):
    """Pack the contents of ``source_dir`` into a gzip-compressed tar archive.

    Each top-level entry of ``source_dir`` is stored under its own name, so
    the archive does not contain the ``source_dir`` folder itself.

    Args:
        source_dir: Directory whose entries are archived.
        output_file: Path of the ``.tar.gz`` file to create.
    """
    print(f"Creating {output_file} from {source_dir}...")
    with tarfile.open(output_file, "w:gz") as archive:
        for entry_name in os.listdir(source_dir):
            # arcname drops the source_dir prefix from the stored path.
            archive.add(os.path.join(source_dir, entry_name), arcname=entry_name)
    print(f"Created {output_file} successfully!")

In [None]:
# Bundle everything in PRUNED_DIR into OUTPUT_FILE; create_tarfile stores each
# entry under its bare name, without the PRUNED_DIR prefix.
create_tarfile(PRUNED_DIR, OUTPUT_FILE)

In [None]:
# Cleanup temporary files and directories
def cleanup():
    """Delete the extraction and pruning working directories if they exist.

    Only EXTRACT_DIR and PRUNED_DIR are removed; nothing else is touched.
    """
    print("Cleaning up temporary files and directories...")
    for working_dir in (EXTRACT_DIR, PRUNED_DIR):
        if os.path.exists(working_dir):
            shutil.rmtree(working_dir)
    print("Cleanup complete!")

In [None]:
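# Uncomment the next line to delete the extracted and pruned working directories
# (the downloaded archive and the final pruned archive are left in place)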
# cleanup()

## Summary

This notebook has:
1. Downloaded the KG-Alzheimers dataset
2. Extracted the tar.gz file
3. Pruned the nodes and edges TSV files to include only specified columns
4. Created a new tar.gz file with the pruned data

The pruned dataset is available at: `kg-alzheimers-pruned.tar.gz`