<a href="https://colab.research.google.com/github/Hiraeth-mist/muscle-aging-snakemake-scrnaseq-pipeline/blob/main/scRNAseq_aging_automation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to this Colab session; later cells read the pipeline
# archive and the sample data from paths under the mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
print("Google Drive mounted successfully!")

In [None]:
import os
import zipfile

# Define the path to your zipped pipeline in Google Drive
pipeline_zip_path = "/content/drive/MyDrive/RNAseq_AutoPipeline_Project/colab_pipeline.zip"

# Define the directory where you want to extract your pipeline files
# We will set our current working directory to this later.
colab_pipeline_root = "/content/colab_pipeline_env" # This will contain Snakefile, config etc.

# Create the extraction directory
os.makedirs(colab_pipeline_root, exist_ok=True)

# Unzip the pipeline file
print(f"Unzipping pipeline files from {pipeline_zip_path} to {colab_pipeline_root}...")
with zipfile.ZipFile(pipeline_zip_path, 'r') as zip_ref:
    # Assumes the zip contains a single top-level folder (e.g., 'my_colab_pipeline/')
    # We want to extract its *contents* directly into colab_pipeline_root
    # This loop handles cases where zip has a top-level dir or not.
    for member in zip_ref.namelist():
        # Get only the base name of the member (e.g., 'Snakefile', 'envs/', 'scripts/')
        member_path = os.path.join(colab_pipeline_root, os.path.relpath(member, zip_ref.namelist()[0].split(os.sep)[0]))
        if member_path == colab_pipeline_root: # Skip if it's the root directory itself
            continue
        if member.endswith('/'): # If it's a directory, create it
            os.makedirs(member_path, exist_ok=True)
        else: # If it's a file, extract it
            with open(member_path, 'wb') as outfile:
                outfile.write(zip_ref.read(member))

print(f"Pipeline files unzipped to: {colab_pipeline_root}")

# Verify contents (optional but recommended for debugging paths)
print("\nContents of pipeline root:")
!ls -F {colab_pipeline_root}
print("\nContents of scripts folder:")
!ls -F {colab_pipeline_root}/scripts

In [None]:
import os
import subprocess

# Define the base directory where your unzipped sample folders are
DATA_ROOT_DIR = "/content/drive/MyDrive/RNAseq_AutoPipeline_Project/data"

# List of your sample folders (ensure these match your actual folder names)
sample_folders = [
    "Muscle-Old-scRNAseq-rep1",
    "Muscle-Old-scRNAseq-rep2",
    "Muscle-Old-scRNAseq-rep3",
    "Muscle-Young-scRNAseq-rep1",
    "Muscle-Young-scRNAseq-rep2",
    "Muscle-Young-scRNAseq-rep3"
]

print("Gzipping data files...")
for sample_folder in sample_folders:
    sample_path = os.path.join(DATA_ROOT_DIR, sample_folder)

    # Check if files exist before gzipping (and if they are not already gzipped)
    files_to_compress = [
        os.path.join(sample_path, "barcodes.tsv"),
        os.path.join(sample_path, "features.tsv"),
        os.path.join(sample_path, "matrix.mtx")
    ]

    for file_path in files_to_compress:
        if os.path.exists(file_path) and not file_path.endswith('.gz'):
            print(f"  Compressing: {file_path}")
            # Use subprocess to run gzip command
            try:
                subprocess.run(f"gzip {file_path}", shell=True, check=True)
            except subprocess.CalledProcessError as e:
                print(f"Error compressing {file_path}: {e}")
                print(f"Output: {e.stdout.decode()} {e.stderr.decode()}")
        elif os.path.exists(file_path + '.gz'):
            print(f"  Already gzipped: {file_path}.gz")
        else:
            print(f"  File not found (or already gzipped): {file_path}")

print("Gzipping complete.")

# Optional: Verify new file names (should now end with .gz)
print("\nVerifying file names after gzipping (first sample):")
!ls -l {DATA_ROOT_DIR}/Muscle-Old-scRNAseq-rep1/

In [None]:
# Part 2, Step 2.4: Install Conda and Snakemake

# Install Miniconda
print("Installing Miniconda...")
!wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
# Install to /usr/local. Use -b for batch mode, -p for prefix.
!bash ./Miniconda3-latest-Linux-x86_64.sh -b -p /usr/local -f # -f forces if directory exists

# Conda's main executable is usually in /usr/local/bin or /usr/local/condabin
# We need to make sure this is in the PATH for ALL subsequent shell commands.
# This approach adds it to the current shell's PATH and makes it persistent for `!` commands.
import os
os.environ['PATH'] = "/usr/local/bin:/usr/local/condabin:" + os.environ['PATH']

# Also add site-packages to sys.path for Python to find packages installed by Conda
import sys
if '/usr/local/lib/python3.11/site-packages' not in sys.path:
    sys.path.append('/usr/local/lib/python3.11/site-packages')

# Now, initialize conda for the *current* shell session.
# This is typically what 'conda init bash' does and then 'source ~/.bashrc'
# The 'hook' is crucial.
!eval "$(conda shell.bash hook)"

# Explicitly activate the base environment (often helps Snakemake)
!conda activate base

# Install Snakemake using pip (after conda setup)
# We ensure pip uses the python from the conda environment (if activated)
print("Installing Snakemake...")
!pip install snakemake

# Verify installations
print("\nVerifying installations:")
!which conda # Should print /usr/local/bin/conda or /usr/local/condabin/conda
!conda --version
!snakemake --version

print("\nMiniconda and Snakemake installation attempt complete.")

# Install pandas for the main Snakemake execution environment
print("Installing pandas...")
!pip install pandas
print("pandas installed.")

In [None]:
# Accept Conda Terms of Service

print("Accepting Conda Terms of Service for required channels...")
# This command accepts the ToS for the 'main' channel
!conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main

# This command accepts the ToS for the 'r' channel (for R packages if needed, which bioconda often pulls from)
!conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r

print("Conda Terms of Service accepted.")

In [None]:
import os

# Make the extracted pipeline directory the working directory. The value must
# match the extraction directory used by the unzip cell.
colab_pipeline_root = "/content/colab_pipeline_env"
os.chdir(colab_pipeline_root)
print("Current working directory set to: " + os.getcwd())

In [None]:
import os

# Directory on the Colab VM for pipeline results; creating it is a no-op when
# it already exists.
output_dir_in_colab = "/content/colab_project_results"
os.makedirs(name=output_dir_in_colab, exist_ok=True)
print("Output directory created: {}".format(output_dir_in_colab))

In [None]:
print("Starting Snakemake pipeline execution...")
# --use-conda: Tells Snakemake to manage environments using Conda.
# --cores 2: Use 2 CPU cores. Adjust if Colab offers more (e.g., --cores 4).
# --verbose: Provides more detailed output for debugging.
# --printshellcmds: Shows the actual shell commands being executed by each rule.
# --conda-frontend conda: Explicitly tells Snakemake to use the 'conda' command.
!snakemake --use-conda --cores 2 --verbose --printshellcmds --conda-frontend conda
print("\nSnakemake pipeline execution finished.")