# PSI Metadata Cleaning Notebook

This notebook processes PSI-166 ISA-Tab metadata into a standardized CSV format (`psi_metadata.csv`) for integration with MP and G-Space datasets.

# 📦 2. Unzip the ISA-Tab Package

In [13]:
pwd

'/home/nanohub/q9/MP-PSI_Project/notebooks'

In [2]:
import zipfile
import os

zip_path = "/home/nanohub/q9/MP-PSI_Project/data_raw/psi/PSI-166_metadata_PSI-166-ISA.zip"
extract_path = "/home/nanohub/q9/MP-PSI_Project/data_raw/psi/psi_isa_extracted"

# Create folder if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

# Unzip
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("✅ Unzipped to:", extract_path)

✅ Unzipped to: /home/nanohub/q9/MP-PSI_Project/data_raw/psi/psi_isa_extracted


In [4]:
import pandas as pd

In [6]:
# Load the ISA sample file (tab-delimited)
sample_file = "/home/nanohub/q9/MP-PSI_Project/data_raw/psi/psi_isa_extracted/s_PSI-166.txt"  # Adjust path if needed
df = pd.read_csv(sample_file, sep='\t', dtype=str)

# Preview the first few rows

In [9]:
print(df.head())

       Source Name      Sample Name
0  Do not use this  Do not use this


# Save as CSV

In [10]:
csv_output = "PSI-166_sample_metadata.csv"
df.to_csv(csv_output, index=False)

print(f"✅ CSV saved as: {csv_output}")

✅ CSV saved as: PSI-166_sample_metadata.csv


# Get and print the absolute path

In [12]:
full_path = os.path.abspath(csv_output)
print(f"📁 Full path to CSV: {full_path}")

📁 Full path to CSV: /home/nanohub/q9/MP-PSI_Project/notebooks/PSI-166_sample_metadata.csv


# 📄 3. Load Study File

In [14]:
import pandas as pd
psi_study = pd.read_csv(full_path, sep="\t")

print("✅ Loaded study file with shape:", psi_study.shape)
psi_study.head()

✅ Loaded study file with shape: (1, 1)


Unnamed: 0,"Source Name,Sample Name"
0,"Do not use this,Do not use this"


# 🧹 4. Clean Column Names

In [None]:
# Simplify verbose ISA-Tab headers
psi_study.columns = [col.replace("Characteristics[", "").replace("Factor Value[", "").replace("]", "").strip() for col in psi_study.columns]

psi_study.head()

# 🧬 5. Rename Columns to Match Schema

In [None]:
# Rename to match expected schema for merging
psi_study.rename(columns={
    "Material": "material_id",
    "Temperature": "temperature",
    "Gravity": "gravity_level"
}, inplace=True)

psi_study.head()

# 📤 6. Export Cleaned Metadata

In [None]:
output_path = "data_processed/psi_metadata.csv"
os.makedirs("data_processed", exist_ok=True)

psi_study.to_csv(output_path, index=False)
print("✅ Saved cleaned metadata to:", output_path)

# 🧠 7. Optional: Validate Required Columns

In [None]:
required_cols = ["material_id", "temperature", "gravity_level"]
missing = [col for col in required_cols if col not in psi_study.columns]

if missing:
    print("⚠️ Missing columns:", missing)
else:
    print("✅ All required columns present")