In [None]:
import pandas as pd


def load_snf_list(txt_path):
    """
    Load SNF names from a text file.

    The file is expected to hold SNF names separated by commas
    (e.g., LJ1099, LJ1109,...). Line breaks are accepted as separators as
    well, so one-name-per-line files and comma lists wrapped over several
    lines are parsed the same way.

    Parameters
    ----------
    txt_path : str or path-like
        Location of the text file to read.

    Returns
    -------
    list of str
        The names in file order, with surrounding whitespace removed and
        empty entries dropped. Duplicates are kept.
    """
    # "utf-8-sig" decodes plain UTF-8 and also drops a leading byte-order
    # mark, which would otherwise stay attached to the first name.
    with open(txt_path, "r", encoding="utf-8-sig") as file:
        content = file.read()
    # Treat commas and line breaks alike, then discard blank entries.
    tokens = content.replace(",", "\n").splitlines()
    return [name.strip() for name in tokens if name.strip()]


def filter_dataset_by_snf(df, snf_list):
    """
    Keep only the rows of ``df`` whose 'Name' value appears in ``snf_list``.

    The selected rows keep their original relative order and are returned
    as a new DataFrame with a fresh 0..n-1 index; ``df`` is not modified.
    """
    # Boolean mask: True where the row's 'Name' is one of the requested names.
    name_is_wanted = df["Name"].isin(snf_list)
    selected_rows = df.loc[name_is_wanted]
    # Discard the old index so the result is numbered from zero.
    return selected_rows.reset_index(drop=True)


def main(
    csv_path="data/all_stdh_dataset.csv",
    snf_txt_path="data/TSC01_SNFs.txt",
    output_path="all_stdh_dataset_tsc01.csv",
):
    """
    Filter the full dataset down to the SNFs named in a text file and save it.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file holding the full dataset; it must contain a 'Name' column.
    snf_txt_path : str or path-like
        Text file with the SNF names to keep (read by ``load_snf_list``).
    output_path : str or path-like
        Destination CSV for the filtered rows (written without the index).

    The defaults reproduce the paths this function previously hard-coded,
    so calling ``main()`` with no arguments behaves as before.
    """
    # Load dataset and SNF list
    df = pd.read_csv(csv_path)
    snf_list = load_snf_list(snf_txt_path)

    # Filter dataset
    filtered_df = filter_dataset_by_snf(df, snf_list)

    # Print result summary
    print(f"Original dataset size: {len(df)} rows")
    print(f"Filtered dataset size: {len(filtered_df)} rows")

    # Requested names with no matching row would otherwise vanish silently;
    # report them so a typo in the list file is noticed.
    missing = sorted(set(snf_list) - set(filtered_df["Name"]))
    if missing:
        print(f"Warning: {len(missing)} SNF name(s) not found in dataset: {missing}")

    # Save filtered result
    filtered_df.to_csv(output_path, index=False)


# Run the filtering workflow when this code is executed as the main program
# (the recorded output below shows main() ran when this cell was executed).
if __name__ == "__main__":
    main()

Original dataset size: 6870 rows
Filtered dataset size: 56 rows


In [None]:
import pandas as pd

# Look up the SNF_id of every SNF name listed in TSC01_SNFs.txt and save the
# IDs both as a comma-separated text file and as a one-column, header-less CSV.
df = pd.read_csv('data/all_stdh_dataset.csv')

# NOTE(review): this reads 'TSC01_SNFs.txt' from the working directory, while
# the first cell reads 'data/TSC01_SNFs.txt' — confirm which location is intended.
# "utf-8-sig" reads plain UTF-8 and strips a byte-order mark if one is present,
# so the first name cannot pick up an invisible prefix.
with open('TSC01_SNFs.txt', 'r', encoding='utf-8-sig') as f:
    text = f.read()
# Names may be separated by commas and/or line breaks.
names = [n.strip() for n in text.replace(',', '\n').splitlines() if n.strip()]


mapping = df.set_index('Name')['SNF_id']

# Fail with the complete list of unknown names instead of an opaque lookup error.
missing = [n for n in names if n not in mapping.index]
if missing:
    raise KeyError(f"Names not found in 'Name' column: {missing}")

# NOTE: if 'Name' holds duplicate entries, .loc returns every matching row,
# so snf_ids can be longer than names.
snf_ids = mapping.loc[names].tolist()

print(snf_ids)
with open('TSC01_SNF_id.txt', 'w', encoding='utf-8') as f:
    # str() keeps the join working even if SNF_id was parsed as a non-string dtype.
    f.write(', '.join(map(str, snf_ids)))
pd.DataFrame({'SNF_id': snf_ids}).to_csv('TSC01_SNF_id.csv', index=False, header=False)



# End 

['1A0016', '1A0026', '1A0009', '1A0003', '1A0066', '1A0098', '1A0319', '1A0103', '1A0198', '1A0329', '1A0407', '1A0028', '1A0272', '1A0119', '1B0548', '1B0506', '1B0513', '1A0238', '1A0248', '1A0007', '1A0406', '1A0089', '1B0541', '1B0501', '1B0547', '1A0311', '1A0273', '1A0012', '1A0013', '1A0338', '1A0088', '1B0430', '1B0475', '1B0516', '1A0306', '1A0190', '1A0015', '1A0014', '1A0072', '1B0500', '1B0514', '1B0542', '1A0297', '1A0361', '1A0360', '1A0138', '1A0149', '1A0373', '1A0385', '1A0218', '1A0129', '1A0010', '1A0068', '1A0109', '1A0011', '1A0018']


In [None]:
import pandas as pd
from pathlib import Path
# Replace the original 'Type' labels in all_stdh_dataset_tsc01.csv with generic
# aliases (TypeA, TypeB, ...) and write the result back to the same file.
# NOTE(review): the project root is taken to be two levels above the current
# working directory — confirm the notebook is always launched from that depth.
project_root = Path.cwd().resolve().parents[1]
data_file = project_root / "data" / "test_files" / "all_stdh_dataset_tsc01.csv"
df = pd.read_csv(data_file)
type_map = {
    "GE88-1": "TypeA",
    "GE88-2": "TypeB",
    "Atrium10": "TypeC",
    "SPC88": "TypeD",
    "GE9B": "TypeE"
}

print("Original 'Type' values:")
print(df["Type"].unique(), "\n")

# The file is overwritten in place at the end of this cell, so on a re-run the
# column already holds the aliases. Accept them as known values; otherwise a
# second run would raise on 'TypeA', 'TypeB', ... even though nothing is wrong.
known = set(type_map.keys()) | set(type_map.values())
unknown = set(df["Type"].unique()) - known
if unknown:
    print(f"❗ Found unmapped Type values: {unknown}\n")
    print("Rows with unmapped Type values:")
    print(df[df["Type"].isin(unknown)])
    raise ValueError(f"Unmapped Type values found: {unknown}")

# replace() leaves values that are not keys of type_map untouched, so labels
# that were already converted pass through unchanged.
df["Type"] = df["Type"].replace(type_map)

df.to_csv(data_file, index=False)


Original 'Type' values:
['GE88-1' 'GE88-2'] 

