In [7]:
import pandas as pd

# Load the mapping file
mapping_df = pd.read_excel("models/BRCA/ensemble_mirbase_mapping.xlsx")

# Load the KMG expression file
kmg_df = pd.read_csv("models/BRCA/20250327_rpm_KMG0071.csv")

# Merge based on the ensemble name
merged_df = pd.merge(mapping_df, kmg_df, left_on='ensemble', right_on='miRNA')

# Replace 'ensemble' name with 'miRbase' and keep only relevant columns
final_df = merged_df[['miRbase', 'KMG0071_R1']].rename(columns={'miRbase': 'miRNA'})

# Save to a new file
final_df.to_csv("models/BRCA/KMG0071_mapped.csv", index=False)


In [8]:
import pandas as pd

# Load the mapping file
mapping_df = pd.read_excel("models/BRCA/ensemble_mirbase_mapping.xlsx")

# Load the multi-sample KMG expression matrix
expr_df = pd.read_csv("models/BRCA/20250327_rpm_matrix.csv")

# Merge on 'ensemble' = 'miRNA' in expr_df
# First, ensure column names match for merging
merged_df = pd.merge(mapping_df, expr_df, left_on='ensemble', right_on='miRNA')

# Replace 'ensemble' name with 'miRbase' and drop the original miRNA column
# Keep all sample columns
final_df = merged_df.drop(columns=['ensemble', 'miRNA'])
final_df = final_df.rename(columns={'miRbase': 'miRNA'})

# Reorder columns so that 'miRNA' is first
cols = ['miRNA'] + [col for col in final_df.columns if col != 'miRNA']
final_df = final_df[cols]

# Save the final output
final_df.to_csv("models/BRCA/20250327_rpm_matrix_mapped.csv", index=False)


In [None]:
import pandas as pd
import re
import os

# Set the folder path
folder = "models/BRCA"

# File paths
expr_file = os.path.join(folder, "20250327_rpm_matrix2.csv")
meta_file = os.path.join(folder, "AVITI data_updated 20241212-Haziq - 20250327miRNA.csv")
output_file = os.path.join(folder, "20250327_rpm_matrix2_with_cancer_types.csv")

# Load expression matrix
expr_df = pd.read_csv(expr_file)

# Load metadata with sep auto-detection
meta_df = pd.read_csv(meta_file, sep=None, engine='python')

# Clean and normalize column names
meta_df.columns = [col.strip() for col in meta_df.columns]

# Debug: print column names if needed
# print("Columns in metadata:", meta_df.columns.tolist())

# Extract only English cancer type names
def extract_cancer_type(row):
    cancer_info = row.get("Type of Cancer", "")
    if pd.isna(cancer_info):
        return "-"
    return cancer_info.split('-')[-1].strip()

meta_df["Cancer_Type"] = meta_df.apply(extract_cancer_type, axis=1)

# Create mapping: sample ID (like KMG0025) → Cancer Type (like Liver cancer)
sample_to_cancer = dict(zip(meta_df["Sample number"], meta_df["Cancer_Type"]))

# Build new row: Cancer Type
cancer_row = ["Cancer Type"]
for col in expr_df.columns[1:]:
    match = re.match(r"(KMG\d+)", col)
    if match:
        sample = match.group(1)
        cancer_type = sample_to_cancer.get(sample, "Unknown")
        cancer_row.append(cancer_type)
    else:
        cancer_row.append("Unknown")

# Insert the new row under the header
new_header = pd.DataFrame([expr_df.columns])
cancer_type_row = pd.DataFrame([cancer_row])
rest_of_data = expr_df.copy()
rest_of_data.columns = range(len(rest_of_data.columns))  # temporarily remove column names

# Combine and save
final_df = pd.concat([new_header, cancer_type_row, rest_of_data], ignore_index=True)
final_df.to_csv(output_file, index=False, header=False)

print(f"✅ Success! New file saved at: {output_file}")


ParserError: 'I' expected after '"'

In [11]:
df = pd.read_csv("models/BRCA/AVITI data_updated 20241212-Haziq - 20250327miRNA.csv")

In [12]:
df

Unnamed: 0,AVITI\nNumber,Sample number,Type of Cancer,Sex,Age,Library Qbit concentration（ng/ul）,Number of Lead count,Haziq\nMapping Ratio (%),TapeStation Peak\n\n,Notification,リード数\n10万以上,Unnamed: 11
0,1.0,PhiX,-,-,-,-,,,,,,
1,2.0,KMG0025,肝臓がん-Liver cancer,M,51,7.4,267995,34.97,,,KMG0025,肝臓がん-Liver cancer
2,3.0,KMG0037,健常者-Non-disabled person,F,26,0.838,1823269,76.75,,,KMG0037,健常者-Non-disabled person
3,4.0,KMG0038,健常者-Non-disabled person,F,45,10.6,2079280,69.84,,,KMG0038,健常者-Non-disabled person
4,5.0,KMG0039,健常者-Non-disabled person,F,43,10.3,2777926,74.48,,,KMG0039,健常者-Non-disabled person
...,...,...,...,...,...,...,...,...,...,...,...,...
245,246.0,KMG0192_2,"術後フォロー-Post-operative follow-up, 肺がん-Lung cancer",,,0.47,58831,16.56,re-inspection,,,
246,247.0,KMG0315,神経内分泌腫瘍-Neuroendocrine neoplasm,M,68,0.214,399046,76.82,,,KMG0315,神経内分泌腫瘍-Neuroendocrine neoplasm
247,248.0,KMG0316,膵がん-Pancreatic cancer,F,76,1.77,77861,69.61,re-inspection,,,
248,249.0,KMG0317,胃がん-Gastric cancer,M,80,0.738,66210,32.10,re-inspection,,,
