In [1]:
import pandas as pd
import numpy as np

# Cleaning CGGA/TCGA RNA-seq Expression Matrix

# Placeholder strings that are treated as missing values and replaced by NaN.
missing_indicators = [
    'not reported', 'unknown', 'not available', 'N/A', 'NA', '--', 'null', ' '
]


def clean_expression_matrix(expression_df, gene_column='sample',
                            min_expressed_fraction=0.1, *, verbose=True):
    """Return a cleaned, numeric, gene-indexed copy of an expression matrix.

    Steps, in order:
      1. Replace every value listed in ``missing_indicators`` with NaN.
      2. Drop rows whose ``gene_column`` value repeats, keeping the first.
      3. Use ``gene_column`` as the index.
      4. Coerce all remaining columns to numeric (unparseable values -> NaN).
      5. Keep only rows with a value > 0 in at least
         ``min_expressed_fraction`` of the columns.

    Args:
        expression_df: DataFrame with one row per gene; ``gene_column`` holds
            the gene names and every other column is treated as a sample.
            The input frame is not modified.
        gene_column: Name of the column holding the gene names.
        min_expressed_fraction: Minimum fraction of columns in which a row
            must be > 0 to be kept.
        verbose: When True, print the same progress messages as the script.

    Returns:
        The cleaned DataFrame indexed by ``gene_column``.

    Raises:
        KeyError: If ``gene_column`` is not a column of ``expression_df``.
    """
    # DataFrame.replace returns a new frame, so the caller's data is untouched.
    cleaned = expression_df.replace(missing_indicators, np.nan)

    null_count = cleaned.isnull().sum().sum()
    if verbose:
        print(f"\nTotal missing values found: {null_count}")

    duplicate_genes = cleaned[gene_column].duplicated().sum()
    if duplicate_genes > 0:
        if verbose:
            print(f"Found {duplicate_genes} duplicate gene names. Keeping the first occurrence.")
        cleaned = cleaned.drop_duplicates(subset=[gene_column], keep='first')

    cleaned = cleaned.set_index(gene_column)

    # Coerce to numeric BEFORE filtering: a column that was parsed as strings
    # (e.g. because it contained '--') would otherwise make `cleaned > 0`
    # raise TypeError when comparing str with int.
    cleaned = cleaned.apply(pd.to_numeric, errors='coerce')

    # NaN > 0 is False, so missing values count as "not expressed".
    threshold = min_expressed_fraction * len(cleaned.columns)
    cleaned = cleaned[(cleaned > 0).sum(axis=1) >= threshold]

    if verbose:
        print(f"Shape after filtering low-expression genes: {cleaned.shape}")

    return cleaned


# The guard keeps the file I/O out of plain imports; in a notebook cell or when
# run as a script __name__ is "__main__", so the names below remain globals.
if __name__ == "__main__":
    # Single source of truth for the input path (this is the file actually read).
    file_path = "CGGA_data/CGGA_RNAseq.txt"
    expression_df = pd.read_csv(file_path, sep="\t")

    print(f"Original matrix shape: {expression_df.shape}")
    print(f"Number of genes: {expression_df.shape[0]}")
    # One column holds the gene names, the rest are samples.
    print(f"Number of samples: {expression_df.shape[1] - 1}")

    expression_df = clean_expression_matrix(expression_df)

    output_path = 'cleaned_RNAseq_data.csv'
    expression_df.to_csv(output_path)

    print(f"\nFinal cleaned shape: {expression_df.shape}")
    print(f"Cleaned data saved to: {output_path}")

Original matrix shape: (20530, 703)
Number of genes: 20530
Number of samples: 702

Total missing values found: 0
Shape after filtering low-expression genes: (19091, 702)

Final cleaned shape: (19091, 702)
Cleaned data saved to: cleaned_RNAseq_data.csv
