In [None]:
#TO CREATE A CSV FILE WITH SAMPLEID AS FILE_NAME PRESENT IN THE BLADDER FOLDER
import pandas as pd
import os

def _read_star_counts_file(file_path, file_name, required_columns):
    """Read one STAR-Counts TSV and return only the required columns for real gene rows.

    Raises:
        ValueError: If any of ``required_columns`` is missing from the file header.
    """
    # The first line is a comment (e.g. "# gene-model: GENCODE v36"),
    # so the column header is on the second line of the file.
    df = pd.read_csv(file_path, sep='\t', skiprows=1, header=0)

    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns in file {file_name}: {missing_cols}. Available: {df.columns.tolist()}")

    # Rows whose gene_id starts with 'N_' (e.g. N_unmapped) are summary statistics, not genes.
    df = df[~df['gene_id'].astype(str).str.startswith('N_')]
    return df[required_columns].copy()


def load_tcga_gene_expression_long_format(base_download_dir, file_pattern=".tsv", value_column='fpkm_uq_unstranded'):
    """
    Load gene expression data from multiple TCGA download folders into one long-format DataFrame.

    Each immediate subfolder of ``base_download_dir`` is scanned for a file whose name
    ends with ``file_pattern``. The first such file (in sorted name order) that loads
    successfully is used for that folder:
    - The initial comment line is skipped and the header is read from the second line.
    - Summary rows whose gene_id starts with 'N_' (e.g. N_unmapped) are dropped.
    - Sample_ID is the file name with the trailing ``file_pattern`` removed. Only the
      suffix is removed; earlier occurrences of the pattern inside the name are kept.

    Folders and files are processed in sorted order so the result is reproducible
    regardless of the order in which the filesystem lists directory entries.

    Args:
        base_download_dir (str): Path to the directory that contains the individual
                                 sample folders.
        file_pattern (str): Suffix identifying the gene expression file inside each
                            sample folder (e.g. '.rna_seq.augmented_star_gene_counts.tsv').
        value_column (str): Name of the column in the TSV file that holds the gene
                            expression values (e.g. 'fpkm_uq_unstranded').

    Returns:
        pandas.DataFrame: One row per gene per sample, with columns Sample_ID,
                          gene_name, gene_type, gene_id and ``value_column`` (in that
                          order). Returns None if ``base_download_dir`` is not a
                          directory or if no file could be loaded.
    """
    all_sample_gene_data = []

    print(f"Starting to load data from: {base_download_dir}")

    # Columns every input file must provide
    required_columns = ['gene_id', 'gene_name', 'gene_type', value_column]

    if not os.path.isdir(base_download_dir):
        print(f"Error: Base download directory '{base_download_dir}' not found.")
        return None

    for sample_folder_name in sorted(os.listdir(base_download_dir)):
        sample_folder_path = os.path.join(base_download_dir, sample_folder_name)
        if not os.path.isdir(sample_folder_path):
            continue

        print(f"Processing folder: {sample_folder_name}")
        matching_files = sorted(
            name for name in os.listdir(sample_folder_path) if name.endswith(file_pattern)
        )
        if not matching_files:
            print(f"  - No gene expression file matching '{file_pattern}' found in {sample_folder_path}")
            continue

        loaded = False
        for file_name in matching_files:
            file_path = os.path.join(sample_folder_path, file_name)
            try:
                df_subset = _read_star_counts_file(file_path, file_name, required_columns)
            except Exception as e:
                # Best effort: report the problem and try the next matching file, if any.
                print(f"  - Error loading or processing {file_name} in {sample_folder_name}: {e}")
                continue

            # Slice off the trailing suffix only; str.replace would also remove
            # any earlier occurrence of the pattern inside the file name.
            sample_id = file_name[:len(file_name) - len(file_pattern)]
            df_subset['Sample_ID'] = sample_id
            all_sample_gene_data.append(df_subset)

            print(f"  - Loaded {file_name} and extracted required columns with Sample_ID: {sample_id}.")
            loaded = True
            break  # One file per sample folder is enough

        if not loaded:
            print(f"  - No file matching '{file_pattern}' in {sample_folder_path} could be loaded successfully.")

    if not all_sample_gene_data:
        print("No gene expression data was loaded. Please check your path, file patterns, and column names.")
        return None

    # Stack the per-sample tables into a single long-format table
    final_df_long = pd.concat(all_sample_gene_data, ignore_index=True)

    # Fixed output column order: Sample_ID, gene_name, gene_type, gene_id, <value_column>
    final_df_long = final_df_long[['Sample_ID', 'gene_name', 'gene_type', 'gene_id', value_column]]

    print("\nData loading complete. Combined into long format DataFrame.")
    print(f"Final DataFrame shape: {final_df_long.shape}")
    return final_df_long

# --- Main execution block ---
#
# Settings to adjust for your own machine before running this cell.

# Directory holding one sub-folder per downloaded TCGA sample
# (e.g. '001a2b3c-d4e5-f6a7-b8c9-d0e1f2a3b4c5'); each sub-folder is expected
# to contain that sample's .tsv file.
DOWNLOAD_BASE_DIR = 'C:/Users/mohak/Desktop/Bladder'

# Suffix stripped from each file name to obtain its Sample_ID.
FILE_IDENTIFIER = '.rna_seq.augmented_star_gene_counts.tsv'

# Expression column copied into the combined table.
VALUE_COLUMN_TO_USE = 'fpkm_uq_unstranded'

# Build the combined long-format table (None signals that nothing was loaded).
gene_expression_long_df = load_tcga_gene_expression_long_format(
    value_column=VALUE_COLUMN_TO_USE,
    file_pattern=FILE_IDENTIFIER,
    base_download_dir=DOWNLOAD_BASE_DIR,
)

if gene_expression_long_df is None:
    print("\nData loading failed. No CSV file will be generated.")
else:
    print("\nFirst 10 rows of the loaded data in long format:")
    print(gene_expression_long_df.head(10))

    # Destination folder for the combined CSV. On macOS/Linux this would be
    # something like '/Users/<name>/Desktop' instead.
    desktop_path = 'C:/Users/mohak/Desktop'
    output_csv_file_long = os.path.join(desktop_path, 'TCGA_Combined_GeneExpression_Data.csv')

    # Write the table without the pandas row index.
    gene_expression_long_df.to_csv(output_csv_file_long, index=False)

    print(f"\nCombined gene expression data in long format successfully saved to: {output_csv_file_long}")
    print(
        "\n**Important:** This message indicates the file *would* be saved if you run this script on your local machine.",
        "Please check the specified output path (or the script's directory) on your computer for the file.",
        sep="\n",
    )

In [None]:
#SEARCH ROWS WITH PARTICULAR SAMPLEID
import pandas as pd

# --- Settings to edit before running this cell ---
# Combined CSV to search (e.g. 'C:/Users/YourUsername/Desktop/TCGA_Combined_GeneExpression_Data.csv').
COMBINED_CSV_FILE_PATH = 'C:/Users/mohak/Desktop/TCGA-BLCA.csv'

# Sample_ID whose rows should be printed.
TARGET_SAMPLE_ID = 'c3b6e9cc-9c7e-4740-bf0e-022338d82160'
# --- End of settings ---


# Everything stays inside one try block so that any failure (missing file,
# parse error, unexpected data) is reported as a message rather than a traceback.
try:
    df = pd.read_csv(COMBINED_CSV_FILE_PATH)
    print(f"Loaded '{COMBINED_CSV_FILE_PATH}'.")

    if 'Sample_ID' in df.columns:
        # Exact, case-sensitive match on the Sample_ID column.
        found_rows = df.loc[df['Sample_ID'].eq(TARGET_SAMPLE_ID)]

        if found_rows.empty:
            print(f"\nNo data found for Sample_ID: '{TARGET_SAMPLE_ID}'.")
        else:
            print(f"\n--- Data found for Sample_ID: '{TARGET_SAMPLE_ID}' ---")
            print(found_rows.to_string())
            print(f"\nTotal rows found: {len(found_rows)}")
    else:
        print(f"Error: 'Sample_ID' column not found in the file. Available columns: {df.columns.tolist()}")

except FileNotFoundError:
    print(f"Error: File not found at '{COMBINED_CSV_FILE_PATH}'. Please check the path.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# Quick preview: load the TCGA.csv file and show its first 10 rows.
# NOTE: this cell has no import of its own, so `pd` must already be defined
# (presumably by an earlier cell that ran `import pandas as pd`).
a = pd.read_csv("C:/Users/mohak/Desktop/TCGA.csv")
# Left as a bare trailing expression so the notebook displays the result.
a.head(10)

In [None]:
#FILTERS ENTRIES IN TCGA-BLCA DATASET ACCORDING TO THOSE PRESENT IN THE ARRANGED_TABLE ONE
import pandas as pd
import os

# --- IMPORTANT: adjust these file paths for your machine ---
# Combined gene expression CSV; it must contain a 'Sample_ID' column.
tcga_blca_file_path = 'C:/Users/mohak/Desktop/TCGA-BLCA.csv'

# Excel workbook listing the files to keep; it must contain a 'File_name' column.
arranged_table_file_path = 'C:/Users/mohak/Desktop/Arranged Table.xlsx'

# Name of the filtered output file
output_file_name = 'TCGA_BLCA_Filtered_By_FilenameMatch.csv'

# Folder the filtered output file is written to
desktop_path = 'C:/Users/mohak/Desktop'
output_full_path = os.path.join(desktop_path, output_file_name)
# --- END OF PATH CHANGES ---

# Suffix removed from the File_name column of the Arranged Table to obtain the base ID
FILENAME_SUFFIX_TO_REMOVE = '.rna_seq.augmented_star_gene_counts.tsv'

# Encoding used when reading the TCGA-BLCA CSV (try 'latin1' or 'cp1252' on encoding errors)
encoding_to_use = 'utf-8'


def clean_id_series(values, suffix=''):
    """Return ``values`` as stripped ID strings, keeping missing entries missing.

    Args:
        values (pandas.Series): Raw identifiers or file names; may contain
            missing values and non-string entries.
        suffix (str): Text removed from every entry before stripping
            whitespace. An empty string removes nothing.

    Returns:
        pandas.Series: Same index as ``values``. Present entries are converted
        to ``str``, have ``suffix`` removed and surrounding whitespace
        stripped; missing entries stay missing instead of turning into the
        literal text 'nan', so they can never match each other by accident.
    """
    present = values.notna()
    cleaned = values.astype(str)
    if suffix:
        cleaned = cleaned.str.replace(suffix, '', regex=False)
    # where() blanks out the positions that were missing in the input.
    return cleaned.str.strip().where(present)


try:
    # 1. Load the TCGA-BLCA file
    df_tcga_blca = pd.read_csv(tcga_blca_file_path, encoding=encoding_to_use)
    print(f"Successfully loaded '{tcga_blca_file_path}'. Shape: {df_tcga_blca.shape}")

    # 2. Load the Arranged Table file
    df_arranged = pd.read_excel(arranged_table_file_path)
    print(f"Successfully loaded '{arranged_table_file_path}'. Shape: {df_arranged.shape}")

    # --- Verify necessary columns exist ---
    # The messages carry no "Error:" prefix because the ValueError handler below adds it.
    if 'Sample_ID' not in df_tcga_blca.columns:
        raise ValueError(f"'Sample_ID' column not found in '{tcga_blca_file_path}'.")
    if 'File_name' not in df_arranged.columns:
        raise ValueError(f"'File_name' column not found in '{arranged_table_file_path}'. This column is needed for matching.")

    # --- Turn each File_name in the Arranged Table into its base ID ---
    df_arranged['Processed_File_ID'] = clean_id_series(df_arranged['File_name'], FILENAME_SUFFIX_TO_REMOVE)

    # Unique base IDs to keep; rows without a File_name contribute nothing.
    ids_to_keep = df_arranged['Processed_File_ID'].dropna().unique()
    print(f"Found {len(ids_to_keep)} unique processed File_IDs in '{arranged_table_file_path}' to use for filtering.")

    # --- Clean Sample_ID in TCGA-BLCA for robust matching (remove any stray whitespace) ---
    df_tcga_blca['Sample_ID_Cleaned'] = clean_id_series(df_tcga_blca['Sample_ID'])

    # 3. Filter the TCGA-BLCA DataFrame
    # Rows with a missing Sample_ID never match, because isin() is False for them.
    df_filtered_tcga_blca = df_tcga_blca[df_tcga_blca['Sample_ID_Cleaned'].isin(ids_to_keep)].copy()

    # Remove the temporary cleaned column before saving
    df_filtered_tcga_blca.drop(columns=['Sample_ID_Cleaned'], inplace=True)

    print(f"\nFiltered DataFrame created. Original TCGA-BLCA rows: {df_tcga_blca.shape[0]}, Filtered rows: {df_filtered_tcga_blca.shape[0]}")
    print("\nFirst 5 rows of the new filtered DataFrame:")
    print(df_filtered_tcga_blca.head().to_string())

    # 4. Save the new filtered CSV file
    df_filtered_tcga_blca.to_csv(output_full_path, index=False)
    print(f"\nFiltered data successfully saved to: '{output_full_path}'")
    print("Please check this path on your computer for the new file.")

except FileNotFoundError as e:
    print(f"Error: One of the input files was not found. Please check the paths. {e}")
except ValueError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")