<a href="https://colab.research.google.com/github/MostaryKhatun/AdvanceCriptography/blob/main/stressDataPreprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the google.colab module is only
# importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


 STEP 1: Upload the Files

In [None]:
# Upload the input files through Colab's files.upload() helper and keep its
# return value in `uploaded`. Later cells open the data files by relative
# path — presumably the directory the upload lands in; confirm on your runtime.
from google.colab import files
uploaded = files.upload()


In [3]:
import os
import csv
import numpy as np
import pandas as pd


Convert text files to CSV

In [4]:
def convert_txt_to_csv(input_file, output_file, delimiter='\t'):
    """
    Converts a delimited text file to a comma-separated CSV file.

    The input is decoded as UTF-8 with an optional byte-order mark
    ('utf-8-sig'), so a BOM at the start of the file is never glued to the
    first header name. The output is written as plain UTF-8.

    Any error is reported with a printed message instead of being raised, so
    a failed conversion does not interrupt the calling cell.

    Parameters:
    - input_file (str): Path to the input text file.
    - output_file (str): Path to the output CSV file.
    - delimiter (str): Delimiter used in the input text file (default is tab-delimited).

    Returns:
    - None
    """
    try:
        # newline='' leaves line-ending handling to the csv module.
        # The input is opened first, so a missing input never creates the output.
        with open(input_file, 'r', newline='', encoding='utf-8-sig') as infile, \
             open(output_file, 'w', newline='', encoding='utf-8') as outfile:
            reader = csv.reader(infile, delimiter=delimiter)
            csv.writer(outfile).writerows(reader)
        print(f"File '{input_file}' has been successfully converted to '{output_file}'.")
    except Exception as e:
        print(f"An error occurred while converting '{input_file}': {e}")

Copy the adjusted p-value and gene symbol columns

In [5]:
def merge_csv_files(
    first_csv_path,
    second_csv_path,
    output_csv_path,
    first_key_column,
    second_key_column,
    columns_to_copy,
    delimiter=','
):
    """
    Copies selected columns from one CSV into another, matching rows by key.

    Every row of the second file is written to the output. The columns named
    in `columns_to_copy` are looked up in the first file (matching
    `second_key_column` against `first_key_column`) and inserted right after
    the second file's first column. Rows without a match get empty strings.
    If a key occurs several times in the first file, the last occurrence wins.

    Both inputs are decoded as UTF-8 with an optional byte-order mark, so a
    BOM cannot hide the first header name. The output file is only created
    once both inputs have been validated, so a failed merge never leaves an
    empty output file behind.

    Problems are reported with a printed message and the function returns
    early; nothing is raised to the caller.

    Parameters:
    - first_csv_path (str): CSV providing the columns to copy.
    - second_csv_path (str): CSV that receives the extra columns.
    - output_csv_path (str): Path of the merged CSV to write.
    - first_key_column (str): Key column name in the first CSV.
    - second_key_column (str): Key column name in the second CSV.
    - columns_to_copy (sequence of str): Columns of the first CSV to copy.
    - delimiter (str): Field delimiter used for both inputs and the output.

    Returns:
    - None
    """
    # Accept any sequence (list, tuple, ...) without mutating the caller's object.
    columns_to_copy = list(columns_to_copy)

    # Check if input files exist
    for path in (first_csv_path, second_csv_path):
        if not os.path.isfile(path):
            print(f"Error: The file '{path}' does not exist.")
            return

    # Read the first CSV and create a mapping from key to desired columns
    first_data = {}
    try:
        with open(first_csv_path, 'r', newline='', encoding='utf-8-sig') as first_file:
            reader = csv.DictReader(first_file, delimiter=delimiter)
            first_fieldnames = reader.fieldnames
            if first_fieldnames is None:
                print(f"Error: The first CSV file '{first_csv_path}' is empty or missing headers.")
                return
            if first_key_column not in first_fieldnames:
                print(f"Error: '{first_key_column}' not found in the first CSV file headers.")
                return
            for row in reader:
                key = row.get(first_key_column)
                if key is None:
                    continue
                first_data[key] = {col: row.get(col, "") for col in columns_to_copy}
    except Exception as e:
        print(f"Error reading '{first_csv_path}': {e}")
        return

    # Read the second CSV and merge data
    try:
        with open(second_csv_path, 'r', newline='', encoding='utf-8-sig') as second_file:
            reader = csv.DictReader(second_file, delimiter=delimiter)
            second_fieldnames = reader.fieldnames
            if second_fieldnames is None:
                print(f"Error: The second CSV file '{second_csv_path}' is empty or missing headers.")
                return
            if second_key_column not in second_fieldnames:
                print(f"Error: '{second_key_column}' not found in the second CSV file headers.")
                return

            # Insert new columns after the first column
            insert_position = 1
            new_fieldnames = (
                second_fieldnames[:insert_position]
                + columns_to_copy
                + second_fieldnames[insert_position:]
            )
            no_match = {col: "" for col in columns_to_copy}

            # Only now, with both inputs validated, create the output file.
            with open(output_csv_path, 'w', newline='', encoding='utf-8') as output_file:
                writer = csv.DictWriter(output_file, fieldnames=new_fieldnames, delimiter=delimiter)
                writer.writeheader()

                for row in reader:
                    key = row.get(second_key_column)
                    if key is None:
                        # Short row without a key: pass it through unchanged.
                        writer.writerow(row)
                        continue

                    additional_data = first_data.get(key, no_match)
                    new_row = {**row}
                    for col in columns_to_copy:
                        new_row[col] = additional_data.get(col, "")
                    # Restrict to the known header so stray fields cannot break the writer.
                    writer.writerow({field: new_row.get(field, "") for field in new_fieldnames})

    except Exception as e:
        print(f"Error processing files: {e}")
        return

    print(f"Files have been successfully merged into '{output_csv_path}'.")

Perform systems biology analysis: gene regulatory network

In [6]:
def perform_analysis(file_path, output_cleaned_file, report_file):
    """
    Interactively filters a CSV file on one numeric column and writes a report.

    The function lists the column names, asks the user (via input()) for a
    column and a threshold, keeps the rows whose value in that column is
    strictly below the threshold, drops every remaining row that contains a
    missing value, and saves both the cleaned rows and a plain-text summary.

    Invalid user input (unknown column, non-numeric column, unparsable
    threshold) is reported with a printed message and the function returns
    without writing any file. Errors raised while reading `file_path` are
    not caught here.

    Parameters:
    - file_path (str): Path to the input CSV file.
    - output_cleaned_file (str): Path to save the cleaned filtered data.
    - report_file (str): Path to save the analysis report.

    Returns:
    - None
    """
    data = pd.read_csv(file_path)

    # Get the number of rows and columns
    num_rows, num_columns = data.shape

    # Print column names
    column_names = data.columns.tolist()
    print("Column Names:")
    for col in column_names:
        print(f"- {col}")

    # Take column name and threshold as input from the user
    column_name = input("Enter the column name for analysis: ").strip()
    if column_name not in column_names:
        print(f"Error: Column '{column_name}' does not exist in the data.")
        return

    column = data[column_name]
    # Statistics and the threshold comparison only make sense for numbers;
    # report this like the other input errors instead of failing mid-way.
    if not pd.api.types.is_numeric_dtype(column):
        print(f"Error: Column '{column_name}' is not numeric and cannot be analysed.")
        return

    # Calculate statistics for the column
    mean_value = column.mean()
    median_value = column.median()
    modes = column.mode()  # may hold several values; the first one is reported
    mode_value = modes.iloc[0] if not modes.empty else None
    std_dev = column.std()
    min_value = column.min()
    max_value = column.max()

    print(f"Mean: {mean_value}\nMax: {max_value}")

    try:
        threshold = float(input("Enter the threshold value: ").strip())
    except ValueError:
        print("Invalid threshold value.")
        return

    # Filter the data based on the input threshold (strictly below)
    filtered_data = data[column < threshold]
    filtered_num_rows = filtered_data.shape[0]

    # Identify and remove rows with any blank (NaN) values
    blank_rows = int(filtered_data.isna().any(axis=1).sum())
    filtered_data_cleaned = filtered_data.dropna()

    # Save the cleaned filtered data to a new CSV file
    filtered_data_cleaned.to_csv(output_cleaned_file, index=False)

    # Generate a report and save it to a text file
    report = f"""
CSV File Analysis Report
------------------------

Number of Columns: {num_columns}
Number of Rows: {num_rows}

Column Names:
{', '.join(column_names)}

Selected Column: {column_name}

Statistics for {column_name}:
- Mean: {mean_value}
- Median: {median_value}
- Mode: {mode_value}
- Standard Deviation: {std_dev}
- Minimum: {min_value}
- Maximum: {max_value}

Filtering Data:
- Threshold: {threshold}
- Rows after filtering: {filtered_num_rows}

Blank Rows:
- Number of Rows with Blanks: {blank_rows}
- Total Rows after removing blank rows: {filtered_data_cleaned.shape[0]}

Cleaned Filtered Data saved to: {output_cleaned_file}
"""
    # Explicit encoding so non-ASCII column names cannot break the write.
    with open(report_file, 'w', encoding='utf-8') as report_f:
        report_f.write(report)

    print(f"Report generated and saved to '{report_file}'")
    print(f"Cleaned data saved to '{output_cleaned_file}'")

Transpose Matrix

In [7]:
def transpose_data(input_file, output_file, num_leading_columns=3):
    """
    Transposes the data from the input CSV file after removing leading columns.

    The first `num_leading_columns` columns are dropped, the rest is
    transposed, and the result is written without a header row. Because the
    index is written, each output line starts with the name of the original
    column it came from.

    Parameters:
    - input_file (str): Path to the input CSV file.
    - output_file (str): Path to save the transposed data.
    - num_leading_columns (int): Number of columns to drop from the left
      before transposing (default 3, the previous fixed behaviour).

    Raises:
    - ValueError: If `num_leading_columns` is negative (a negative slice
      start would silently keep only the last columns).
    """
    if num_leading_columns < 0:
        raise ValueError("num_leading_columns must be non-negative")

    data = pd.read_csv(input_file)
    # Drop the leading columns, then swap rows and columns
    transposed_data = data.iloc[:, num_leading_columns:].T
    # header=False omits the (meaningless) row-number header; index=True
    # keeps the original column names as the first field of every line.
    transposed_data.to_csv(output_file, header=False, index=True)
    print(f"Transposed data saved to '{output_file}'")

Adding the label

In [8]:
def add_target_column(transposed_file, target_file, output_file):
    """
    Adds a 'target' column to the transposed data based on the target genes.

    Each row gets target = 0 when its 'Gene.symbol' value (surrounding
    whitespace ignored) appears in the target file, and target = 1 otherwise.
    The 'Gene.symbol' column is then removed by name, so the result is
    correct wherever that column sits in the input.

    Parameters:
    - transposed_file (str): Path to the transposed CSV file; its header must
      contain a 'Gene.symbol' column.
    - target_file (str): Path to the CSV file containing target genes, read
      without a header row.
    - output_file (str): Path to save the labeled data.

    Raises:
    - KeyError: If the transposed file has no 'Gene.symbol' column.
    """
    # Load the transposed data
    df_transposed = pd.read_csv(transposed_file)
    # Load the target genes
    target_genes = pd.read_csv(
        target_file, header=None, names=['Gene.symbol']
    )['Gene.symbol'].str.strip()

    # Remove leading/trailing spaces before comparing
    gene_symbols = df_transposed['Gene.symbol'].str.strip()

    # Drop the identifier column by name rather than by position
    labeled = df_transposed.drop(columns=['Gene.symbol'])
    # 0 where the symbol is a target gene, 1 everywhere else
    labeled['target'] = (~gene_symbols.isin(target_genes)).astype(int)

    # Save the updated DataFrame
    labeled.to_csv(output_file, index=False)
    print(f"Labeled data saved to '{output_file}'")

main

In [9]:
# Dynamic dataset name
# Read interactively; surrounding whitespace is stripped. The value is
# interpolated into the file names that the following cell builds.
dataset_name = input("Enter the dataset name (e.g., 'GSE62792'): ").strip()

Enter the dataset name (e.g., 'GSE62792'): GSE20966


In [10]:
# File paths
# All paths are relative, so every file is read from and written to the
# current working directory.

# Inputs: the two text files for the chosen dataset.
deg_input_file = f'{dataset_name}_deg.txt'
metrix_input_file = f'{dataset_name}_series_matrix.txt'
# CSV versions of the two inputs.
deg_output_file = f'{dataset_name}_deg.csv'
metrix_output_file = f'{dataset_name}_series_matrix.csv'
# Series matrix with the columns copied over from the DEG table.
merged_output_file = f'{dataset_name}_series_matrix_with_pvalues.csv'
# Outputs of the analysis step.
cleaned_file_path = f'filtered_data_{dataset_name}.csv'
report_file_path = f'analysis_report_{dataset_name}.txt'
# Outputs of the transpose and labeling steps.
transposed_file = f'{dataset_name}_matrix_Transpose.csv'
labeled_output_file = f'{dataset_name}_matrix_Transpose_labeled.csv'
target_file = 'target.csv'  # Ensure this file exists in your working directory

In [11]:
# Step 1: Convert text files to CSV
# No delimiter is passed, so both inputs are parsed with the function's
# default tab delimiter. Conversion errors are printed, not raised, so check
# the output of this cell before continuing.
convert_txt_to_csv(deg_input_file, deg_output_file)
convert_txt_to_csv(metrix_input_file, metrix_output_file)

File 'GSE20966_deg.txt' has been successfully converted to 'GSE20966_deg.csv'.
File 'GSE20966_series_matrix.txt' has been successfully converted to 'GSE20966_series_matrix.csv'.


In [12]:
# Step 2: Merge CSV files
# Copies 'adj.P.Val', 'P.Value' and 'Gene.symbol' from the DEG CSV into the
# series-matrix CSV, matching the DEG 'ID' column against the matrix 'ID_REF'
# column. Failures are only printed, so verify the success message before
# running the next step.
merge_csv_files(
    first_csv_path=deg_output_file,
    second_csv_path=metrix_output_file,
    output_csv_path=merged_output_file,
    first_key_column='ID',
    second_key_column='ID_REF',
    columns_to_copy=['adj.P.Val', 'P.Value', 'Gene.symbol'],
    delimiter=','
)

Error: 'ID' not found in the first CSV file headers.


In [None]:
# Step 3: Perform data analysis
# Interactive: perform_analysis asks for a column name and a threshold via
# input(), keeps rows strictly below the threshold, and drops rows with
# missing values. It reads the merged file produced by Step 2.
perform_analysis(
    file_path=merged_output_file,
    output_cleaned_file=cleaned_file_path,
    report_file=report_file_path
)

EmptyDataError: No columns to parse from file

In [None]:
# Step 4: Transpose the data
# transpose_data drops the first three columns of the cleaned file before
# transposing, so the cleaned file from Step 3 must exist.
transpose_data(
    input_file=cleaned_file_path,
    output_file=transposed_file
)

Transposed data saved to 'GSE63878_matrix_Transpose.csv'


In [None]:
# Step 5: Add the target column
# Rows whose 'Gene.symbol' value appears in target.csv are labeled 0, all
# other rows 1; the labeled table is written to labeled_output_file.
add_target_column(
    transposed_file=transposed_file,
    target_file=target_file,
    output_file=labeled_output_file
)

Labeled data saved to 'GSE63878_matrix_Transpose_labeled.csv'
