## Extracting files

In [1]:
# Replace 'GSE98969_RAW.tar' with the path to your file
file_path = 'GSE98969_RAW.tar'

In [2]:
import tarfile


# Open the tar file
with tarfile.open(file_path, 'r') as tar:
    # Extract all files to the current directory or specify a path
    tar.extractall(path= file_path +'_extracted_files')

print("Extraction completed.")


Extraction completed.


In [127]:
import os
import gzip
import shutil
import pandas as pd

In [4]:
# !pip uninstall scanpy -y
# !pip install -q scanpy
# !pip install -q python-igraph
!pip install -q louvain

In [5]:
import pandas as pd
import os 
# Load metadata file (e.g., downloaded as 'sra_metadata.csv')
file_name = file_path.split('_')[0]
metadata_path = file_name + "_SraRunTable.csv"
metadata = pd.read_csv(metadata_path)  # Adjust delimiter if necessary, e.g., `sep="\t"` for TSV

# Preview the metadata
print(metadata.head())


          Run Assay Type  AvgSpotLen       Bases   BioProject     BioSample  \
0  SRR5570818    RNA-Seq          66  1787012832  PRJNA387079  SAMN07134144   
1  SRR5570819    RNA-Seq          66  1754542416  PRJNA387079  SAMN07134143   
2  SRR5570820    RNA-Seq          66  2025254748  PRJNA387079  SAMN07134095   
3  SRR5570821    RNA-Seq          66  1903373934  PRJNA387079  SAMN07134112   
4  SRR5570822    RNA-Seq          66  1897330314  PRJNA387079  SAMN07134111   

        Bytes Center Name Consent DATASTORE filetype  ...  Platform  \
0  2837501765         GEO  public   fastq,run.zq,sra  ...  ILLUMINA   
1  2787916655         GEO  public   fastq,run.zq,sra  ...  ILLUMINA   
2  3215106227         GEO  public   fastq,run.zq,sra  ...  ILLUMINA   
3  3023441918         GEO  public   fastq,run.zq,sra  ...  ILLUMINA   
4  3016457388         GEO  public   fastq,run.zq,sra  ...  ILLUMINA   

            ReleaseDate           create_date version Sample Name  \
0  2017-06-12T00:00:00Z  2017

In [29]:
metadata = metadata.drop(['chip_antibody', 'enrichment', 'tissue'], axis=1)

KeyError: "['chip_antibody', 'enrichment', 'tissue'] not found in axis"

In [50]:
# Prepare metadata for matching
metadata_filtered = metadata.set_index("Sample Name")

In [49]:
import os
import pandas as pd
import concurrent.futures
import gzip

# Define the directory containing the extracted .gz files
gz_directory = file_path + '_extracted_files'

# List all .gz files in the directory
files = [f for f in os.listdir(gz_directory) if f.endswith('.gz')]

def process_file(file_name):
    """Reads the extracted .txt file corresponding to the .gz file and returns metadata and DataFrame."""
    try:
        print(f"Processing: {file_name}")
        gz_path = os.path.join(gz_directory, file_name)
        txt_file = os.path.splitext(file_name)[0]
        txt_path = os.path.join(gz_directory, txt_file)

        # Decompress the .gz file to .txt
        with gzip.open(gz_path, 'rb') as f_in:
            with open(txt_path, 'wb') as f_out:
                f_out.write(f_in.read())

        # Read the .txt file into a DataFrame
        data = pd.read_csv(txt_path, sep='\t', index_col=0, on_bad_lines='skip')

        # Extract key for metadata dictionary
        key = os.path.splitext(txt_file)[0].split('_')[0]
        return key, data
    
    except Exception as e:
        print(f"❌ Failed to process {file_name}: {e}")
        return None  # Skip this file

# Dictionaries to store metadata and list to collect DataFrames
metadata_dict = {}
all_data = []

# Process the files in parallel using 3 workers
with concurrent.futures.ProcessPoolExecutor(max_workers=3) as executor:
    results = filter(None, executor.map(process_file, files))  # Filter out None results

    # Process the results
    for key, data in results:
        metadata_dict[key] = data.columns.tolist()
        all_data.append(data)

# Combine all the DataFrames along the column axis
combined_data = pd.concat(all_data, axis=1)
print("✅ All valid .txt files have been read and combined.")


Processing: GSM2629436_AB2431.txt.gz
Processing: GSM2629417_AB2333.txt.gz
Processing: GSM2629361_AB1556.txt.gz
Processing: GSM2629437_AB2432.txt.gz
Processing: GSM2629359_AB1554.txt.gz
Processing: GSM2629358_AB1553.txt.gz
Processing: GSM2629387_AB1730.txt.gz
Processing: GSM2629415_AB2331.txt.gz
Processing: GSM2629435_AB2430.txt.gz
Processing: GSM2629370_AB1713.txt.gz
Processing: GSM2629430_AB2425.txt.gz
Processing: GSM2629388_AB1731.txt.gz
Processing: GSM2629422_AB2338.txt.gz
❌ Failed to process GSM2629430_AB2425.txt.gz: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.
Processing: GSM2629442_AB2437.txt.gz
Processing: GSM2629355_AB1550.txt.gz


KeyboardInterrupt: 

In [51]:
# Create a DataFrame to store metadata information
metadata_to_add = pd.DataFrame(index=combined_data.columns)



In [52]:
combined_data.shape

(23429, 1923)

In [11]:
# Ensure metadata index is string
metadata_filtered.index = metadata_filtered.index.astype(str)

# Build the reverse mapping: for every column (cell barcode), map it to its file key
column_to_file = {
    col: file_name
    for file_name, cols in metadata_dict.items()
    for col in cols
}

# Create a DataFrame mapping each cell barcode in combined_data to its file key
mapping_df = pd.DataFrame({
    'col_name': combined_data.columns,
    'file_key': combined_data.columns.map(column_to_file)
})

# Reset metadata_filtered index to a column. If the index name isn't set, it defaults to "index".
metadata_df = metadata_filtered.reset_index()
current_key_name = metadata_filtered.index.name if metadata_filtered.index.name is not None else 'index'
metadata_df.rename(columns={current_key_name: 'file_key'}, inplace=True)

# Merge the mapping with the metadata
merged_df = mapping_df.merge(metadata_df, on='file_key', how='left')

# In case there are duplicates (multiple metadata rows per cell), take the first occurrence per cell barcode
merged_df = merged_df.groupby("col_name", as_index=False).first()

# Now set the index to the cell barcodes
metadata_to_add = merged_df.set_index("col_name").drop(columns=["file_key"])


In [12]:
metadata_to_add.shape

(37248, 30)

In [13]:
import scanpy as sc

In [14]:
# Remove duplicated columns (cells) from combined_data before creating AnnData
combined_data = combined_data.loc[:, ~combined_data.columns.duplicated(keep='first')]

In [15]:
# Create AnnData object
adata = sc.AnnData(combined_data.T.values)  # Transpose to match AnnData cell-by-gene format


In [16]:
# Set obs_names (sample IDs) and var_names (gene IDs)
adata.obs_names = combined_data.columns
adata.var_names = combined_data.index


In [17]:
print("adata.obs index duplicated:", adata.obs.index.duplicated().sum())
print("metadata_to_add index duplicated:", metadata_to_add.index.duplicated().sum())


adata.obs index duplicated: 0
metadata_to_add index duplicated: 0


In [18]:
# Now this should work
adata.obs = pd.concat([adata.obs, metadata_to_add], axis=1)


In [19]:
adata

AnnData object with n_obs × n_vars = 37248 × 34016
    obs: 'Run', 'Assay Type', 'AvgSpotLen', 'Bases', 'BioProject', 'BioSample', 'Bytes', 'Center Name', 'Consent', 'DATASTORE filetype', 'DATASTORE provider', 'DATASTORE region', 'Experiment', 'GEO_Accession (exp)', 'Instrument', 'LibraryLayout', 'LibrarySelection', 'LibrarySource', 'mouse_age', 'Organ', 'Organism', 'Platform', 'ReleaseDate', 'create_date', 'version', 'selection_marker', 'source_name', 'SRA Study', 'strain', 'treatment'

In [20]:
adata.obs

Unnamed: 0,Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,DATASTORE filetype,...,Organism,Platform,ReleaseDate,create_date,version,selection_marker,source_name,SRA Study,strain,treatment
W660497,SRR5570912,RNA-Seq,66,1727997546,PRJNA387079,SAMN07134176,2790797468,GEO,public,"fastq,run.zq,sra",...,Mus musculus,ILLUMINA,2017-06-12T00:00:00Z,2017-05-16T17:43:00Z,2,CD45+,Cortex,SRP107339,5XFAD,Alzheimer's disease
W660498,SRR5570912,RNA-Seq,66,1727997546,PRJNA387079,SAMN07134176,2790797468,GEO,public,"fastq,run.zq,sra",...,Mus musculus,ILLUMINA,2017-06-12T00:00:00Z,2017-05-16T17:43:00Z,2,CD45+,Cortex,SRP107339,5XFAD,Alzheimer's disease
W660499,SRR5570912,RNA-Seq,66,1727997546,PRJNA387079,SAMN07134176,2790797468,GEO,public,"fastq,run.zq,sra",...,Mus musculus,ILLUMINA,2017-06-12T00:00:00Z,2017-05-16T17:43:00Z,2,CD45+,Cortex,SRP107339,5XFAD,Alzheimer's disease
W660500,SRR5570912,RNA-Seq,66,1727997546,PRJNA387079,SAMN07134176,2790797468,GEO,public,"fastq,run.zq,sra",...,Mus musculus,ILLUMINA,2017-06-12T00:00:00Z,2017-05-16T17:43:00Z,2,CD45+,Cortex,SRP107339,5XFAD,Alzheimer's disease
W660501,SRR5570912,RNA-Seq,66,1727997546,PRJNA387079,SAMN07134176,2790797468,GEO,public,"fastq,run.zq,sra",...,Mus musculus,ILLUMINA,2017-06-12T00:00:00Z,2017-05-16T17:43:00Z,2,CD45+,Cortex,SRP107339,5XFAD,Alzheimer's disease
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
W625932,SRR5570900,RNA-Seq,66,1311093300,PRJNA387079,SAMN07134158,2140769744,GEO,public,"fastq,run.zq,sra",...,Mus musculus,ILLUMINA,2017-06-12T00:00:00Z,2017-05-16T17:23:00Z,2,CD45+,Whole brain,SRP107339,C57BL/6,Alzheimer's disease
W625933,SRR5570900,RNA-Seq,66,1311093300,PRJNA387079,SAMN07134158,2140769744,GEO,public,"fastq,run.zq,sra",...,Mus musculus,ILLUMINA,2017-06-12T00:00:00Z,2017-05-16T17:23:00Z,2,CD45+,Whole brain,SRP107339,C57BL/6,Alzheimer's disease
W625934,SRR5570900,RNA-Seq,66,1311093300,PRJNA387079,SAMN07134158,2140769744,GEO,public,"fastq,run.zq,sra",...,Mus musculus,ILLUMINA,2017-06-12T00:00:00Z,2017-05-16T17:23:00Z,2,CD45+,Whole brain,SRP107339,C57BL/6,Alzheimer's disease
W625935,SRR5570900,RNA-Seq,66,1311093300,PRJNA387079,SAMN07134158,2140769744,GEO,public,"fastq,run.zq,sra",...,Mus musculus,ILLUMINA,2017-06-12T00:00:00Z,2017-05-16T17:23:00Z,2,CD45+,Whole brain,SRP107339,C57BL/6,Alzheimer's disease


In [21]:
adata.var

0610005C13Rik
0610007C21Rik
0610007L01Rik
0610007P08Rik
0610007P14Rik
...
snoZ159
snoZ178
snoZ39
snoZ40
snosnR60_Z15


In [22]:
# Save the AnnData object
adata.write(file_name +"_GSM_preprocessed_with_metadata.h5ad")

print("Preprocessing complete. AnnData saved with metadata.")

Preprocessing complete. AnnData saved with metadata.


## For csv.gz file

In [128]:
import scanpy as sc
import pandas as pd
# Scanpy can directly read a CSV file (assuming the file is formatted as a proper matrix)
combined_data = pd.read_csv('GSE123025_Single_myeloid_1922_cells_processed_data.csv.gz', index_col=0)


In [160]:
metadata = pd.read_csv('GSE123025_SraRunTable.csv', index_col=0)

In [161]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1922 entries, SRR8250775 to SRR8252696
Data columns (total 39 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Assay Type            1922 non-null   object 
 1   AvgSpotLen            1922 non-null   int64  
 2   Bases                 1922 non-null   int64  
 3   BioProject            1922 non-null   object 
 4   BioSample             1922 non-null   object 
 5   Bytes                 1922 non-null   int64  
 6   Center Name           1922 non-null   object 
 7   Consent               1922 non-null   object 
 8   DATASTORE filetype    1922 non-null   object 
 9   DATASTORE provider    1922 non-null   object 
 10  DATASTORE region      1922 non-null   object 
 11  Experiment            1922 non-null   object 
 12  GEO_Accession (exp)   1922 non-null   object 
 13  Instrument            1922 non-null   object 
 14  LibraryLayout         1922 non-null   object 
 15  LibrarySele

In [164]:
# Remove any newline characters in each column
metadata["well_id"] = (
    metadata["well_id"]
    .astype(str)
    .str.replace(r"\n", "", regex=True)
    .str.strip()
)
metadata["plate_id"] = (
    metadata["plate_id"]
    .astype(str)
    .str.replace(r"\n", "", regex=True)
    .str.strip()
)

# Now concatenate them with a dot
metadata["Name"] = (
    metadata["well_id"] + "." + metadata["plate_id"]
)


In [165]:
# Prepare metadata for matching
metadata_filtered = metadata.set_index('Name')

In [169]:
metadata_filtered

Unnamed: 0_level_0,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,DATASTORE filetype,DATASTORE provider,...,pooled_library_names,qc_all_3_criteria,qc_detected_genes,qc_ercc_correlation,qc_total_counts,source_name,strain,tissue,total_counts,well_id
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C9.1001200107.0,RNA-Seq,150,168482850,PRJNA507279,SAMN10488168,60931001,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,MG_YS_lib2,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,HIP,484029.0,C9
B10.1001200106.0,RNA-Seq,150,245394750,PRJNA507279,SAMN10488167,89790040,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,MG_YS_lib2,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,OB,810207.0,B10
A8.1001200106.0,RNA-Seq,150,214195950,PRJNA507279,SAMN10488166,78644825,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,MG_YS_lib2,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,OB,714754.0,A8
D6.1001200107.0,RNA-Seq,150,226942050,PRJNA507279,SAMN10488165,82714715,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,MG_YS_lib2,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,HIP,716677.0,D6
B12.1001200106.0,RNA-Seq,150,229187700,PRJNA507279,SAMN10488164,82956576,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,MG_YS_lib2,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,OB,683013.0,B12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H8.1001200140.0,RNA-Seq,150,174234150,PRJNA507279,SAMN10490715,69638065,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,N496_MG_lib5_CP,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,CP,580160.0,H8
H8.1001200141.0,RNA-Seq,150,228931950,PRJNA507279,SAMN10490714,91144992,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,N496_MG_lib5_CP,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,CP,748144.0,H8
H9.1001200136.0,RNA-Seq,150,231320100,PRJNA507279,SAMN10490713,92742851,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,N496_MG_lib5_CP,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,CP,578801.0,H9
H9.1001200140.0,RNA-Seq,150,105890250,PRJNA507279,SAMN10490712,41710381,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,N496_MG_lib5_CP,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,CP,307216.0,H9


In [176]:
new_columns = []
for col in combined_data.columns:
    new_key = col + ".0"
    if new_key in metadata_filtered.index:
        # Get the new value from metadata
        new_value = metadata_filtered.loc[new_key, "Sample Name"]
        new_columns.append(new_value)
    else:
        new_columns.append(col)

# Replace the columns in combined_data with the new column names
combined_data.columns = new_columns


In [177]:
# Create a DataFrame to store metadata information
metadata_to_add = pd.DataFrame(index=combined_data.columns)


In [178]:
metadata_to_add

GSM3491382
GSM3491383
GSM3491384
GSM3491385
GSM3491386
...
GSM3493299
GSM3493300
GSM3493301
GSM3493302
GSM3493303


In [179]:
# Prepare metadata for matching
metadata_filtered = metadata.set_index('Sample Name')

In [180]:
metadata_dict = combined_data.columns.tolist()

In [182]:
# Iterate through obs_names and match with metadata_dict
for col_name in combined_data.columns:
    print(f"col_name : {col_name}")
    # Find the corresponding key in metadata_dict
    matched_key = None
    for columns in metadata_dict:
        if col_name in columns:
            matched_key = columns
            break
    
    # If a match is found, add metadata row
    if matched_key is not None:
        print(f"matched_key: {matched_key}")
        metadata_row = metadata_filtered.loc[matched_key]
        for column in metadata_row.index:
            metadata_to_add.at[col_name, column] = metadata_row[column]

col_name : GSM3491382
matched_key: GSM3491382
col_name : GSM3491383
matched_key: GSM3491383
col_name : GSM3491384
matched_key: GSM3491384
col_name : GSM3491385
matched_key: GSM3491385
col_name : GSM3491386
matched_key: GSM3491386
col_name : GSM3491387
matched_key: GSM3491387
col_name : GSM3491388
matched_key: GSM3491388
col_name : GSM3491389
matched_key: GSM3491389
col_name : GSM3491390
matched_key: GSM3491390
col_name : GSM3491391
matched_key: GSM3491391
col_name : GSM3491392
matched_key: GSM3491392
col_name : GSM3491393
matched_key: GSM3491393
col_name : GSM3491394
matched_key: GSM3491394
col_name : GSM3491395
matched_key: GSM3491395
col_name : GSM3491396
matched_key: GSM3491396
col_name : GSM3491397
matched_key: GSM3491397
col_name : GSM3491398
matched_key: GSM3491398
col_name : GSM3491399
matched_key: GSM3491399
col_name : GSM3491400
matched_key: GSM3491400
col_name : GSM3491401
matched_key: GSM3491401
col_name : GSM3491402
matched_key: GSM3491402
col_name : GSM3491403
matched_key:

KeyError: 'F6.1001200103'

In [183]:
# Create AnnData object
adata = sc.AnnData(combined_data.T.values)  # Transpose to match AnnData cell-by-gene format

# Set obs_names (sample IDs) and var_names (gene IDs)
adata.obs_names = combined_data.columns
adata.var_names = combined_data.index

# Combine metadata with obs
adata.obs = pd.concat([adata.obs, metadata_to_add], axis=1)

In [184]:
combined_data.columns

Index(['GSM3491382', 'GSM3491383', 'GSM3491384', 'GSM3491385', 'GSM3491386',
       'GSM3491387', 'GSM3491388', 'GSM3491389', 'GSM3491390', 'GSM3491391',
       ...
       'GSM3493294', 'GSM3493295', 'GSM3493296', 'GSM3493297', 'GSM3493298',
       'GSM3493299', 'GSM3493300', 'GSM3493301', 'GSM3493302', 'GSM3493303'],
      dtype='object', length=1922)

In [185]:
combined_data.index

Index(['0610005C13Rik', '0610007C21Rik', '0610007L01Rik', '0610007N19Rik',
       '0610007P08Rik', '0610007P14Rik', '0610007P22Rik', '0610008F07Rik',
       '0610009B14Rik', '0610009B22Rik',
       ...
       'Zxda', 'Zxdb', 'Zxdc', 'Zyg11a', 'Zyg11b', 'Zyx', 'Zzef1', 'Zzz3', 'a',
       'l7Rn6'],
      dtype='object', length=23429)

In [186]:
adata

AnnData object with n_obs × n_vars = 1922 × 23429
    obs: 'Assay Type', 'AvgSpotLen', 'Bases', 'BioProject', 'BioSample', 'Bytes', 'Center Name', 'Consent', 'DATASTORE filetype', 'DATASTORE provider', 'DATASTORE region', 'Experiment', 'GEO_Accession (exp)', 'Instrument', 'LibraryLayout', 'LibrarySelection', 'LibrarySource', 'Organism', 'Platform', 'ReleaseDate', 'SRA Study', 'create_date', 'version', 'AGE', 'detected_genes', 'gate', 'genotype', 'plate_id', 'pooled_library_names', 'qc_all_3_criteria', 'qc_detected_genes', 'qc_ercc_correlation', 'qc_total_counts', 'source_name', 'strain', 'tissue', 'total_counts', 'well_id', 'Name'

In [187]:
adata.obs

Unnamed: 0,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,DATASTORE filetype,DATASTORE provider,...,qc_all_3_criteria,qc_detected_genes,qc_ercc_correlation,qc_total_counts,source_name,strain,tissue,total_counts,well_id,Name
GSM3491382,RNA-Seq,150.0,168482850.0,PRJNA507279,SAMN10488168,60931001.0,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,HIP,484029.0,C9,C9.1001200107.0
GSM3491383,RNA-Seq,150.0,245394750.0,PRJNA507279,SAMN10488167,89790040.0,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,OB,810207.0,B10,B10.1001200106.0
GSM3491384,RNA-Seq,150.0,214195950.0,PRJNA507279,SAMN10488166,78644825.0,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,OB,714754.0,A8,A8.1001200106.0
GSM3491385,RNA-Seq,150.0,226942050.0,PRJNA507279,SAMN10488165,82714715.0,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,HIP,716677.0,D6,D6.1001200107.0
GSM3491386,RNA-Seq,150.0,229187700.0,PRJNA507279,SAMN10488164,82956576.0,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3",...,Y,Y,Y,Y,Brain myeloid cell,C57BL6N,OB,683013.0,B12,B12.1001200106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM3493299,,,,,,,,,,,...,,,,,,,,,,
GSM3493300,,,,,,,,,,,...,,,,,,,,,,
GSM3493301,,,,,,,,,,,...,,,,,,,,,,
GSM3493302,,,,,,,,,,,...,,,,,,,,,,


In [188]:
adata.var

0610005C13Rik
0610007C21Rik
0610007L01Rik
0610007N19Rik
0610007P08Rik
...
Zyx
Zzef1
Zzz3
a
l7Rn6


In [189]:
# Save the AnnData object
adata.write("GSE123025" +"_GSM_preprocessed_with_metadata.h5ad")

print("Preprocessing complete. AnnData saved with metadata.")

Preprocessing complete. AnnData saved with metadata.
