In [None]:
!pip install Bio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Bio
  Downloading bio-1.5.9-py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython>=1.80 (from Bio)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.0-py2.py3-none-any.whl (29 kB)
Installing collected packages: biopython, gprofiler-official, biothings-client, mygene, Bio
Successfully installed Bio-1.5.9 biopython-1.81 bio

In [None]:
import sys
from Bio import Entrez
import pandas as pd
import os
import scanpy as sc
import numpy as np

In [None]:
# Load the AnnData object used by every cell below.
# NOTE(review): the file name is spelled "cellcounrs" — confirm it matches the
# file on disk before "correcting" it.
cadata = sc.read("datasets/processed_cellcounrs.h5ad")

# **Create input files for R (Estimate)**

In [None]:
# Output location for the files handed over to R. os.makedirs(exist_ok=True)
# replaces the former `!mkdir R_time`, which printed an error on every re-run
# once the directory existed.
output_directory = "R_time"
os.makedirs(output_directory, exist_ok=True)
output_file = os.path.join(output_directory, "input_Estimate_format.txt")

# Transpose the expression matrix so that rows follow var_names and columns
# follow obs_names
expression_matrix = cadata.X.T
# A sparse matrix row cannot be turned into a tab-joined string; densify first
if hasattr(expression_matrix, "toarray"):
    expression_matrix = expression_matrix.toarray()

# Get the gene and cell names
gene_names = cadata.var_names
cell_names = cadata.obs_names

# Header line: the literal "gene" followed by every cell name
header = "gene\t" + "\t".join(cell_names)

# Stream the table to disk one line at a time instead of building the whole
# file in memory. The newline is written *before* each data line, so the file
# content is the same as "\n".join(all_lines) (no trailing newline).
with open(output_file, "w") as file:
    file.write(header)
    for gene_name, row in zip(gene_names, expression_matrix):
        file.write("\n" + gene_name + "\t" + "\t".join(row.astype(str)))

# **Create input files for R (Genefu)**

In [None]:
# Positions of the observations whose obs['class'] equals 'Tumor'
tumor_indices = np.flatnonzero(cadata.obs['class'] == 'Tumor')
# Rows of the expression matrix at those positions (all columns kept)
expr_data = cadata.X[tumor_indices]

In [None]:
# Tumor-only expression values as comma-separated text
np.savetxt('R_time/gene_expression.csv', expr_data, delimiter=',')

# Gene identifiers, one per row, under a 'GeneID' header
features = pd.DataFrame({'GeneID': cadata.var_names})
features.to_csv('R_time/gene_names.csv', index=False)

# Names of the selected (tumor) observations as a one-dimensional str array,
# in the same order as tumor_indices
cell_names_arr = cadata.obs_names.to_numpy()[tumor_indices].astype(str)

# Same layout as the gene file: a single 'CellID' column
barcodes = pd.DataFrame({'CellID': cell_names_arr})
barcodes.to_csv('R_time/cell_names.csv', index=False)

# **API for Entrez ID**

In [None]:
# Contact address sent to NCBI with every Entrez request.
# Set the ENTREZ_EMAIL environment variable to use your own address; the
# original hard-coded one is kept only as the fallback.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "s.alvanakis@gmail.com")

In [None]:
# Re-load the gene list written earlier in this notebook (single 'GeneID' column)
dfs = pd.read_csv('R_time/gene_names.csv')

In [None]:
# Plain Python list of the values in the 'GeneID' column, in file order
genes_list = dfs["GeneID"].tolist()

In [None]:
def get_entrez_gene_id(gene_name, organism=None):
    """Look up a gene in the NCBI "gene" database and return its first hit.

    Parameters
    ----------
    gene_name : str
        Search term sent to Entrez ESearch.
    organism : str, optional
        When given, the query becomes
        ``"<gene_name>[Gene Name] AND <organism>[Organism]"`` so that hits
        from other species are excluded. With the default ``None`` the term
        is sent unchanged.

    Returns
    -------
    str or None
        The first entry of the result's ``IdList``, or ``None`` when the
        search returns no IDs or ``gene_name`` is not a non-blank string.

    Raises
    ------
    Whatever ``Entrez.esearch`` / ``Entrez.read`` raise (e.g. on network
    failure); the handle is closed before the exception propagates.
    """
    # Values read back from CSV can be NaN (pandas parses symbols such as
    # "NA" as missing); there is nothing meaningful to search for then.
    if not isinstance(gene_name, str) or not gene_name.strip():
        return None

    if organism is None:
        term = gene_name
    else:
        term = f"{gene_name}[Gene Name] AND {organism}[Organism]"

    handle = Entrez.esearch(db="gene", term=term)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails
        handle.close()

    id_list = record["IdList"]
    return id_list[0] if id_list else None


In [None]:
# Will hold one Entrez ID (or None) per entry of genes_list, in the same order
entrez_gene_ids = []

In [None]:
# Start from an empty list inside this cell so that re-running it does not
# append a second copy of the results (which would make the list longer than
# the DataFrame it is later assigned to).
entrez_gene_ids = []
# One Entrez request per gene; results stay aligned with genes_list, with
# None where the lookup returned nothing.
for gene_name in genes_list:
    entrez_gene_ids.append(get_entrez_gene_id(gene_name))


In [None]:
# Attach the looked-up IDs as a new column; the list must have exactly one
# entry per row of dfs, otherwise pandas raises a length-mismatch error
dfs['Entrez_ID'] = entrez_gene_ids

In [None]:
# Write the GeneID -> Entrez_ID table to the working directory (comma-separated, no index column)
dfs.to_csv('Entrez_id.csv', index=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>