# Annotate Seurat's "FindMarkers" upregulated genes using GPT4 LLM

Function calling lets us receive the model's answer as parsable, JSON-structured data.

### Add openai's python module

In [1]:
!pip install -q openai

In [2]:
import os
import json
import pandas as pd
from getpass import getpass

import openai

### Get the user's API key

In [3]:
# Ask for the key interactively; getpass does not echo the typed value.
api_key = getpass(prompt='Enter your OPENAI API key: ')

# Publish the key through the process environment as OPENAI_API_KEY.
os.environ.update({'OPENAI_API_KEY': api_key})

Enter your OPENAI API key:  ········


### Read the markers table

In this example the row names were saved when the table was exported from R, so the first CSV column is read as the index.

In [4]:
# Location of the exported markers table; its first column is used as the index.
markers_csv_path = '/sbgenomics/project-files/scRNAseq_processing/seurat_scRNA_filtered_downsampled.harmonyJLW.markers.csv'
markers = pd.read_csv(markers_csv_path, index_col=0)

In [5]:
# Preview the first five rows of the markers table.
markers.head(n=5)

Unnamed: 0,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,cluster,gene
chr3:93470147-93471055,0.0,4.395002,0.356,0.06,0.0,0,chr3:93470147-93471055
CD96,0.0,1.866024,0.375,0.152,0.0,0,CD96
SLFN12L,0.0,1.818384,0.317,0.132,0.0,0,SLFN12L
LINC01934,0.0,1.678011,0.276,0.114,0.0,0,LINC01934
CD247,0.0,1.598508,0.27,0.109,0.0,0,CD247


### Set parameters for genes that will inform clusters

In [6]:
# Number of top-ranked genes kept per cluster.
n_genes = 20
# Columns used to rank the markers; paired element-wise with orderby_ascending.
orderby_columns = ['avg_log2FC','p_val']
# Sort direction for each column above: avg_log2FC descending, p_val ascending.
orderby_ascending = [False,True]
# Markers are kept only when p_val_adj is strictly below this value.
max_p_value_adj = 0.05
# Minimum avg_log2FC threshold (meaning taken from the name; confirm where it is applied).
min_avg_log2FC = 0.25
# Genes whose name contains any of these strings are dropped
# (e.g. coordinate-style names such as 'chr3:93470147-93471055').
drop_gene_strings = ['chr']

In [7]:
# Rank the markers according to orderby_columns / orderby_ascending so that
# taking the head of each cluster later yields its top-ranked genes.
ordered_markers = markers.sort_values(orderby_columns,ascending=orderby_ascending).reset_index(drop=True)

# Remove genes whose name contains any of the excluded strings.
for ds in drop_gene_strings:
    ordered_markers = ordered_markers.loc[~ordered_markers['gene'].str.contains(ds),:]

# Keep only significant markers that also clear the fold-change threshold.
# min_avg_log2FC is declared with the other parameters and is applied here.
ordered_markers = ordered_markers.loc[
    (ordered_markers['p_val_adj'] < max_p_value_adj)
    & (ordered_markers['avg_log2FC'] >= min_avg_log2FC),
    :,
]

# Cluster ids that still have at least one marker after filtering.
cluster_ids = sorted(ordered_markers['cluster'].unique())
print("Number of clusters:")
print(len(cluster_ids))

# Map each cluster id to the names of its first n_genes ranked genes.
cluster_obj = {
    cluster_id: list(
        ordered_markers.loc[ordered_markers['cluster'] == cluster_id, 'gene'].head(n=n_genes)
    )
    for cluster_id in cluster_ids
}

Number of clusters:
30


### Set up our GPT function

In [8]:
# Chat model name used for the annotation requests.
GPT_MODEL = "gpt-4"


# Cell type labels the model is allowed to choose from.
cell_types_allowed = ["TUFT", "ENDOCRINE", "B/PLASMA", "GRANULOCYTE", "EPITHELIAL", "ACINAR", "MYELOID", "T/NK", "PERICYTE", "FIBROBLASTS", "SMOOTH MUSCLE", "DENDRITIC", "ENDOTHELIAL", "MAST CELL", "RED BLOOD CELL"]

# Reference prompt text: example marker genes for each category (labelled with the
# same names as cell_types_allowed) followed by the list of allowed labels.
# NOTE(review): `context` is not referenced by any other cell visible in this
# notebook -- confirm whether it was meant to be included in the prompt.
context = """
Given these rough cell type categories and a few example genes:
'POU2F3','VAV1', #TUFT
                                           'INSR','CHGA',"TTR", #ENDOCRINE
                                           'CD79A','MS4A1','IGJ','IGLL5', #B/PLASMA
                                           "G0S2", 'CXCR2',"CXCR1", #GRANULOCYTE
                                           'KRT18','KRT8','TFF1','KRT19','SPINK1', #EPITHELIAL
                                           "PRSS1",'AMY2A','CTRB2','REG1A',#ACINAR
                                           'LYZ','APOE', 'HLA-DRA', 'C1QA','CD14', "CD68", #MYELOID
                                           'CD2','CD3D','NKG7','NCAM1','CD8A','CD4','FOXP3', #T/NK
                                           'IGFBP7','ACTA2','RGS5', #PERICYTE
                                           'COL1A1','DCN','LUM','CDH11', #FIBROBLASTS
                                           'TAGLN','MYL9', #SMOOTH MUSCLE
                                           'IRF7','GZMB','CCL22','LAMP3',"HLA-DQA1", #DENDRITIC
                                           'PLVAP','VWF', #ENDOTHELIAL
                                           'TPSAB1','CPA3', #MAST CELL
                                           'HBB','HBA2' #RED BLOOD CELL

We want to classify some clusters as being either
TUFT, ENDOCRINE, B/PLASMA, GRANULOCYTE, EPITHELIAL, ACINAR, MYELOID, T/NK, PERICYTE, FIBROBLASTS, SMOOTH MUSCLE, DENDRITIC, ENDOTHELIAL, MAST CELL, or RED BLOOD CELL (just one that is the best match)
""".strip()

In [9]:
# Function-calling schema: a single `cluster_assignment` function whose arguments
# carry the list of supporting genes and the chosen cell type label.
tools = [
  {
    "type": "function",
    "function": {
      "name": "cluster_assignment",
      "description": "Relay the decision on which cell type the genes provided best describe.",
      "parameters": {
        "type": "object",
        "properties": {
          # One entry per informative gene, each with a stated reason.
          "evidence": {
            "type": "array",
            "items": {
              "type": "object",
              "properties": {
                "gene_name": {
                  "type": "string",
                  "description": "A gene that is informative for selecting the cell type"
                },
                "reason_to_use": {
                  "type": "string",
                  "description": "The reason this gene is informative for selecting the cell type"
                }
              },
              "required": [
                "gene_name",
                "reason_to_use"
              ]
            },
            "description": "A list of genes and their evidence that are helpful for determining cell type, do NOT list genes that are not helpful in the final cell type assignment."
          },
          # The answer is restricted to the allowed labels, with "UNKNOWN"
          # appended when it is not already among them.
          "cell_type_assignment": {
            "type": "string",
            "enum": cell_types_allowed + ([] if "UNKNOWN" in cell_types_allowed else ["UNKNOWN"]),
            "description": "The most likely cell type if known."
          }
        },
        "required": [
          "evidence",
          "cell_type_assignment"
        ]
      }
    }
  }
]



### Iterate through and label using GPT

In [10]:
def make_messages(glist):
    """Build the chat messages asking the model to annotate one cluster.

    Args:
        glist: Iterable of gene-name strings for the cluster.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: the
        system instructions, then the user request whose last line is the
        genes joined by ", ".
    """
    # The fallback label is spelled 'UNKNOWN' so that it matches the value
    # offered in the tool schema's enum.
    system_prompt = """
Your job is to determine which upregulated genes
from a list give the best evidence of a cluster representing a specific cell type from among a specific list of cell types.
Do not guess and take your time to reason the evidence for which genes work best.  Choose type 'UNKNOWN' if it's mixed or unknown.""".strip()
    user_prompt = """
For each cluster I present you will have a list of some highly upregulated genes for each cluster,
first say which genes are most informative for a cell type and why, then choose one most appropriate cell type for it:
                """.strip()
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt + "\n" + ", ".join(glist)},
    ]

In [11]:
# Pretty-print the tool schema for a visual check.
tools_as_json = json.dumps(tools, indent=2)
print(tools_as_json)

[
  {
    "type": "function",
    "function": {
      "name": "cluster_assignment",
      "description": "Relay the decision on which cell type the genes provided best describe.",
      "parameters": {
        "type": "object",
        "properties": {
          "evidence": {
            "type": "array",
            "items": {
              "type": "object",
              "properties": {
                "gene_name": {
                  "type": "string",
                  "description": "A gene that is informative for selecting the cell type"
                },
                "reason_to_use": {
                  "type": "string",
                  "description": "The reason this gene is informative for selecting the cell type"
                }
              },
              "required": [
                "gene_name",
                "reason_to_use"
              ]
            },
            "description": "A list of genes and their evidence that are helpful for determining cell type, do

In [12]:
# Ask the model to annotate every cluster and collect the structured answers.
client = openai.OpenAI()
cluster_assignment = {}
for i, glist in cluster_obj.items():
    # Build the prompt once and send that same object.
    use_messages = make_messages(glist)
    response = client.chat.completions.create(
        model=GPT_MODEL,
        messages=use_messages,
        tools=tools,
        # Force the reply to come back as a cluster_assignment function call.
        tool_choice={"type": "function", "function": {"name": "cluster_assignment"}},
    )
    try:
        # The function arguments arrive as a JSON string.
        assignment = json.loads(response.choices[0].message.tool_calls[0].function.arguments)
    except (IndexError, TypeError, AttributeError, ValueError) as err:
        # Missing choices / tool calls or malformed JSON (JSONDecodeError is a
        # ValueError): record the cluster as UNKNOWN instead of aborting the run.
        # Unlike a bare `except`, this lets KeyboardInterrupt through.
        print(f"Could not parse tool call for cluster {i}: {err!r}")
        assignment = {'evidence':[],'cell_type_assignment':'UNKNOWN'}
    print("------------")
    print(f"CLUSTER {i}")
    print(json.dumps(assignment,indent=2))
    cluster_assignment[str(i)] = assignment  # string keys for JSON compatibility

------------
CLUSTER 0
{
  "evidence": [
    {
      "gene_name": "CD247",
      "reason_to_use": "This gene encodes a T-cell specific surface glycoprotein involved in immune response."
    },
    {
      "gene_name": "THEMIS",
      "reason_to_use": "This gene plays a key role in the regulation of T-cell development."
    },
    {
      "gene_name": "BCL11B",
      "reason_to_use": "BCL11B is essential in the development of T cells."
    },
    {
      "gene_name": "FYN",
      "reason_to_use": "FYN is involved T-cell receptor signaling."
    },
    {
      "gene_name": "PTPRC",
      "reason_to_use": "PTPRC is involved in the T cell receptor signaling pathway, and its expression is known to increase during T-cell activation."
    },
    {
      "gene_name": "IKZF1",
      "reason_to_use": "IKZF1 regulates lymphoid development and function, notably in T-cell lineage."
    },
    {
      "gene_name": "STAT4",
      "reason_to_use": "STAT4 is involved in transmitting signals in the immu

In [13]:
# Persist the per-cluster assignments as indented JSON.
with open('/sbgenomics/output-files/cell_type_assignment2.gpt4.json','w') as out_handle:
    serialized = json.dumps(cluster_assignment, indent=2)
    out_handle.write(serialized)