# ANARCI CDR Annotation

## Imports

In [5]:
import os
import sys
from anarci import anarci
import numpy as np
import pandas as pd

## Data Load

In [42]:
# Load the partitioned dataset: the first CSV row supplies the column names
# and the first CSV column is used as the row index.
raw_dataset_path = '../data/raw/dataset_w_partition.csv'
data = pd.read_csv(raw_dataset_path, index_col=0, header=0)
display(data)

Unnamed: 0,Antibody Name,Epitope Group,source,D614G,BA.1,BA.2,BA.2.75,BA.5,BQ.1.1,XBB,Heavy chain V gene,Heavy chain J gene,Light chain V gene,Light chain J gene,Heavy chain AA,Light chain AA,clus_vh,clus_vl,connected,partition
0,BD-196,C,WT convalescents,1.93,>10,>10,>10,>10,>10,>10,IGHV1-3,IGHJ6,IGKV2-58,IGKJ1,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYAMHWVRQAPGQRLE...,DVVMTQSPLSLPVTLGQPASISCRSSQSLVYSDGNTYLNWFQQRPG...,1442,682,0,Validation
1,BD-218,A,WT convalescents,0.0104,0.00849,0.00937,0.0116,5.26,>10,>10,IGHV4-34,IGHJ1,IGKV3-11,IGKJ5,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSDYFWYWIRQPPGKGLE...,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...,1443,508,1,Train
2,BD-236,A,WT convalescents,0.0173,>10,>10,>10,>10,>10,>10,IGHV3-53,IGHJ6,IGKV1-9,IGKJ4,EVQLVESGGGLIQPGGSLRLSCAASGITVSSNYMSWVRQAPGKGLE...,IQLTQSPSSLSASVGDRVTITCRASQGISSYLAWYQQKPGKAPKLL...,595,591,1,Test
3,BD-254,C,WT convalescents,0.25,>10,>10,>10,>10,>10,>10,IGHV3-23,IGHJ4,IGLV7-46,IGLJ2,EVQLLESGGGLVQPGGSLRLSCAASGFTLSSYAMSWVRQAPGRGLE...,QAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQKPGQAP...,1444,683,2,Validation
4,BD-255,B,WT convalescents,0.113,>10,>10,>10,>10,>10,>10,IGHV3-30,IGHJ6,IGKV3-15,IGKJ2,QVQLVESGGGVVQPGRSLRLSCVASGFTFSNYDMHWVRQAPGKGLE...,EIVMTQSPAILSVSPGERATLSCRASQSVTRNLAWYQQKPGQAPRL...,1450,684,3,Validation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3046,BD55-6726,E2.2,SARS convalescents,>10,>10,>10,>10,>10,>10,>10,IGHV1-58,IGHJ2,IGKV3-24,IGKJ1,QMQLVQSGPEVKRPGTSVKVSCEASGFTFSSSAILWVRQPRGQRLE...,EIVLRQSPATVSLSPGERATLSCRASQSVHNYLAWFQQKPGQAPRL...,2157,913,807,Validation
3047,BD55-6727,F2,SARS convalescents,0.293,4.37,>10,>10,>10,>10,>10,IGHV3-11,IGHJ1,IGKV1-39,IGKJ2,QVQLVESGGGLVKPGGSLRLSCAASGFTFSDYYMNWIRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQSISNYLNWYQQKPGKAPKL...,2158,456,1,Train
3048,BD55-6728,F1,SARS convalescents,>10,>10,>10,>10,>10,>10,>10,IGHV3-64,IGHJ2,IGKV1-39,IGKJ1,EVQLVESGGGLVQPGGSLRLSCAASGFTFSRFAMHWVRQAPGKGLE...,DIQMTQSPSSLSASLGDRVTITCRASQTISRYLNWYQQKPGKAPKV...,2159,456,1,Train
3049,BD55-6729,F1,SARS convalescents,>10,>10,>10,>10,>10,>10,>10,IGHV1-18,IGHJ6,IGLV3-25,IGLJ2,QVQLVQSGAEVKKPGTSMRVSCKASGYTFSTYGIIWVRQAPGQGLE...,SYELTQPPSVSVSPGQTARITCSGDAVATQFLYWYQQKSGQAPVMV...,2160,914,808,Validation


## Run ANARCI

In [43]:
# Build (identifier, sequence) pairs for each chain. The identifier is the
# antibody name plus a chain suffix so heavy and light entries stay distinct.
# NOTE(review): the key 'Antibody  Name' contains two spaces — confirm it
# matches the CSV header exactly.
heavy_seqs = [
    (antibody + '_H', heavy_aa)
    for antibody, heavy_aa in data[['Antibody  Name', 'Heavy chain AA']].itertuples(index=False, name=None)
]
light_seqs = [
    (antibody + '_L', light_aa)
    for antibody, light_aa in data[['Antibody  Name', 'Light chain AA']].itertuples(index=False, name=None)
]

In [44]:
# Number both chain sets with identical ANARCI settings (IMGT scheme,
# output=False as in the original call).
anarci_options = dict(scheme="imgt", output=False)
heavy_results = anarci(heavy_seqs, **anarci_options)
light_results = anarci(light_seqs, **anarci_options)

In [46]:
# Split each ANARCI result into its three components:
# numbering, alignment details and hit tables.
(
    hc_numbering,
    hc_alignment_details,
    hc_hit_tables,
) = heavy_results
(
    lc_numbering,
    lc_alignment_details,
    lc_hit_tables,
) = light_results

## Extract CDR regions

In [47]:
## Function from ChatGPT

def extract_cdr_regions(imgt_output, chain_type="H", keep_gaps=True):
    """
    Extracts CDR regions from IMGT numbering output from ANARCI.

    The IMGT unique numbering places the CDRs at the same positions for every
    V-domain (CDR1: 27-38, CDR2: 56-65, CDR3: 105-117), so heavy and light
    chains share one set of boundaries. ``chain_type`` is still validated so
    existing callers keep the same interface and error behaviour.

    Parameters:
        imgt_output (list): A list of tuples representing the IMGT numbering output.
                            Each tuple is in the format ((position, insertion), residue).
        chain_type (str): The chain type, "H" for heavy chain or "L" for light chain.
        keep_gaps (bool): If True (default), every entry inside a CDR is kept,
                          including '-' placeholder entries. If False, entries
                          whose residue is '-' are skipped.

    Returns:
        dict: A dictionary with keys 'CDR1', 'CDR2', and 'CDR3', where values are
              strings of residues in each region (empty string if none found).

    Raises:
        ValueError: If chain_type is neither "H" nor "L".
    """
    # IMGT CDR boundaries (inclusive positions 27-38, 56-65, 105-117).
    imgt_cdr_ranges = {
        "CDR1": range(27, 39),
        "CDR2": range(56, 66),
        "CDR3": range(105, 118),
    }
    # Both chain types map to the same IMGT definitions; the previous "L"
    # entry used non-IMGT positions, which are wrong for IMGT-numbered input.
    cdr_ranges = {"H": imgt_cdr_ranges, "L": imgt_cdr_ranges}

    # Check if the chain type is valid
    if chain_type not in cdr_ranges:
        raise ValueError("Invalid chain type. Use 'H' for heavy chain or 'L' for light chain.")

    cdr_definitions = cdr_ranges[chain_type]

    # Collect residues per CDR in lists and join once at the end.
    collected = {cdr: [] for cdr in cdr_definitions}

    for (position, _insertion), residue in imgt_output:
        if not keep_gaps and residue == "-":
            continue
        for cdr, cdr_range in cdr_definitions.items():
            if position in cdr_range:
                collected[cdr].append(residue)
                break  # ranges are disjoint, so a position belongs to one CDR at most

    return {cdr: "".join(residues) for cdr, residues in collected.items()}


def add_cdr_to_df(df, numbering, chain):
    """
    Return a copy of ``df`` with three CDR columns added for one chain.

    Rows are matched to ``numbering`` by position (first row with first entry,
    and so on), so ``df`` must be in the same order as the sequences that
    produced ``numbering``. The input frame is not modified.

    Parameters:
        df (pd.DataFrame): Table with one row per sequence.
        numbering (list): One entry per sequence. ``entry[0][0]`` of each entry
                          is passed to ``extract_cdr_regions``. An entry that
                          is ``None`` or empty yields missing values for that row.
        chain (str): Label inserted into the new column names
                     (e.g. 'H' gives CDRH1, CDRH2, CDRH3). It only names the
                     columns; the extraction call uses the default chain type
                     of ``extract_cdr_regions``.

    Returns:
        pd.DataFrame: ``df`` plus the columns CDR{chain}1, CDR{chain}2, CDR{chain}3.

    Raises:
        ValueError: If ``numbering`` and ``df`` have different lengths.
    """
    cdr_keys = ["CDR1", "CDR2", "CDR3"]
    numbering = list(numbering)

    if len(numbering) != len(df):
        raise ValueError(
            f"numbering has {len(numbering)} entries but df has {len(df)} rows; "
            "they must correspond one-to-one."
        )

    cdr_list = []
    for seq_numbering in numbering:
        if not seq_numbering:
            # No numbering available for this sequence: keep the row, leave CDRs missing.
            cdr_list.append(dict.fromkeys(cdr_keys))
            continue
        cdr_list.append(extract_cdr_regions(seq_numbering[0][0]))

    # Explicit columns keep the frame well-formed even when cdr_list is empty.
    cdr_df = pd.DataFrame(cdr_list, columns=cdr_keys)
    cdr_df.columns = [f'CDR{chain}{i+1}' for i in range(3)]

    # Merge on a fresh positional index so arbitrary df indexes cannot
    # misalign or drop rows, then restore the caller's index.
    merged = pd.merge(df.reset_index(drop=True), cdr_df, left_index=True, right_index=True)
    merged.index = df.index

    return merged

In [48]:
# Add heavy-chain CDR columns first, then light-chain ones, rebinding `data`
# after each step. Note: this cell reassigns `data`, so re-running it operates
# on the already-annotated frame.
for chain_label, chain_numbering in (('H', hc_numbering), ('L', lc_numbering)):
    data = add_cdr_to_df(data, chain_numbering, chain=chain_label)

## Save DataFrame

In [49]:
# Write the annotated table to disk; index=False leaves the row index out of the file.
processed_dataset_path = '../data/processed/dataset_w_cdr_annotation.csv'
data.to_csv(processed_dataset_path, index=False)