# CARBonAra for sequence design

In [1]:
from carbonara import CARBonAra, imprint_sampling

## Generating sequences

In [2]:
# initialize carbonara model and define device for acceleration:
# use the GPU when CUDA is available, otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without a GPU
# NOTE(review): assumes CARBonAra accepts "cpu" as device_name — confirm
import torch

device_name = "cuda" if torch.cuda.is_available() else "cpu"
carbonara = CARBonAra(device_name=device_name)

### Simple sequence generation for monomers, multimers and with context

General sequence sampling, showing all required and optional arguments.

In [3]:
# sample sequences for the scaffold; returns the sampled sequences, their scores,
# the position-specific scoring matrix and the scaffold structure
sequences, scores, pssm, structure_scaffold = imprint_sampling(
    # required arguments
    carbonara=carbonara,                    # runtime for the carbonara model
    pdb_filepath="examples/pdbs/1zns.pdb",  # input scaffold structure filepath
    num_sample=100,                         # number of sequences to sample
    imprint_ratio=0.5,                      # ratio of the prediction to use as prior information for sampling
    # optional arguments
    b_sampled=True,                         # if true, samples from the predicted probabilities for more diversity; if false, uses maximum confidence for sampling
    known_chains=[],                        # list of known chains (e.g. ['A', 'B']) for partial sequence prediction
    known_positions=[],                     # list of known sequence positions (e.g. [37, 38, 39, 40]) for partial sequence prediction
    unknown_positions=[],                   # list of unknown sequence positions (e.g. [37, 38, 39, 40]); overwrites the other known flags
    ignore_hetatm=False,                    # flag to ignore hetatm in the structure
    ignore_wat=False,                       # flag to ignore water in the structure
)

100%|██████████| 100/100 [00:07<00:00, 13.02it/s]


Outputs of `imprint_sampling`:

* `sequences`: list of generated sequences
* `scores`: average confidence of the generated sequences
* `pssm`: position-specific scoring matrix (carbonara raw output with prior information)
* `structure_scaffold`: dictionary containing the structure information of the scaffold

## Partial sequence generation using prior sequence information

Sample sequences with chain `A` known to the model and fixed.

In [4]:
# sample sequences with chain 'A' known to the model and fixed
# (optional arguments that are not listed here are left at their defaults)
sequences, scores, pssm, structure_scaffold = imprint_sampling(
    # required arguments
    carbonara=carbonara,                            # runtime for the carbonara model
    pdb_filepath="examples/pdbs/2oob.pdb",          # input scaffold structure filepath
    num_sample=100,                                 # number of sequences to sample
    imprint_ratio=0.5,                              # ratio of the prediction to use as prior information for sampling
    # optional argument
    known_chains=['A'],                             # list of known chains (e.g. ['A', 'B']) for partial sequence prediction
)

100%|██████████| 100/100 [00:05<00:00, 18.15it/s]


Partial sequence generation can also be controlled with:
* `known_positions`: list of sequence positions from the reference structure that are known to the model and will be fixed in the output sequence
* `unknown_positions`: list of unknown sequence positions that will be sampled; the rest of the sequence(s) is treated as known and fixed. This option overrides the others, as it sets everything to known except the listed positions.

### Saving structure scaffold

The b-factor column of the scaffold marks each position as known (1.0) or unknown (0.0). This feature is useful to easily verify and debug partial sequence prediction.

In [5]:
from carbonara import save_pdb, split_by_chain

In [6]:
# split the scaffold returned by imprint_sampling by chain, then write it to a PDB file
scaffold_by_chain = split_by_chain(structure_scaffold)
save_pdb(scaffold_by_chain, "examples/pdbs/2oob_scaffold.pdb")

## Exporting the outputs

### Saving generated sequences and scores

In [7]:
from carbonara import write_fasta

In [8]:
# each sampled design is split by chain; joining the chains with ':' gives a
# single string in a format compatible with AlphaFold
designed_sequence = ':'.join(sequences[0])
print(f"sequence: {designed_sequence}")
print(f"score: {scores[0]:.3f}")

# save the first generated sequence to a fasta file
write_fasta(
    "examples/pdbs/2oob_example.fasta",
    designed_sequence,
    info="sequence generated for 2oob with chain A known",
)

sequence: LENVDAKIAKLMGEGYAFEEVKRALEIAQNNVEVARSILREFAF:MQIYVKTLDGKTITLDVEPSDTIENVKQKIQEKEGIPPDNQRLIYAGRELEDGRTLSDYNIQKNSTLYLVLR
score: 0.866


### Saving the Position-Specific Scoring Matrix

In [9]:
import pandas as pd
from carbonara import std_aminoacids

In [10]:
# tabulate the PSSM: one row per sequence position, one column per standard amino acid
df = pd.DataFrame(data=pssm, columns=std_aminoacids)
# show the values rounded to two decimals; df itself keeps the full-precision values
df.round(decimals=2)

Unnamed: 0,LEU,GLU,ARG,LYS,VAL,ILE,PHE,ASP,TYR,ALA,THR,SER,GLN,ASN,PRO,GLY,HIS,TRP,MET,CYS
0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00
1,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,1.00,0.0,0.00,0.00,0.00,0.00,0.00
3,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.0,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,0.00,0.00,0.00,0.00,0.00,0.00,0.52,0.0,0.99,0.00,0.00,0.00,0.01,0.01,0.0,0.00,0.74,0.02,0.01,0.03
112,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.01,0.00,0.00,0.01,0.00,0.0,0.02,0.00,0.00,0.45,0.01
113,0.00,0.00,0.00,0.00,1.00,0.01,0.00,0.0,0.00,0.01,0.02,0.00,0.00,0.01,0.0,0.00,0.00,0.00,0.00,0.19
114,0.97,0.16,0.03,0.02,0.15,0.64,0.02,0.0,0.01,0.00,0.06,0.00,0.10,0.00,0.0,0.00,0.01,0.01,0.14,0.01


In [11]:
# export the PSSM table to CSV; the rounded frame shown by df.round(2) was never
# assigned back, so the unrounded values are written (with the row index as the
# first column, pandas' default)
df.to_csv("examples/pdbs/2oob_example_pssm.csv")