<a href="https://colab.research.google.com/github/Madhuanabala/capstone/blob/main/translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install biopython


Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [20]:
import csv
from Bio.Seq import Seq

def translate_dna_to_protein(dna_sequence, seq_type="cDNA", trim_to_multiple_of_3=False, pad_sequence=False):
    """
    Translates a DNA sequence to a protein sequence.

    The input is stripped of all whitespace and upper-cased, optionally
    adjusted so its length is a multiple of 3, and then translated with
    ``Seq.translate()`` called without arguments.

    Args:
        dna_sequence (str): The input DNA sequence. Whitespace of any kind
            (spaces, tabs, line breaks) is removed before translation.
        seq_type (str): The type of the sequence ("cDNA" or "gDNA"). Default is "cDNA".
            Accepted for interface compatibility only: the value is not read,
            so both types are translated identically.
        trim_to_multiple_of_3 (bool): Whether to drop trailing bases so the length
            becomes a multiple of 3 (default is False). Takes precedence over
            ``pad_sequence`` when both are set.
        pad_sequence (bool): Whether to append 'N' so the length becomes a
            multiple of 3 (default is False).

    Returns:
        str: The translated protein sequence.
    """
    # Remove every kind of whitespace (spaces, tabs, "\r", "\n"), not only
    # spaces and "\n": fields read from CSV files can carry tabs or Windows
    # line endings, which would otherwise end up inside the codons.
    dna_sequence = "".join(dna_sequence.split()).upper()

    leftover = len(dna_sequence) % 3
    if leftover:
        if trim_to_multiple_of_3:
            # Drop the incomplete trailing codon.
            dna_sequence = dna_sequence[:len(dna_sequence) - leftover]
            print(f"Trimmed the sequence to a length that is a multiple of 3: {dna_sequence}")
        elif pad_sequence:
            # Complete the trailing codon with unknown bases.
            dna_sequence += "N" * (3 - leftover)
            print(f"Padded the sequence to a length that is a multiple of 3: {dna_sequence}")
        else:
            # The sequence is passed on unchanged; only a notice is printed.
            print(f"Warning: The sequence length is not a multiple of 3. Translation may be incomplete for {dna_sequence}.")

    protein = Seq(dna_sequence).translate()

    return str(protein)

def read_sequences_from_csv(input_file):
    """
    Loads named DNA sequences from a CSV file.

    The file must have a header row containing the columns
    'Sequence Name' and 'Sequence'.

    Args:
        input_file (str): Path to the CSV file containing DNA sequences.

    Returns:
        list: One dictionary per data row, in file order, with the keys
            'name' (from the 'Sequence Name' column) and
            'sequence' (from the 'Sequence' column).
    """
    with open(input_file, mode='r') as handle:
        return [
            {'name': record['Sequence Name'], 'sequence': record['Sequence']}
            for record in csv.DictReader(handle)
        ]

def write_proteins_to_csv(protein_sequences, output_file):
    """
    Saves translated protein sequences as a two-column CSV file.

    A header row ('Sequence Name', 'Protein Sequence') is written first,
    followed by one row per entry. An existing file at the path is overwritten.

    Args:
        protein_sequences (list): Dictionaries keyed by 'Sequence Name' and
            'Protein Sequence', one per output row.
        output_file (str): Path to the output CSV file.
    """
    columns = ['Sequence Name', 'Protein Sequence']
    # newline='' leaves line endings to the csv writer.
    with open(output_file, mode='w', newline='') as handle:
        csv_out = csv.DictWriter(handle, fieldnames=columns)
        csv_out.writeheader()
        csv_out.writerows(protein_sequences)

def process_sequences(input_file, output_file, seq_type="cDNA", trim_to_multiple_of_3=False, pad_sequence=False):
    """
    Reads DNA sequences from a CSV file, translates them to protein, and writes the results to another CSV file.

    Rows whose sequence is missing, empty, or only whitespace are not
    translated; they are written with the placeholder "Invalid sequence".

    Args:
        input_file (str): Path to the input CSV file containing DNA sequences.
        output_file (str): Path to the output CSV file where protein sequences will be saved.
        seq_type (str): The type of the sequence ("cDNA" or "gDNA"). Default is "cDNA".
            Forwarded unchanged to translate_dna_to_protein.
        trim_to_multiple_of_3 (bool): Whether to trim the sequence to a multiple of 3.
        pad_sequence (bool): Whether to pad the sequence to a multiple of 3.
    """
    sequences = read_sequences_from_csv(input_file)

    protein_sequences = []
    for seq in sequences:
        # csv.DictReader yields None for fields missing from a short row;
        # treat that like an empty sequence instead of failing on .strip().
        raw_sequence = seq['sequence'] or ""
        if raw_sequence.strip():
            protein = translate_dna_to_protein(raw_sequence, seq_type, trim_to_multiple_of_3, pad_sequence)
        else:
            protein = "Invalid sequence"
        protein_sequences.append({'Sequence Name': seq['name'], 'Protein Sequence': protein})

    write_proteins_to_csv(protein_sequences, output_file)

# Input: CSV with 'Sequence Name' and 'Sequence' columns.
# NOTE(review): the absolute /content/ path looks Colab-specific — confirm the
# file has been uploaded there before running this cell.
input_csv_file = '/content/antibody sequences from imgt (CEA).csv'
# Output: relative path, so the results land in the current working directory.
output_csv_file = 'output_protein_sequences.csv'
# trim_to_multiple_of_3=True drops any trailing 1-2 bases so that only complete
# codons are translated; each trimmed sequence is echoed to the cell output.
process_sequences(input_csv_file, output_csv_file, seq_type="cDNA", trim_to_multiple_of_3=True)


Trimmed the sequence to a length that is a multiple of 3: CGCTGCTGAGTCTCAGGCTGAATGTGTTCTTGCTGGTGT
Trimmed the sequence to a length that is a multiple of 3: TGAGGAGACGGTGACCGTGGTCCCTTGGCCCCA
Trimmed the sequence to a length that is a multiple of 3: CGCTGCTGAGTCTCAGGTAGGCTGTGTTCTTGCTGGTGT
Trimmed the sequence to a length that is a multiple of 3: GCGGGCCTCTTCGCTATTACG
Trimmed the sequence to a length that is a multiple of 3: GGAAGCTTGAAGATGGATACAGTTGGTGCA
Trimmed the sequence to a length that is a multiple of 3: GAGGTTCAGCTGCAGCAGTCTGGGGCAGAGCTTGTGAGGTCAGGGGCCTCAGTCAAGATGTCCTGCACAGCTTCTGGCTTCAACATTAAAGACTACTATATGCACTGGGTGAAGCAGAGGCCTGAACAGGGCCTGGAGTGGATTGGATGGATTGATCCTGAGAATGGTGATACTGAATATGCCCCGAAGTTCCAGGGCAAGGCCACTATGACTACAGACACATCCTCCAACACAGCCTACCTGCAGCTCAGCAGCCTGACATCTGAGGACACTGCCGTCTATTACTGTAATACACGGGGTCTATCTACTATGATTACGACGCGTTGGTTCTTCGATGTCTGGGGCGCAGGGACCACGGTCGCCGTCTCCTCT
Trimmed the sequence to a length that is a multiple of 3: TTGAATTCGGAGCTGATGGGAACATTGTAA
Trimmed the sequence to