# New clusters annotation

## Library imports

In [1]:
import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio.Seq import Seq
import glob

## Data

In [2]:
# Directory the K12 reference FASTA is read from.
DATA_FOLDER = "../Data/Sequences/Clusters reference sequences/"
# Root directory for everything this notebook writes.
OUTPUT_FOLDER = "../Data/New clusters annotations/"
# Sub-folder receiving the per-sequence query FASTA files.
OUTPUT_K12_GENES = f"{OUTPUT_FOLDER}K12/"
# Sub-folder the tabular search results are read back from.
OUTPUT_B6_FOLDER = f"{OUTPUT_FOLDER}output_files_b6/"

# K12 sequences (FASTA file).
K12_FILE = f"{DATA_FOLDER}K12.faa"

## Creating queries for vsearch searches

In [3]:
# Load every record of the K12 FASTA file into a dict keyed by record id.
queries = SeqIO.to_dict(SeqIO.parse(K12_FILE, "fasta"))

# Write each record to its own single-sequence FASTA file so that it can be
# used as an individual search query.
for record in queries.values():
    # The file name is the second piece of the record id once split on ".1_".
    query_name = record.id.split(".1_")[1]
    query_path = OUTPUT_K12_GENES + query_name + ".fasta"
    SeqIO.write([record], query_path, "fasta")

## Parsing vsearch outputs

In [5]:
# Labels for the 12 tab-separated fields of each search result file
# (presumably vsearch's BLAST-like tabular output -- confirm against the
# vsearch invocation, which is not part of this cell).
colnames = [
    "query sequence id",
    "subject sequence id",
    "percentage of identical matches",
    "alignment length",
    "number of mismatches",
    "number of gap openings",
    "start of alignment in query",
    " end of alignment in query",
    "start of alignment in subject",
    "end of alignment in subject",
    "expect value",
    "bit score",
]

# Keys of `queries` are looked up as this prefix followed by the gene id
# recovered from the result file name.
K12_ID_PREFIX = "lcl|CU651637.1_"

annotations_list = []

for b6_path in glob.glob(OUTPUT_B6_FOLDER + "*"):
    # Gene id = file name without its last three characters (assumed to be
    # the ".b6" extension).
    gene = b6_path.split("/")[-1][:-3]
    df = pd.read_csv(b6_path, sep="\t", names=colnames, header=None)

    record = queries[K12_ID_PREFIX + gene]
    gene_length = len(record.seq)

    # Remove the id prefix exactly once from the start of the description.
    # str.strip(prefix) must not be used here: it treats its argument as a
    # *set of characters* and trims them from both ends, which can eat
    # legitimate leading/trailing characters of the description.
    description = record.description
    if description.startswith(K12_ID_PREFIX):
        description = description[len(K12_ID_PREFIX):]

    # Optional "[gene=NAME]" tag in the description; empty string if absent.
    gene_name = ""
    if "[gene=" in description:
        gene_name = description.split("[gene=")[1].split("]")[0]

    # Keep only hits whose alignment ends, in the subject, before 1.1x the
    # query length.
    hits = df[df["end of alignment in subject"] < 1.1 * gene_length]

    # One output row per retained hit; the per-gene scalars are broadcast.
    # A hit counts as a "full sequence" when its alignment spans more than
    # 80% of the query length, otherwise it is treated as a fragment.
    annotations_list.append(
        pd.DataFrame(
            {
                "Gene id": gene,
                "Gene name": gene_name,
                "Description": description,
                "Gene length": gene_length,
                "Sequence id": hits["subject sequence id"],
                "Identical matches": hits["percentage of identical matches"],
                "Alignment length": hits["alignment length"],
                "Full sequence": hits["alignment length"] > 0.8 * gene_length,
            },
            index=hits.index,
        )
    )

result = pd.concat(annotations_list, ignore_index=True)
# Empty per-file frames can degrade the column dtype on concat; force bool so
# the column can be used as a mask below.
result["Full sequence"] = result["Full sequence"].astype("bool")

full_sequences = result[result["Full sequence"]]
duplicates_FS = full_sequences.duplicated(subset=["Sequence id"])

fragments = result[~result["Full sequence"]]
# A fragment is discarded when its sequence id was already seen among the
# fragments OR when that id also appears among the full sequences.
# Series.isin compares against the *values*; the `in` operator on a Series
# would test the index labels instead and never match a sequence id.
duplicates_Fr = (
    fragments.duplicated(subset=["Sequence id"])
    | fragments["Sequence id"].isin(full_sequences["Sequence id"])
)

fragments[~duplicates_Fr].to_csv(OUTPUT_FOLDER + "fragments_no_duplicates.csv", sep=";", index=False)
full_sequences[~duplicates_FS].to_csv(OUTPUT_FOLDER + "full_sequences_no_duplicates.csv", sep=";", index=False)