In [1]:
import os
import pandas as pd
import subprocess
import shutil
from tqdm import tqdm

# Prefixes of the two prediction sources. Each prefix names a FASTA file
# (../fastas/<name>.fasta) and a prediction table
# (../predictions/<name>_affinities.tsv) used by the cells below.
names = ["dG", "deep_preds"]

## Prepare directory structure

In [2]:
# Start from an empty FASTA directory: write_fasta opens its target in append
# mode, so files left over from an earlier run would receive duplicate records.
# The existence check keeps a fresh checkout (no ../fastas yet) from raising
# FileNotFoundError, while real removal errors still surface.
if os.path.isdir("../fastas"):
    shutil.rmtree("../fastas")
os.makedirs("../fastas", exist_ok=True)
os.makedirs("../predictions", exist_ok=True)
os.makedirs("../results", exist_ok=True)

## Helper functions

In [3]:
def parse_list(list_string: str) -> list:
    """Split the string form of a flat list (e.g. "['a', 'b']") into its items.

    The first and last character (the brackets) are dropped, every single
    quote is removed and the remainder is split on ", ". Items are always
    returned as strings; an empty list string yields an empty list.
    """
    inner = list_string[1:-1].replace("'", "")
    if not inner:
        return []
    return inner.split(", ")

def select_positions(list: list, positions: list) -> list:
    """Return the elements of ``list`` found at ``positions``, in the order given."""
    picked = []
    for position in positions:
        picked.append(list[position])
    return picked

In [4]:
# Summary sheet that row_by_refseq searches; skiprows=4 skips the first four
# rows of the sheet, so the fifth row supplies the column names.
summary_table = pd.read_excel("../data/TME-Analysis_Legionella-spp.xlsx", sheet_name="Philadelphia 1", skiprows=4)
def row_by_refseq(refseq: str) -> pd.DataFrame:
    """Return the rows of ``summary_table`` whose "RefSeq" value equals ``refseq``.

    The result is a DataFrame that may be empty or contain several rows.
    """
    matches = summary_table["RefSeq"] == refseq
    return summary_table.loc[matches]


In [5]:
# Metadata per FASTA record, keyed by the record's header string.
info_map = {}
def extend_info_map(idx:str , sequence: str, length, protein):
    """Store (or overwrite) the metadata entry for ``idx`` in ``info_map``.

    ``protein`` is the DataFrame of summary rows for the record. All of its
    "Gene name" and "Protein name" values are joined with "/", while "RefSeq"
    and "UniProt ID" are taken from its first row. ``sequence`` and ``length``
    are stored unchanged.
    """
    gene_names = "/".join(protein["Gene name"].values)
    protein_names = "/".join(protein["Protein name"].values)
    first_refseq = protein["RefSeq"].values[0]
    first_uniprot = protein["UniProt ID"].values[0]

    record = {}
    record["sequence"] = sequence
    record["length"] = length
    record["Gene name"] = gene_names
    record["Protein name"] = protein_names
    record["RefSeq"] = first_refseq
    record["UniProt ID"] = first_uniprot
    info_map[idx] = record

def write_fasta(out_path, idx, sequence):
    """Append one two-line record to the file at ``out_path``.

    ``idx`` is written verbatim as the first line (no ">" is added here) and
    ``sequence`` as the second. The file is opened in append mode, so
    repeated calls accumulate records.
    """
    record = f"{idx}\n{sequence}\n"
    with open(out_path, "a") as handle:
        handle.write(record)

## Fastas

### Extract fasta from DeltaG predictions

In [6]:
# Turn every sequence listed in the "M sequences" column into one FASTA record
# named "<name> - Domain <n>" and remember its metadata in info_map.
in_path = "../data/Legionella_pneumophila_str._Philadelphia_1_effector_proteins_FASTA_18-35.xlsx"
out_path = "../fastas/dG.fasta"
dG_preds = pd.read_excel(in_path, sheet_name=0, index_col=0).dropna(axis="rows", subset=["M sequences"])
for _, row in dG_preds.iterrows():
    sequences = parse_list(row["M sequences"])
    if not sequences:
        continue

    lengths = parse_list(row["M lengths"])
    # Pulled into a local so the f-string below needs no nested double quotes,
    # which are a SyntaxError before Python 3.12.
    protein_name = row["name"]
    # The fourth "|"-separated field of the name is used as the RefSeq key.
    # NOTE(review): assumes names look like "gi|<id>|ref|<RefSeq>|..." - confirm.
    refseq = str(protein_name).split("|")[3]
    protein = row_by_refseq(refseq)

    for position, sequence in enumerate(sequences):
        # No ">" is prepended here, unlike the deep-prediction headers.
        # NOTE(review): presumably the "name" column already starts with ">" - confirm.
        idx = f"{protein_name} - Domain {position + 1}"

        # lengths is indexed (not zipped) so a sequence without a matching
        # length still fails loudly instead of being dropped.
        extend_info_map(idx, sequence, lengths[position], protein)

        write_fasta(out_path, idx, sequence)

### Extract fasta from Deep predictions

In [7]:
# Turn every entry whose "Letter" value is "M" into one FASTA record named
# "><row index> - Domain <n>" and remember its metadata in info_map.
in_path = "../data/Legionella_pneumophila_str._Philadelphia_1_effector_proteins_Deep-Output.tsv"
out_path = "../fastas/deep_preds.fasta"
deep_preds = pd.read_csv(in_path, sep="\t", index_col=0)
# iterrows() is a generator without a length; passing total lets tqdm show a
# real progress bar instead of only an iteration counter.
for _, row in tqdm(deep_preds.iterrows(), total=len(deep_preds)):
    letters = parse_list(row["Letter"])
    m_positions = [position for position, letter in enumerate(letters) if letter == "M"]
    if not m_positions:
        continue

    sequences = parse_list(row["aminoacids"])
    lengths = parse_list(row["length"])
    # NOTE(review): the value comes from the "Uniprot-ID" column but is matched
    # against the summary table's "RefSeq" column - confirm the column really
    # holds RefSeq accessions.
    refseq_id = row["Uniprot-ID"]
    protein = row_by_refseq(refseq_id)

    # Domains are numbered 1..n in the order of the "M" positions.
    for domain_counter, position in enumerate(m_positions, start=1):
        idx = f">{row.name} - Domain {domain_counter}"

        extend_info_map(idx, sequences[position], lengths[position], protein)

        write_fasta(out_path, idx, sequences[position])

296it [00:00, 4835.57it/s]


### Bring FASTA sequences into the right orientation

In [8]:
# Rewrite each FASTA file, reversing every sequence whose protein is marked
# with Orientation "turn" in the affinity table.
all_affinities_df = pd.read_excel(os.path.join("..", "data", "ipredEMC_affinities_all-results.xlsx"), sheet_name=0)
for name in names:
    fasta_path = os.path.join("..", "fastas", f"{name}.fasta")
    backup_fasta_path = os.path.join("..", "fastas", f"{name}.bak")
    # Keep the untouched file as <name>.bak and rebuild <name>.fasta from it.
    shutil.move(fasta_path, backup_fasta_path)

    with open(backup_fasta_path, 'r') as fasta_in:
        lines = fasta_in.readlines()
    for i, line in enumerate(lines):
        if not line.startswith(">"):
            continue
        # Strip only the line terminator; slicing off the last character would
        # eat a residue if the final line had no trailing newline.
        header = line.rstrip("\n")
        sequence = lines[i + 1].rstrip("\n")
        # Drop the trailing " - Domain <n>" part; the lookup key carries one
        # trailing space to match the values in the "FASTA" column.
        search_header = " - ".join(header.split(" - ")[:-1]) + " "
        orientations = all_affinities_df.loc[all_affinities_df["FASTA"] == search_header, "Orientation"]
        # Series.all() is True for an empty selection, so a header missing from
        # the table would be reversed; require at least one matching row.
        if not orientations.empty and (orientations == "turn").all():
            sequence = sequence[::-1]
        write_fasta(fasta_path, header, sequence)

  warn(msg)
  warn(msg)


## Prediction

### Run EMC predictions

In [9]:
# Run prediction.py (from the parent directory) once per FASTA file and file
# its output under ../predictions.
for name in names:
    # check=True raises CalledProcessError on a non-zero exit status, so a
    # failed run cannot lead to a stale or missing predicted_affinity.csv
    # being moved into place.
    subprocess.run(["python", "prediction.py", f"fastas/{name}.fasta"], cwd="..", check=True)
    shutil.move("../predicted_affinity.csv", f"../predictions/{name}_affinities.tsv")

Prediction is done. Please check predicted_affinity.csv
Prediction is done. Please check predicted_affinity.csv


## Collect Results

In [10]:
# Attach each predicted affinity to its info_map entry, then export the table.
for name in names:
    predictions = pd.read_csv(f"../predictions/{name}_affinities.tsv", sep="\t")
    # Iterating the two columns directly avoids nesting double quotes inside
    # an f-string, which is a SyntaxError before Python 3.12.
    for tmd, affinity in zip(predictions["TMD"], predictions["Affinity to EMC"]):
        # info_map keys start with ">", so it is prepended to the TMD value.
        # A TMD without an info_map entry raises KeyError.
        idx = f">{tmd}"
        info_map[idx].update({"Affinity to EMC": affinity})

# One row per FASTA header, one column per metadata field.
results = pd.DataFrame(info_map).transpose()
results.to_csv("../results/results.csv")