# Notes

Used data (setDeepLoc and setHARD) from paper "Light Attention Predicts Protein Location from the Language of Life" by Stärk et al. available at: https://github.com/HannesStark/protein-localization/tree/master/data_files

Original data from http://www.cbs.dtu.dk/services/DeepLoc/data.php.

**Data explanation**:
- setDeepLoc:
    - Train: 9503 proteins
    - Validation: 1158 proteins redundance reduced (<= 30% PIDE (pairwise sequence identity))
    - Test: 2768 proteins redundance reduce to the training set (<= 30% PIDE & E-values <= 10^-6)
- setHARD:
    - Test: 490 proteins representative of clusters at >= 20% PIDE of proteins from SwissProt (only eukaryote, no fragments, >= 40 residues and proteins deleting with >= 20% PIDE to all sets)


**Proposed splits**:
- From original paper data:
    - `mixed_soft`: deepLoc train + deepLoc validation + deepLoc test ---> Ready!
    - `mixed_hard`: deepLoc train + deepLoc validation + testHARD test ---> Ready!
- New proposals:
    - `human_soft`: deepLoc train + deepLoc validation + deepLoc test (only human proteins) ---> Ready!
    - `human_hard`: deepLoc train + deepLoc validation + testHARD test (only human proteins) ---> Ready!
    - `mixed_balanced`: TODO
    - `human_balanced`: TODO

# Configs & Imports

In [1]:
import re

from pathlib import Path

from pandas import DataFrame, read_csv

import numpy as np

from Bio import SeqIO

In [39]:
# Where the raw data is stored and where processed data will be deposited
data_path = Path('') / '..' / 'data' / 'subcellular_localization'
split_path = Path('') / '..' / 'splits' / 'scl'

# There are 4 datasets:
# From original the original deepLoc: one for training, one for validation and one for test
# From Stärk et al. paper: one for test
train_data_path = data_path / 'deeploc_our_train_set.fasta'
validation_data_path = data_path / 'deeploc_our_val_set.fasta'
test_data_path = data_path / 'deeploc_test_set.fasta'
final_test_data_path = data_path / 'setHARD.fasta'

# For the human data, we need to map gene names to UniProt accessions
# Then we use a TSV export from UniProt to map the sequence to the gene name # TODO
human_sequences_path = data_path / 'human_sequences.tsv'

# Obtain original datasets

In [40]:
# Function to encapsulate the reading of the four data files
def getProteinsFromFile(filePath):
    columns = ["id", "name", "location", "sequence"]
    dataset = DataFrame(columns = columns)
    
    for protein in SeqIO.parse(filePath, "fasta"):
        dataset = dataset.append({"id": protein.id,
                                            "name": protein.name,
                                            "location": protein.description.split(" ")[1],
                                            #"Location": re.search(r'[a-zA-Z]*\.([a-zA-Z]*)-[a-zA-Z]*', record.description).group(1), 
                                            "sequence": str(protein.seq)}, ignore_index=True)
    
    return dataset

In [41]:
# Obtain data from sets
train_set_full = getProteinsFromFile(train_data_path)
validation_set_full = getProteinsFromFile(validation_data_path)
test_set_full = getProteinsFromFile(test_data_path)
test_set_hard = getProteinsFromFile(final_test_data_path)

# Obtain human data
human_sequences_data = read_csv(human_sequences_path, sep='\t')

In [42]:
# Simplificate classes (e.g. Nucleus-S and Nucleus-M to Nucleus)
modification = lambda row : row["location"].split("-")[0].replace(".", " ")

train_set_full["location"] = train_set_full.apply(modification, axis = 1)
validation_set_full["location"] = validation_set_full.apply(modification, axis = 1)
test_set_full["location"] = test_set_full.apply(modification, axis = 1)
test_set_hard["location"] = test_set_hard.apply(modification, axis = 1)

# From original paper data

## mixed_soft (deepLoc train + deepLoc validation + deepLoc test)

In [5]:
mixed_soft = DataFrame(columns = ["sequence", "target", "set", "validation"])

# Add train set (deepLoc)
tmp = train_set_full[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(train_set_full))
mixed_soft = mixed_soft.append(tmp)

# Add validation set (deepLoc)
tmp = validation_set_full[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(validation_set_full))
tmp.insert(2, "validation", [True]*len(validation_set_full))
mixed_soft = mixed_soft.append(tmp)

# Add test set (deepLoc)
tmp = test_set_full[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["test"]*len(test_set_full))
mixed_soft = mixed_soft.append(tmp)

mixed_soft = mixed_soft.reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [8]:
mixed_soft.to_csv(split_path / 'splits' / 'mixed_soft.cvs', index = False)

## mixed_hard (deepLoc train + deepLoc validation + testHARD test)

In [9]:
mixed_hard = DataFrame(columns = ["sequence", "target", "set", "validation"])

# Add train set (deepLoc)
tmp = train_set_full[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(train_set_full))
mixed_hard = mixed_hard.append(tmp)

# Add validation set (deepLoc)
tmp = validation_set_full[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(validation_set_full))
tmp.insert(2, "validation", [True]*len(validation_set_full))
mixed_hard = mixed_hard.append(tmp)

# Add test set (setHARD)
tmp = test_set_hard[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["test"]*len(test_set_hard))
mixed_hard = mixed_hard.append(tmp)

mixed_hard = mixed_hard.reset_index(drop = True)

In [10]:
mixed_hard.to_csv(split_path / 'splits' / 'mixed_hard.cvs', index = False)

# New proposals

## human_soft (deepLoc train + deepLoc validation + deepLoc test (only human proteins))

In [11]:
human_soft = DataFrame(columns = ["sequence", "target", "set", "validation"])

# Add train set (deepLoc)
tmp = train_set_full[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(train_set_full))
human_soft = human_soft.append(tmp)

# Add validation set (deepLoc)
tmp = validation_set_full[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(validation_set_full))
tmp.insert(2, "validation", [True]*len(validation_set_full))
human_soft = human_soft.append(tmp)

In [19]:
# Get human protein names
human_protein_names = set(human_sequences_data['Entry'])

# Get human proteins from test_set_full if human proteins
human_in_test_set_full = test_set_full.loc[test_set_full['name'].isin(human_protein_names)]

In [20]:
# Add test set (deepLoc)
tmp = human_in_test_set_full[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["test"]*len(human_in_test_set_full))
human_soft = human_soft.append(tmp)

human_soft = human_soft.reset_index(drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [22]:
human_soft.to_csv(split_path / 'splits' / 'human_soft.cvs', index = False)

## human_hard (deepLoc train + deepLoc validation + testHARD test (only human proteins))

In [23]:
human_hard = DataFrame(columns = ["sequence", "target", "set", "validation"])

# Add train set (deepLoc)
tmp = train_set_full[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(train_set_full))
human_hard = human_hard.append(tmp)

# Add validation set (deepLoc)
tmp = validation_set_full[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(validation_set_full))
tmp.insert(2, "validation", [True]*len(validation_set_full))
human_hard = human_hard.append(tmp)

In [24]:
# Get human proteins from test_set_hard if human proteins
human_in_test_set_hard = test_set_hard.loc[test_set_hard['name'].isin(human_protein_names)]

In [25]:
# Add test set (setHARD)
tmp = human_in_test_set_hard[["sequence", "location"]]
tmp.rename(columns = {"location": "target"}, inplace = True)
tmp.insert(2, "set", ["test"]*len(human_in_test_set_hard))
human_hard = human_hard.append(tmp)

human_hard = human_hard.reset_index(drop = True)

In [26]:
human_hard.to_csv(split_path / 'splits' / 'human_hard.cvs', index = False)

## balanced

In [49]:
# Let's take a look at the value counts for the classes in train_set_full (other sets are proportionals)
train_set_full['location'].value_counts()

Nucleus                  2752
Cytoplasm                1862
Extracellular            1322
Mitochondrion            1003
Cell membrane             906
Endoplasmic reticulum     594
Plastid                   511
Golgi apparatus           238
Lysosome/Vacuole          214
Peroxisome                101
Name: location, dtype: int64