# Clean Protein Data

In [20]:
import pandas as pd
from pathlib import Path
import re
from datetime import date
import numpy as np

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

Open the data in a dataframe

In [93]:
file_path = Path('/home/mees/Desktop/Machine_Learning/subcellular_location/data/raw/protein_data_2021-02-07')
df = pd.read_csv(file_path, sep=';')
df.head(5)

Unnamed: 0.1,Unnamed: 0,Entry,Entry name,Protein names,Gene names,Sequence,Subcellular location [CC]
0,0,O95825,QORL1_HUMAN,Quinone oxidoreductase-like protein 1 (EC 1.-....,CRYZL1 4P11,MKGLYFQQSSTDEEITFVFQEKEDLPVTEDNFVKLQVKACALSQIN...,
1,1,Q9Y2J0,RP3A_HUMAN,Rabphilin-3A (Exophilin-1),RPH3A KIAA0985,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec..."
2,2,Q13905,RPGF1_HUMAN,Rap guanine nucleotide exchange factor 1 (CRK ...,RAPGEF1 GRF2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...
3,3,Q5TD94,RSH4A_HUMAN,Radial spoke head protein 4 homolog A (Radial ...,RSPH4A RSHL3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,..."
4,4,Q9HA92,RSAD1_HUMAN,Radical S-adenosyl methionine domain-containin...,RSAD1,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...


Delete the columns we don't need.
I.e: Unnamed, Entry, Entryy name, Protein names, Gene Names

In [94]:
df = df.drop(columns=['Unnamed: 0', 'Entry', 'Entry name', 'Protein names', 'Gene names'], axis=1)
df.head(5)

Unnamed: 0,Sequence,Subcellular location [CC]
0,MKGLYFQQSSTDEEITFVFQEKEDLPVTEDNFVKLQVKACALSQIN...,
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec..."
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,..."
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...


Delete Entries with NaN values in Subcellular location [CC]

In [95]:
print(f'Dataframe size before deleting NaN values: {len(df)}.')
df = df.dropna()
print(f'Dataframe size after deleting NaN values: {len(df)}.')

Dataframe size before deleting NaN values: 20394.
Dataframe size after deleting NaN values: 16750.


The subcellular location column is not nicely formated at this moment. It starts with: 'SUBCELLULAR LOCATION' and some entries have multiple subcellular locations.

In this project, it is chosen to count the number of times a location is mentioned in a frequency table. After the frequency table is made. We use this table to determine which location stays, which will be the highest amount in the frequency table.

In [96]:
df['Subcellular location [CC]'][7]

'SUBCELLULAR LOCATION: Cell membrane {ECO:0000269|PubMed:28169360}; Peripheral membrane protein. Cell junction {ECO:0000269|PubMed:27380321}. Cell junction, adherens junction. Cell projection, lamellipodium. Cytoplasm. Cell junction, synapse, postsynapse {ECO:0000269|PubMed:15182672}. Cell junction, synapse, presynapse {ECO:0000269|PubMed:15182672}. Note=Targeting to cell-cell junctions which is CDH1-dependent is required for the pro-apoptotic activity. In a subset of CD4+ T-cells, colocalizes with CRTAM at the immunological synapse during the late phase of T-cell activation (By similarity). {ECO:0000250|UniProtKB:Q80U72}.'

We delete:
* 'SUBCELLULAR LOCATION:'
* Parts between {}. For this we use regular expression

In [97]:
regex = re.compile(r'''
(\{.*?\}) # Remove everything between brackets
| # OR
Note=.* # Remove everyting after NOTE=

''', re.VERBOSE)

In [98]:
# Replace the first part of the string with nothing
df['Subcellular location [CC]'] = df['Subcellular location [CC]'].str.replace('SUBCELLULAR LOCATION: ', '')

# Remove everything in between brackets ('{', '}')
df['Subcellular location [CC]'] = df['Subcellular location [CC]'].str.replace(regex, '')

df.head()

Unnamed: 0,Sequence,Subcellular location [CC]
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"Cytoplasmic vesicle, secretory vesicle, synapt..."
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,Early endosome .
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"Cytoplasm, cytoskeleton, cilium axoneme . Cell..."
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,Mitochondrion .
6,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,Membrane ; Multi-pass membrane protein .


In [99]:
df['Subcellular location [CC]'][7]

'Cell membrane ; Peripheral membrane protein. Cell junction . Cell junction, adherens junction. Cell projection, lamellipodium. Cytoplasm. Cell junction, synapse, postsynapse . Cell junction, synapse, presynapse . '

Add all the possabilities in subcellular location CC in a list to make the frequency table. I see that some are separated by a comma (,) others by a semicolon (;) and others by a dot (.). So we should separate on everything.

Therefore, we will also use a regex

In [100]:
df['Subcellular location [CC]'][7]

'Cell membrane ; Peripheral membrane protein. Cell junction . Cell junction, adherens junction. Cell projection, lamellipodium. Cytoplasm. Cell junction, synapse, postsynapse . Cell junction, synapse, presynapse . '

In [101]:
regex = r'([\w|\s]*)'

In [102]:
# Set up a list to add the locations to
all_protein_locations = []

# Iterate over the dataframe entries
for index, row in df.iterrows():
    protein_locations = row['Subcellular location [CC]']
    
    # Use regex to find all locations which are stored in groups
    for match in re.finditer(regex, protein_locations):
        
        # Don't add to list if the group is empty
        if match.group(0).strip():
            all_protein_locations.append(match.group(0).strip())

In [103]:
def frequency_table(protein_location_list):
    # Create a frequency table from a list
    frequency = {}
    
    for location in protein_location_list:
        frequency.setdefault(location, 0)
        frequency[location] += 1
        
    return frequency

In [104]:
freq_table = frequency_table(all_protein_locations)

In [105]:
freq_table

{'Cytoplasmic vesicle': 552,
 'secretory vesicle': 172,
 'synaptic vesicle membrane': 53,
 'Cell projection': 1237,
 'dendritic spine': 58,
 'Cell junction': 931,
 'synapse': 528,
 'postsynaptic cell membrane': 121,
 'Membrane': 2050,
 'Peripheral membrane protein': 1298,
 'Early endosome': 125,
 'Cytoplasm': 6619,
 'cytoskeleton': 1700,
 'cilium axoneme': 80,
 'cilium': 266,
 'Mitochondrion': 571,
 'Multi': 3395,
 'pass membrane protein': 4163,
 'Cell membrane': 3597,
 'adherens junction': 68,
 'lamellipodium': 97,
 'postsynapse': 13,
 'presynapse': 28,
 'Endoplasmic reticulum membrane': 835,
 'Single': 2869,
 'pass type IV membrane protein': 101,
 'Mitochondrion membrane': 92,
 'COPII': 20,
 'coated vesicle membrane': 50,
 'Cytoplasmic side': 615,
 'cytosol': 500,
 'dendritic spine membrane': 7,
 'pass type I membrane protein': 1430,
 'flagellum': 67,
 'Perikaryon': 90,
 'axon': 144,
 'growth cone': 63,
 'filopodium': 39,
 'Nucleus': 5573,
 'Apical cell membrane': 156,
 'Secreted': 2

Now, for each entry we are going to adjust the subcellular location field to the value that is the highest in the frequency table.

In [106]:
regex = r'([\w|\s]*)'

In [107]:
# Iterate over the dataframe
for index, row in df.iterrows():
    most_occurencies = 0
    final_location = ''
    protein_locations = row['Subcellular location [CC]']
 
    # Use regex to find all locations which are stored in groups
    for match in re.finditer(regex, protein_locations):
        location = match.group(0).strip()
        
        # Don't search for the location if the match is empty
        if location:
        
            occurencies = freq_table[location]

            if occurencies > most_occurencies:
                most_occurencies = occurencies
                final_location = location

    df.loc[index,'Subcellular location [CC]'] = final_location

In [108]:
df.head()

Unnamed: 0,Sequence,Subcellular location [CC]
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,Membrane
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,Early endosome
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,Cytoplasm
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,Mitochondrion
6,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,pass membrane protein


In [109]:
def df_freq_table(data):
    freq = {}
    
    for index, row in data.iterrows():
        location = row['Subcellular location [CC]']
        freq.setdefault(location, 0)
        freq[location] += 1
    
    return freq

In [110]:
freq_table = df_freq_table(df)

In [111]:
total = 0
for key, value in freq_table.items():
    total += value
    
assert total == len(df)

In [112]:
freq_table

{'Membrane': 89,
 'Early endosome': 11,
 'Cytoplasm': 5229,
 'Mitochondrion': 328,
 'pass membrane protein': 3314,
 'Single': 877,
 'Nucleus': 3087,
 'Mitochondrion matrix': 121,
 'Cell membrane': 1371,
 'Secreted': 1440,
 'Golgi apparatus': 56,
 'Nucleus speckle': 26,
 'Cell junction': 43,
 'Peroxisome': 31,
 'Endosome': 9,
 'Cell projection': 92,
 'Peripheral membrane protein': 248,
 'Cytoplasmic vesicle': 39,
 'Lysosome': 45,
 'Chromosome': 36,
 'anchor': 30,
 'Lysosome membrane': 16,
 'Endoplasmic reticulum membrane': 14,
 'Early endosome membrane': 3,
 'Cytoplasmic granule': 5,
 'Cell surface': 2,
 'Mitochondrion intermembrane space': 12,
 'Endoplasmic reticulum': 43,
 'Endoplasmic reticulum lumen': 43,
 'Melanosome': 6,
 'Mitochondrion inner membrane': 18,
 'Mitochondrion outer membrane': 9,
 'Nucleus matrix': 7,
 'Nucleus membrane': 5,
 'Basal cell membrane': 1,
 'Virion': 5,
 'Apical cell membrane': 2,
 'Endosome membrane': 2,
 'Vesicle': 1,
 'Recycling endosome': 2,
 'Isoform 

From this list of classes, we can combine some classes together to remove redundant classes. We search in the text if it mentions a class and we change the location to that class.

In [113]:
def removeRedundant(x):
    redundant = ['Cell membrane', 'Cytoplasmic granule', 'Endosome', 
            'Golgi apparatus','Lysosome','Melanosome', 'Mitochondrion',
            'Nucleus', 'Perixome', 'Preautophagosomal', 'Rough endoplasmic reticulum',
             'Sarcoplasmic reticulum']
    
    x_lower = x.lower().strip()
    
    for red in redundant:
        
        if red.lower() in x_lower:
            return red
        
    return x

In [114]:
removeRedundant('mitochondrion inner membrane')

'Mitochondrion'

In [115]:
df['Subcellular location [CC]'] = df['Subcellular location [CC]'].map(removeRedundant)

In [116]:
freq_table = df_freq_table(df)

In [117]:
freq_table

{'Membrane': 89,
 'Endosome': 30,
 'Cytoplasm': 5229,
 'Mitochondrion': 492,
 'pass membrane protein': 3314,
 'Single': 877,
 'Nucleus': 3129,
 'Cell membrane': 1375,
 'Secreted': 1440,
 'Golgi apparatus': 58,
 'Cell junction': 43,
 'Peroxisome': 31,
 'Cell projection': 92,
 'Peripheral membrane protein': 248,
 'Cytoplasmic vesicle': 39,
 'Lysosome': 62,
 'Chromosome': 36,
 'anchor': 30,
 'Endoplasmic reticulum membrane': 14,
 'Cytoplasmic granule': 5,
 'Cell surface': 2,
 'Endoplasmic reticulum': 43,
 'Endoplasmic reticulum lumen': 43,
 'Melanosome': 6,
 'Virion': 5,
 'Vesicle': 1,
 'Isoform 1': 2,
 'Peroxisome matrix': 2,
 'Sarcoplasmic reticulum': 2,
 'Endomembrane system': 3,
 'Membrane raft': 2,
 '': 3,
 'Midbody': 1,
 'Lipid droplet': 2}

In [118]:
total = 0
for key, value in freq_table.items():
    total += value
    
assert total == len(df)

In [119]:
processed_dir = Path('/home/mees/Desktop/Machine_Learning/subcellular_location/data/processed')
today = date.today()
filename = f'protein_data_{today}.csv'
file_path = Path(processed_dir, filename)

In [120]:
df.to_csv(file_path, sep=';', index=False)

## Classes according to paper

To have a better benchmark, we are going to classify the proteins based on the same classes as https://doi.org/10.1093/bioinformatics/btx431

In [158]:
paper_classes = ['Nucleus', 'Cytoplasm', 'Extracellular', 'Mitochondrion', 'Cell membrane', 'Endoplasmic reticulum', 'Plastid', 'Golgi apparatus', 'Lysosome/ Vacuole', 'Peroxisome', 'Endosome', 'Secreted', 'Lysosome', 'Vacuole']

In [159]:
file_path = Path('/home/mees/Desktop/Machine_Learning/subcellular_location/data/raw/protein_data_2021-02-07')
df = pd.read_csv(file_path, sep=';')
df.head(5)

Unnamed: 0.1,Unnamed: 0,Entry,Entry name,Protein names,Gene names,Sequence,Subcellular location [CC]
0,0,O95825,QORL1_HUMAN,Quinone oxidoreductase-like protein 1 (EC 1.-....,CRYZL1 4P11,MKGLYFQQSSTDEEITFVFQEKEDLPVTEDNFVKLQVKACALSQIN...,
1,1,Q9Y2J0,RP3A_HUMAN,Rabphilin-3A (Exophilin-1),RPH3A KIAA0985,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec..."
2,2,Q13905,RPGF1_HUMAN,Rap guanine nucleotide exchange factor 1 (CRK ...,RAPGEF1 GRF2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...
3,3,Q5TD94,RSH4A_HUMAN,Radial spoke head protein 4 homolog A (Radial ...,RSPH4A RSHL3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,..."
4,4,Q9HA92,RSAD1_HUMAN,Radical S-adenosyl methionine domain-containin...,RSAD1,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...


In [160]:
# Drop the unused columns
df.drop(columns=['Unnamed: 0', 'Entry', 'Entry name', 'Protein names', 'Gene names'], inplace=True)
df.head(5)

Unnamed: 0,Sequence,Subcellular location [CC]
0,MKGLYFQQSSTDEEITFVFQEKEDLPVTEDNFVKLQVKACALSQIN...,
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec..."
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,..."
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...


In [161]:
# Drop the NAN version
df.dropna(inplace=True)
df.head(5)

Unnamed: 0,Sequence,Subcellular location [CC]
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec..."
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,..."
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...
6,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...


In [174]:
df['Subcellular location [CC]'] = df['Subcellular location [CC]'].str.strip()

df.head(5)

Unnamed: 0,Sequence,Subcellular location [CC],Location
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec...",
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...,
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,...",
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...,
6,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,Cell membrane


In [204]:
df['Location'] = np.nan

In [205]:
def set_location(location):
    for protein_class in paper_classes:
        if protein_class.lower() in location.lower():
            if protein_class in ['Lysosome', 'Vacuole']:
                return 'Lysosome/ Vacuole'
            
            else:
                return protein_class          

In [206]:
df['Location'] = df['Subcellular location [CC]'].apply(set_location)

In [207]:
len(df)

16750

In [208]:
df.head(5)

Unnamed: 0,Sequence,Subcellular location [CC],Location
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec...",Cytoplasm
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...,Endosome
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,...",Cytoplasm
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...,Mitochondrion
6,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,


In [209]:
len(df[df['Location'].isna()])

1592

In [210]:
splitted_na = df[df['Location'].isna()]['Subcellular location [CC]'].str.split()

In [211]:
na_left = set()
for ind, row in enumerate(splitted_na, 0):
    na_left.add(row[2])

In [212]:
na_left

{'Cell',
 'Chromosome',
 'Chromosome,',
 'Endomembrane',
 'Lipid',
 'Melanosome',
 'Membrane',
 'Membrane,',
 'Membrane;',
 'Microsome',
 'Midbody,',
 'Note=Contrary',
 'Note=Localized',
 'Note=Located',
 'Perikaryon',
 'Photoreceptor',
 'Sarcoplasmic',
 'Vesicle',
 'Virion',
 'Virion.',
 '[Isoform'}

In [200]:
def set_membrane_to_cell_membrane(location):
    if 'membrane' in location.lower():
        return 'Cell membrane'
    else:
        return 

In [214]:
df_nan_location = df[df['Location'].isna()]

In [215]:
df_nan_location['Location'] = df['Subcellular location [CC]'].apply(set_membrane_to_cell_membrane)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [217]:
df_nan_location.head(5)

Unnamed: 0,Sequence,Subcellular location [CC],Location
6,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,Cell membrane
10,MRRVLRLLLGCFLTELCARVCRAQERAGHGQLAQLGGVLLLAGGNR...,"SUBCELLULAR LOCATION: Cell projection, dendrit...",Cell membrane
16,MESGTSSPQPPQLDPLDAFPQKGLEPGDIAVLVLYFLFVLAVGLWS...,SUBCELLULAR LOCATION: Membrane {ECO:0000250|Un...,Cell membrane
34,MVSSPCTQASSRTCSRILGLSLGTAALFAAGANVALLLPNWDVTYL...,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,Cell membrane
36,MADKVQTTLLFLAVGEFSVGILGNAFIGLVNCMDWVKKRKIASIDL...,SUBCELLULAR LOCATION: Membrane; Multi-pass mem...,Cell membrane


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().