# Clean Protein Data

In [25]:
import pandas as pd
from pathlib import Path
import re
from datetime import date

In [2]:
# Re-load the autoreload extension and use mode 2, so imported modules are
# reloaded before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

Open the data in a dataframe

In [3]:
# Read the raw, semicolon-separated protein export into a DataFrame and
# preview the first five rows.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere without editing it — consider a configurable data directory.
file_path = Path('/home/mees/Desktop/Machine_Learning/subcellular_location/data/raw/protein_data_2021-02-07')
df = pd.read_csv(file_path, sep=';')
df.head(5)

Unnamed: 0.1,Unnamed: 0,Entry,Entry name,Protein names,Gene names,Sequence,Subcellular location [CC]
0,0,O95825,QORL1_HUMAN,Quinone oxidoreductase-like protein 1 (EC 1.-....,CRYZL1 4P11,MKGLYFQQSSTDEEITFVFQEKEDLPVTEDNFVKLQVKACALSQIN...,
1,1,Q9Y2J0,RP3A_HUMAN,Rabphilin-3A (Exophilin-1),RPH3A KIAA0985,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec..."
2,2,Q13905,RPGF1_HUMAN,Rap guanine nucleotide exchange factor 1 (CRK ...,RAPGEF1 GRF2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...
3,3,Q5TD94,RSH4A_HUMAN,Radial spoke head protein 4 homolog A (Radial ...,RSPH4A RSHL3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,..."
4,4,Q9HA92,RSAD1_HUMAN,Radical S-adenosyl methionine domain-containin...,RSAD1,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...


Delete the columns we don't need,
i.e. Unnamed: 0, Entry, Entry name, Protein names, Gene names.

In [4]:
# Keep only the sequence and the location annotation; the identifier and
# naming columns are not used further in this notebook.
unused_columns = [
    'Unnamed: 0',
    'Entry',
    'Entry name',
    'Protein names',
    'Gene names',
]
df = df.drop(columns=unused_columns)
df.head(5)

Unnamed: 0,Sequence,Subcellular location [CC]
0,MKGLYFQQSSTDEEITFVFQEKEDLPVTEDNFVKLQVKACALSQIN...,
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec..."
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,..."
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...


Delete Entries with NaN values in Subcellular location [CC]

In [5]:
# Drop every row that has a NaN in any remaining column, reporting the row
# count before and after.
rows_before = len(df)
print(f'Dataframe size before deleting NaN values: {rows_before}.')

df = df.dropna()

rows_after = len(df)
print(f'Dataframe size after deleting NaN values: {rows_after}.')

Dataframe size before deleting NaN values: 20394.
Dataframe size after deleting NaN values: 16750.


The subcellular location column is not nicely formatted at the moment. Each value starts with 'SUBCELLULAR LOCATION:' and some entries have multiple subcellular locations.

In this project, we count how often each location is mentioned across all entries and store the counts in a frequency table. Once the table is built, each entry keeps only one of its locations: the one with the highest count in the frequency table.

In [6]:
# Look at one raw annotation (row label 4) to see the format being cleaned.
df.loc[4, 'Subcellular location [CC]']

'SUBCELLULAR LOCATION: Mitochondrion {ECO:0000305}.'

We delete:
* 'SUBCELLULAR LOCATION:'
* Parts between {}. For this we use a regular expression.

In [7]:
# Pattern for one brace-delimited annotation such as "{ECO:0000305}".
# The negated character class stops at the first closing brace, so when a value
# holds several annotations each one is matched separately and the location
# text between them is preserved (a greedy ".*" would swallow it).
regex = re.compile(r'''
(\{[^}]*\}) # One annotation: opening brace up to the nearest closing brace.
''', re.VERBOSE)

In [8]:
# Strip the literal field prefix. regex=False makes this a plain substring
# replacement regardless of which default the installed pandas version uses.
df['Subcellular location [CC]'] = df['Subcellular location [CC]'].str.replace(
    'SUBCELLULAR LOCATION: ', '', regex=False
)

# Remove everything in between brackets ('{', '}') using the compiled pattern.
# regex=True is stated explicitly: pandas rejects a compiled pattern when the
# replacement is treated as a literal, which is the default in pandas >= 2.0.
df['Subcellular location [CC]'] = df['Subcellular location [CC]'].str.replace(
    regex, '', regex=True
)

df.head()

Unnamed: 0,Sequence,Subcellular location [CC]
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"Cytoplasmic vesicle, secretory vesicle, synapt..."
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,Early endosome .
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"Cytoplasm, cytoskeleton, cilium axoneme ."
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,Mitochondrion .
6,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,Membrane .


Add all the possibilities in the Subcellular location [CC] column to a list to make the frequency table. Some are separated by a comma (,), others by a semicolon (;), and others by a dot (.), so we should split on all of them.

Therefore, we will also use a regex

In [9]:
# Inspect a cleaned value (row label 3) that lists several locations.
df.loc[3, 'Subcellular location [CC]']

'Cytoplasm, cytoskeleton, cilium axoneme .'

In [10]:
# Tokenizer for location names; this rebinds `regex` from the compiled brace
# pattern to a plain pattern string. Each match is a maximal run of word
# characters, whitespace, or '|' (inside [...] the '|' is a literal pipe, not
# alternation). Because of the '*' quantifier the pattern also yields empty
# matches, which the loop that consumes the matches filters out.
regex = r'([\w|\s]*)'

In [11]:
# Gather every location mention from the whole column into one flat list.
# Each regex match is stripped of surrounding whitespace, and matches that are
# empty after stripping are skipped.
all_protein_locations = [
    mention
    for cell_text in df['Subcellular location [CC]']
    for mention in (found.group(0).strip() for found in re.finditer(regex, cell_text))
    if mention
]

In [12]:
def frequency_table(protein_location_list):
    """Count how often each item occurs in ``protein_location_list``.

    Returns a plain dict mapping every distinct item to its number of
    occurrences; keys appear in order of first occurrence.
    """
    counts = {}
    for name in protein_location_list:
        counts[name] = counts.get(name, 0) + 1
    return counts

In [13]:
# Count how many times each location name was mentioned across all entries.
freq_table = frequency_table(all_protein_locations)

Now, for each entry, we set the subcellular location field to the one of its locations that has the highest count in the frequency table.

In [14]:
# Same tokenizer pattern that was used to build the frequency table. The two
# must produce identical tokens, because every token found with this pattern
# is looked up in freq_table by key.
regex = r'([\w|\s]*)'

In [15]:
# For every entry keep only the candidate location with the highest count in
# freq_table. Ties go to the candidate mentioned first, and an entry without
# any non-empty candidate ends up with an empty string.
for row_label, raw_locations in list(df['Subcellular location [CC]'].items()):
    stripped = (found.group(0).strip() for found in re.finditer(regex, raw_locations))
    candidates = [name for name in stripped if name]

    df.loc[row_label, 'Subcellular location [CC]'] = max(
        candidates,
        key=lambda name: freq_table[name],
        default='',
    )

In [16]:
# Preview the first rows; the location column now holds one location per entry.
df.head()

Unnamed: 0,Sequence,Subcellular location [CC]
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,Cytoplasmic vesicle
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,Early endosome
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,Cytoplasm
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,Mitochondrion
6,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,Membrane


In [17]:
def df_freq_table(data):
    """Count the rows of ``data`` per value of 'Subcellular location [CC]'.

    Returns a plain dict mapping each distinct value in that column to the
    number of rows holding it; keys appear in order of first occurrence.
    """
    counts = {}
    for value in data['Subcellular location [CC]']:
        counts[value] = counts.get(value, 0) + 1
    return counts

In [20]:
# Rebinds freq_table: it now counts rows per final location instead of
# counting individual location mentions.
freq_table = df_freq_table(df)

In [24]:
# Sanity check: the per-location row counts must add up to the number of rows.
total = sum(freq_table.values())

assert total == len(df)