<h2>Filtering UniProt Data</h2>

<p>We want to filter the file: 'UniProt_human_variants_reduced.csv', to only include reviewed UniProt entries with a evidence at the protein level.</p> 

<span>The list of UniProt proteins with evidence at the protein level can be found in: 'uniprot_human_reviewed.csv'.</span>


In [61]:
import os

# establish files path
FILES_PATH = os.path.join(os.environ.get('PYTHONPATH'),'files')

<p>We first load both files into a dataframe</p>

In [62]:
import pandas as pd

humanVariants_path = os.path.join(FILES_PATH, 'uniprot_variants_test.csv')
reviewedProteins_path = os.path.join(FILES_PATH, 'uniprot_human_reviewed.csv')

humanVariants_df = pd.read_csv(humanVariants_path, sep='\t')
reviewedProteins_df = pd.read_csv(reviewedProteins_path, sep='\t')

<p>
   humanVariants_df has the columns
   <ul>
      <li>Gene Names</li>
      <li>AC</li>
      <li>Variant AA Change</li>
      <li>Source DB ID</li>
      <li>Consequence Type</li>
      <li>Clinical Significance</li>
      <li>Evidence</li>
   </ul>
</p>

<p>
   reviewedProteins_df has the columns
   <ul>
      <li>Entry</li>
      <li>Entry Name</li>
      <li>Gene Names</li>
      <li>Length</li>
   </ul>
</p>

In [84]:
'''
Create a new dataframe where:
- humanVariants_df['AC'] == reviewedProteins_df['Entry']
- humanVariants_df['Consequence Type'] == 'missense variant'
- humanVariants_df['Clinical Significance'] != anything ambiguous

The new dataframe will have columns of:
- Entry
- Entry Name
- Gene Names
- Length
- Variant AA Change
- Source DB ID
- Clinical Significance
- Evidence
'''
import re

# assemble reviewed proteins into a dictionary
REVIEWED_PROTEINS = {}

for idx, protein in reviewedProteins_df.loc[:, 'Entry'].items():
    REVIEWED_PROTEINS[protein] = idx

# reviewedProteins_df.set_index('Entry')

REVIEWED_PROTEINS_COLS = reviewedProteins_df.columns
# print(humanVariants_df.columns)

FILTERED_DF_COL = [
    'Variant AA Change',
    'Source DB ID',
    'Clinical Significance',
    'Evidence',
]

filtered_df = pd.DataFrame(columns=FILTERED_DF_COL + list(REVIEWED_PROTEINS_COLS))

clinicalSigRegex = r'\b(conflicting|uncertain|Conflicting|Uncertain)\b'

for idx, row in humanVariants_df.iterrows():

    rowDict = {}

    # CHECK FOR
    # - humanVariants_df['AC'] == reviewedProteins_df['Entry']
    # - humanVariants_df['Consequence Type'] == 'missense variant'
    # - humanVariants_df['Clinical Significance'] != anything ambiguous

    # print(row['AC'])

    try:
        reviewedProteinsIdx = REVIEWED_PROTEINS[row['AC']]
        # print(row['Consequence Type'] == 'missense variant')
        # print(bool(re.search(clinicalSigRegex, row['Clinical Significance'])) == False)

        if row['Consequence Type'] == 'missense variant' and bool(re.search(clinicalSigRegex, row['Clinical Significance'])) == False:

            for col1 in REVIEWED_PROTEINS_COLS:
                rowDict[col1] = reviewedProteins_df.loc[reviewedProteinsIdx, col1]
                
            for col2 in FILTERED_DF_COL:
                rowDict[col2] = row[col2]
            
            filtered_df.append()
            
    except KeyError:
        pass

print(filtered_df)

AttributeError: 'DataFrame' object has no attribute 'append'