# National Cancer Institute database processing

In [None]:
import pandas as pd
import requests

## Import data

In [None]:
# Load the Early Detection Research Network biomarker export. The path is
# relative, so it is resolved against the current working directory.
df = pd.read_csv('Biomarkers —Early Detection Research NetworkBiomarkers.csv')
print(df.head(10))
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() only appends a stray "None" line after the report.
df.info()

                            Title     Type     Organ Phase
0                            fPSA  Protein  Prostate     3
1                          proPSA  Protein  Prostate     3
2                            BPSA  Protein  Prostate     2
3                          EPCA-2  Protein  Prostate     2
4  IHC and FISH for T2-ERG fusion  Protein  Prostate     2
5                            KLK4  Protein  Prostate     2
6                            KLK2  Protein  Prostate  1, 2
7                           AMACR  Protein  Prostate     1
8                            CDK7  Protein  Prostate     1
9                            FLNA  Protein  Prostate     1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 627 entries, 0 to 626
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   627 non-null    object
 1   Type    627 non-null    object
 2   Organ   627 non-null    object
 3   Phase   627 non-null    object
dtypes: object(4)
memory usage:

In [None]:
# Expose the biomarker title under the column name 'name'.
column_renames = {'Title': 'name'}
df = df.rename(columns=column_renames)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 627 entries, 0 to 626
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    627 non-null    object
 1   Type    627 non-null    object
 2   Organ   627 non-null    object
 3   Phase   627 non-null    object
dtypes: object(4)
memory usage: 19.7+ KB


## Parse sequences

In [None]:
def fetch_protein_sequence(protein_name, timeout=30):
    """
    Search UniProtKB for a human protein by its name and return its sequence.

    Parameters
    ----------
    protein_name : str
        Free-text name inserted verbatim into the UniProt search query.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up. Without a
        timeout a single stalled connection would block the caller forever.

    Returns
    -------
    str or None
        The sequence string of the first search hit. None is returned when
        the request fails, the server answers with a non-200 status, the body
        is not valid JSON, there are no hits, or the hit carries no sequence.
    """
    base_url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": f"{protein_name} AND organism_id:9606",  # Search only human proteins
        "fields": "accession,sequence",
        "format": "json",
        "size": 1
    }

    # A network failure for one name must not abort a bulk lookup over the
    # whole table, so it is reported and treated as "not found".
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request failed for {protein_name!r}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Ошибка: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON.
        return None

    results = data.get("results")
    if not results:
        return None

    # Only the first hit is used; tolerate an entry without a sequence field.
    return results[0].get("sequence", {}).get("value")


In [None]:
# Query UniProt once per distinct name and reuse the answer for repeated names,
# so duplicated biomarker titles do not trigger duplicate HTTP requests.
sequence_by_name = {
    protein_name: fetch_protein_sequence(protein_name)
    for protein_name in df['name'].unique()
}
# Names whose lookup returned None end up as missing values in 'content'.
df['content'] = df['name'].map(sequence_by_name)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 627 entries, 0 to 626
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     627 non-null    object
 1   Type     627 non-null    object
 2   Organ    627 non-null    object
 3   Phase    627 non-null    object
 4   content  589 non-null    object
dtypes: object(5)
memory usage: 24.6+ KB


In [None]:
# Discard every row that has a missing value in any column, then renumber the
# surviving rows from 0.
df = df.dropna(how='any').reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     589 non-null    object
 1   Type     589 non-null    object
 2   Organ    589 non-null    object
 3   Phase    589 non-null    object
 4   content  589 non-null    object
dtypes: object(5)
memory usage: 23.1+ KB


## Transform data to appropriate format

In [None]:
# Merge organ and phase into a single 'conditions' label, remove the two source
# columns, and rename 'Type' to 'class'.
condition_labels = df['Organ'] + ' cancer phase(s) ' + df['Phase'].astype(str)
df = (
    df.assign(conditions=condition_labels)
      .drop(columns=['Organ', 'Phase'])
      .rename(columns={'Type': 'class'})
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        589 non-null    object
 1   class       589 non-null    object
 2   content     589 non-null    object
 3   conditions  589 non-null    object
dtypes: object(4)
memory usage: 18.5+ KB


In [None]:
# Keep the first row of every (content, conditions) pair.
dedup_keys = ['content', 'conditions']
df = df.drop_duplicates(subset=dedup_keys, keep='first')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 586 entries, 0 to 588
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        586 non-null    object
 1   class       586 non-null    object
 2   content     586 non-null    object
 3   conditions  586 non-null    object
dtypes: object(4)
memory usage: 22.9+ KB


## Checking for duplicates between databases

In [None]:
# Previously built marker tables that this dataset is compared against.
# Paths are relative to the current working directory.
marker_df_unique_proteins = pd.read_csv('unique_protein_biomarkers.csv')
marker_df_unique_conditions = pd.read_csv('unique_conditions.csv')
marker_df_protein_condition = pd.read_csv('protein_condition.csv')

datasets = [marker_df_unique_proteins, marker_df_unique_conditions, marker_df_protein_condition]
for data in datasets:
    # info() prints its own report and returns None; print(data.info()) would
    # add a "None" line after every table.
    data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        200 non-null    object
 1   uniprot_id  200 non-null    object
 2   content     199 non-null    object
dtypes: object(3)
memory usage: 4.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 739 entries, 0 to 738
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   conditions  739 non-null    object
dtypes: object(1)
memory usage: 5.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              801 non-null    object
 1   uniprot_id        801 non-null    object
 2   conditions        801 non-null    object
 3   indication_types  801 non-null    object
 4

In [None]:
# Sequences present both in this dataset and in the existing marker table.
# Built from `df` directly: `df_unique_prot` is only created further down from
# `filtered_df`, which itself depends on `common_seqs`, so referencing it here
# raises NameError on a fresh kernel.
# Missing values are dropped first so NaN can never count as a shared sequence.
common_seqs = set(df['content'].dropna()).intersection(
    marker_df_unique_proteins['content'].dropna()
)
print("Intersection in column 'content':", common_seqs)

INtersection in column 'name': {'MPACRLGPLAAALLLSLLLFGFTLVSGTGAEKTGVCPELQADQNCTQECVSDSECADNLKCCSAGCATFCSLPNDKEGSCPQVNINFPQLGLCRDQCQVDSQCPGQMKCCRNGCGKVSCVTPNF', 'MVSQALRLLCLLLGLQGCLAAGGVAKASGGETRDMPWKPGPHRVFVTQEEAHGVLHRRRRANAFLEELRPGSLERECKEEQCSFEEAREIFKDAERTKLFWISYSDGDQCASSPCQNGGSCKDQLQSYICFCLPAFEGRNCETHKDDQLICVNENGGCEQYCSDHTGTKRSCRCHEGYSLLADGVSCTPTVEYPCGKIPILEKRNASKPQGRIVGGKVCPKGECPWQVLLLVNGAQLCGGTLINTIWVVSAAHCFDKIKNWRNLIAVLGEHDLSEHDGDEQSRRVAQVIIPSTYVPGTTNHDIALLRLHQPVVLTDHVVPLCLPERTFSERTLAFVRFSLVSGWGQLLDRGATALELMVLNVPRLMTQDCLQQSRKVGDSPNITEYMFCAGYSDGSKDSCKGDSGGPHATHYRGTWYLTGIVSWGQGCATVGHFGVYTRVSQYIEWLQKLMRSEPRPGVLLRAPFP', 'MRALTLLALLALAALCIAGQAGAKPSGAESSKGAAFVSKQEGSEVVKRPRRYLYQWLGAPVPYPDPLEPRREVCELNPDCDELADHIGFQEAYRRFYGPV', 'MGIPMGKSMLVLLTFLAFASCCIAAYRPSETLCGGELVDTLQFVCGDRGFYFSRPASRVSRRSRGIVEECCFRSCDLALLETYCATPAKSERDVSTPPTVLPDNFPRYPVGKFFQYDTWKQSTQRLRRGLPALLRARRGHVLAKELEAFREAKRHRPLIALPTQDPAHGGAPPEMASNRK', 'MKLLHVFLLFLCFHLRFCKVTYTSQEDLVEKKCLAKKYTHLSCDKVFCQPWQRCIEGTCVCKLPYQCPKNGTAVCATNRRS

In [None]:
# Report how many sequences are held in common_seqs.
shared_count = len(common_seqs)
print(shared_count)

24


### Dropping rows with intersections

In [None]:
# Keep only the rows whose sequence is not one of the shared sequences.
is_shared = df['content'].isin(common_seqs)
filtered_df = df[~is_shared]
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 562 entries, 0 to 588
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        562 non-null    object
 1   class       562 non-null    object
 2   content     562 non-null    object
 3   conditions  562 non-null    object
dtypes: object(4)
memory usage: 22.0+ KB


## Saving files

In [None]:
# Write the filtered rows to CSV; the DataFrame index is not written.
filtered_df.to_csv(path_or_buf='protein_condition_nci_db.csv', index=False)

In [None]:
# One row per distinct sequence, without the 'conditions' column.
# drop() already returns a new frame, so filtered_df itself is not modified.
df_unique_prot = (
    filtered_df
    .drop(columns=['conditions'])
    .drop_duplicates(subset=['content'])
)
df_unique_prot.info()

<class 'pandas.core.frame.DataFrame'>
Index: 558 entries, 0 to 587
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     558 non-null    object
 1   class    558 non-null    object
 2   content  558 non-null    object
dtypes: object(3)
memory usage: 17.4+ KB


In [None]:
# Write the de-duplicated protein table to CSV; the index is not written.
df_unique_prot.to_csv(path_or_buf='unique_proteins_nci_db.csv', index=False)

In [None]:
# Distinct condition labels: drop the other three columns, then de-duplicate.
# drop() already returns a new frame, so filtered_df itself is not modified.
df_unique_conditions = (
    filtered_df
    .drop(columns=['name', 'class', 'content'])
    .drop_duplicates(subset=['conditions'])
)
df_unique_conditions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 85 entries, 0 to 588
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   conditions  85 non-null     object
dtypes: object(1)
memory usage: 1.3+ KB


In [None]:
# Write the distinct condition labels to CSV; the index is not written.
df_unique_conditions.to_csv(path_or_buf='unique_conditions_nci_db.csv', index=False)