# Project 4 - Data Analysis and Web Scraping #

In [1]:
import os
import csv
import pandas as pd
import time

def PRINT() -> None:
    """Print a horizontal rule of 80 dashes to visually separate output sections."""
    separator = '-' * 80
    print(separator)

In [3]:
# Load the ChEMBL export into a DataFrame; the fields are separated by ';', hence sep=';'
df = pd.read_csv("TESMP.csv", sep=';') 

In [4]:
# Preview only the first rows instead of dumping the whole frame into the notebook output
df.head(10)

Unnamed: 0,ChEMBL ID,Name,Synonyms,Type,Max Phase,Molecular Weight,Targets,Bioactivities,AlogP,Polar Surface Area,...,Heavy Atoms,HBA (Lipinski),HBD (Lipinski),#RO5 Violations (Lipinski),Molecular Weight (Monoisotopic),Np Likeness Score,Molecular Species,Molecular Formula,Smiles,Inchi Key
0,CHEMBL81990,,,Small molecule,,505.79,4,9,5.63,95.5,...,33,6,3,2,504.041,-0.78,ACID,C24H19Cl3N2O4,Cc1cccc(Cl)c1C(=O)N[C@@H](Cc1ccc(NC(=O)c2c(Cl)...,JTVWJXUGGJTGDA-IBGZPJMESA-N
1,CHEMBL3959435,,,Small molecule,,385.45,5,27,1.78,105.49,...,27,7,3,0,385.1096,-0.89,NEUTRAL,C19H19N3O4S,COc1ccc(S(=O)(=O)N2Cc3[nH]c4ccccc4c3CC2C(N)=O)cc1,KBVPHVCPUVDXOQ-UHFFFAOYSA-N
2,CHEMBL238779,,,Small molecule,,524.6,6,8,2.06,144.91,...,37,10,4,1,524.173,-1.24,ACID,C26H28N4O6S,O=C(NCc1ccccc1)NC[C@H](NC(=O)[C@@H]1CCCN1S(=O)...,WQVOUYNTLLCWOA-GOTSBHOMSA-N
3,CHEMBL429088,,,Small molecule,,474.54,6,8,0.91,144.91,...,33,10,4,0,474.1573,-1.33,ACID,C22H26N4O6S,O=C(NCc1ccccc1)NC[C@H](NC(=O)[C@@H]1CCCN1S(=O)...,ZXRLVUCOFJPEID-OALUTQOASA-N
4,CHEMBL3114860,,,Small molecule,,387.82,1,1,3.09,84.86,...,27,6,2,0,387.0874,1.06,ACID,C20H18ClNO5,O=C1N[C@H](C(=O)O)Cc2cccc(c2)OC/C=C/COc2ccc1c(...,KBMXMIHYKVQSHP-KROLTMCQSA-N
5,CHEMBL3115072,,,Small molecule,,377.83,1,1,2.52,93.45,...,26,7,2,0,377.1142,0.26,ACID,C18H20ClN3O4,O=C1N[C@H](C(=O)O)Cc2cn(cn2)CCCCCOc2ccc(Cl)c1c2,VOZWZDMFSQVZQP-INIZCTEOSA-N
6,CHEMBL447509,,,Small molecule,,810.95,9,34,4.11,250.81,...,59,16,9,3,810.4065,-0.7,ACID,C43H54N8O8,Cc1ccccc1NC(=O)Nc1ccc(CC(=O)N[C@@H](CCCCNC(=O)...,OOYFOUHICCOHRQ-CVLSMVSNSA-N
7,CHEMBL393213,,,Small molecule,,543.43,6,8,2.22,144.91,...,35,10,4,1,542.0794,-1.38,ACID,C22H24Cl2N4O6S,O=C(NCc1ccccc1)NC[C@H](NC(=O)[C@@H]1CCCN1S(=O)...,NYGQATJDYMZDLT-OALUTQOASA-N
8,CHEMBL401237,,,Small molecule,,489.55,6,8,0.49,170.93,...,34,11,6,2,489.1682,-1.29,ACID,C22H27N5O6S,Nc1ccc(CNC(=O)NC[C@H](NC(=O)[C@@H]2CCCN2S(=O)(...,QKMRLEQVMNWNFD-OALUTQOASA-N
9,CHEMBL3115050,,,Small molecule,,398.37,1,1,2.35,128.0,...,29,9,2,0,398.1114,0.81,ACID,C20H18N2O7,O=C1N[C@H](C(=O)O)Cc2cccc(c2)OC/C=C/COc2ccc([N...,GSBDJNSXYKUYNP-IMLGJKEDSA-N


In [5]:
# Re-save the frame using pandas' default ',' separator and without the index column
df.to_csv("TESMP_no_delim.csv", index=False)

## Import essential libraries ## 

In [2]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

In [7]:
# PATH is a list of *directories* joined by os.pathsep. Appending without the
# separator fuses the new text onto the previous entry, and an entry that points
# at the .exe file itself is never searched, so add the folder holding the binary.
chrome_dir = r'C:\Users\gavvi\ChromeDrivers\chrome-win64\chrome-win64'

# Only append once so that re-running the cell does not keep growing PATH
if chrome_dir not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] += os.pathsep + chrome_dir

## Performing Data Scraping Techniques to Extract the Dataset ##

### Web Scraping Setup ###

The first step is to open the Chrome browser, navigate to the official *chEMBL* database website, and enter `Integrins` in the *search* box in order to get the data we need.

In [8]:
# Start a Chrome browser session controlled by Selenium
driver = webdriver.Chrome()

# Use the correct URL in order to navigate to the chEMBL database website
url = 'https://www.ebi.ac.uk/chembl/'
driver.get(url)

# NOTE: implicitly_wait() does not pause here; it sets how long (up to 5 s)
# later find_element calls keep polling for an element before failing
driver.implicitly_wait(5)

The next step is to type `Integrins` in the search box and press enter in order to execute the search.

In [9]:
# Locate the search box on the landing page by its "name" attribute
search_input = driver.find_element(By.NAME, "search-str")

# Type the query into the search field
field = "Integrins"
search_input.send_keys(field)

# Press Enter to perform the search
search_input.send_keys(Keys.ENTER)

Next, we want to move to the `Targets` view. To do that, we click the `Targets` button, which takes us to the correct results table.

In [10]:
# Wait (up to 10 s) until the element with data-resource-key="TARGET" is clickable, then click it
target_btn = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, '[data-resource-key="TARGET"]'))
)
target_btn.click()

# NOTE(review): implicitly_wait() does not pause execution; it only sets the 5 s polling
# timeout for later find_element calls. Selenium's documentation advises against mixing
# implicit and explicit waits - confirm whether a real pause was intended here.
driver.implicitly_wait(5)

The final data filtering step is to keep only the following target types:

- PROTEIN COMPLEX
- PROTEIN COMPLEX GROUP
- PROTEIN-PROTEIN INTERACTION
- SELECTIVITY GROUP 

As before, we will achieve that by clicking the corresponding buttons, which filter the table down to the wanted data.

In [11]:
# Scroll down 1900 pixels in total (1500, then 400) so that the filter buttons
# become visible, pausing 5 seconds after each step
for scroll_offset in (1500, 400):
    driver.execute_script(f"window.scrollBy(0, {scroll_offset});")
    time.sleep(5)

In [12]:
def _click_when_present(css_selector, timeout):
    """Wait for an element matching `css_selector`, click it and return it.

    Uses the notebook-level `driver`. The wait ends as soon as a matching
    element is present in the DOM; if none appears within `timeout` seconds,
    WebDriverWait raises selenium's TimeoutException.
    """
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
    )
    element.click()
    return element


# The bucket selectors combine a class with an attribute filter, which is CSS
# selector syntax, so they are passed as By.CSS_SELECTOR (with the leading '.')
# instead of relying on By.CLASS_NAME accepting a compound selector.
filter_n1_btn = _click_when_present(
    '.front-bar[data-facet-group-key="target_type"][data-facet-key="PROTEIN COMPLEX"]', 20
)
filter_n2_btn = _click_when_present('.bucket[data-bucket-key="PROTEIN COMPLEX GROUP"]', 10)
filter_n3_btn = _click_when_present('.bucket[data-bucket-key="PROTEIN-PROTEIN INTERACTION"]', 10)
filter_n4_btn = _click_when_present('.bucket[data-bucket-key="SELECTIVITY GROUP"]', 10)


## Manually Downloading All of the Compounds' CSV Files ##

## Extracting the CSV Files from the Downloaded Zip Files ##

In [12]:
import os
import shutil

zip_folder_path = 'Integrins - common Compounds zips'
csv_folder_path = 'Integrins - common Compounds csv'

# The move below fails if the destination folder does not exist yet
os.makedirs(csv_folder_path, exist_ok=True)

# Get a list of all zip archives in the zip folder (the filter is '.zip', not '.csv')
zip_files = [file for file in os.listdir(zip_folder_path) if file.endswith('.zip')]

# Move each zip archive from the zip folder to the csv folder
# NOTE(review): the extraction cell that follows lists zip_folder_path again, so after
# this move it would find no archives there - confirm whether this cell is still wanted.
for zip_file in zip_files:
    source_path = os.path.join(zip_folder_path, zip_file)
    destination_path = os.path.join(csv_folder_path, zip_file)
    shutil.move(source_path, destination_path)


In [15]:
import zipfile

zip_folder_path = 'Integrins - common Compounds zips'
csv_folder_path = 'Integrins - common Compounds csv'

# Names of every zip archive found in the zip folder
zip_files = [entry for entry in os.listdir(zip_folder_path) if entry.endswith('.zip')]

# First pass: unpack every archive into the csv folder
for zip_file in zip_files:
    with zipfile.ZipFile(os.path.join(zip_folder_path, zip_file), 'r') as zip_ref:
        zip_ref.extractall(csv_folder_path)

# Second pass (only reached once every archive was unpacked): delete the archives
for zip_file in zip_files:
    os.remove(os.path.join(zip_folder_path, zip_file))


In [18]:
# os.listdir() returns names in arbitrary order; sort so the numbering is reproducible
csv_files = sorted(file for file in os.listdir(csv_folder_path) if file.endswith('.csv'))

# Rename each CSV file in the csv folder to the format "compounds_csv_#.csv"
for index, csv_file in enumerate(csv_files, start=1):
    original_path = os.path.join(csv_folder_path, csv_file)
    new_name = f'compounds_csv_{index}.csv'
    new_path = os.path.join(csv_folder_path, new_name)

    # Nothing to do when the file already carries its target name
    if csv_file == new_name:
        continue

    # os.rename silently replaces an existing file on POSIX systems; stop instead of
    # destroying data when the cell is re-run on an already renamed folder
    if os.path.exists(new_path):
        raise FileExistsError(f'Cannot rename {csv_file}: {new_name} already exists')

    # Rename the file
    os.rename(original_path, new_path)

    PRINT()
    print(f'Renamed: {csv_file} -> {new_name}')

PRINT()

--------------------------------------------------------------------------------
Renamed: DOWNLOAD-059I13K12TScZl0EFY7tLQJ1vkunYd41aj_7o-5Ynug=.csv -> compounds_csv_1.csv
--------------------------------------------------------------------------------
Renamed: DOWNLOAD-1SZuCqnFo4Hyl0_RW_b8V-VaNwjdAW39Vca6q4QlyoQ=.csv -> compounds_csv_2.csv
--------------------------------------------------------------------------------
Renamed: DOWNLOAD-7n9tnYbQOgZEP3RO5_Jmch3b9fufYeOF3wPu1Q-SA6E=.csv -> compounds_csv_3.csv
--------------------------------------------------------------------------------
Renamed: DOWNLOAD-830C2TxV6Ia1KTNOKQv9z1Tw_UzoEMK0NO6iF9fUSa8=.csv -> compounds_csv_4.csv
--------------------------------------------------------------------------------
Renamed: DOWNLOAD-8ZChRghLVfQ0VHRva01imRqr2Peu72M-koak6cRr7ho=.csv -> compounds_csv_5.csv
--------------------------------------------------------------------------------
Renamed: DOWNLOAD-9Rr-9NGqABONMw4vrot8FB5rEEVa2vVO7NVLQn9un14=.c

Next, we will inspect the first `csv` file (`compounds_csv_1.csv`) from the *Integrins - common Compounds csv* directory.

In [24]:
# Folder holding the renamed compound CSV files
csv_directory_path = 'Integrins - common Compounds csv'

# Full path of the first compound file
first_csv_path = os.path.join(csv_directory_path, 'compounds_csv_1.csv')

# Read it with pandas' default separator and preview the first five rows
df = pd.read_csv(first_csv_path)
df.head()

Unnamed: 0,"ChEMBL ID;""Name"";""Synonyms"";""Type"";""Max Phase"";""Molecular Weight"";""Targets"";""Bioactivities"";""AlogP"";""Polar Surface Area"";""HBA"";""HBD"";""#RO5 Violations"";""#Rotatable Bonds"";""Passes Ro3"";""QED Weighted"";""CX Acidic pKa"";""CX Basic pKa"";""CX LogP"";""CX LogD"";""Aromatic Rings"";""Structure Type"";""Inorganic Flag"";""Heavy Atoms"";""HBA (Lipinski)"";""HBD (Lipinski)"";""#RO5 Violations (Lipinski)"";""Molecular Weight (Monoisotopic)"";""Np Likeness Score"";""Molecular Species"";""Molecular Formula"";""Smiles"";""Inchi Key"""
0,"CHEMBL218200;"""";"""";""Small molecule"";"""";""488.50..."
1,"CHEMBL218963;"""";"""";""Small molecule"";"""";""473.53..."
2,"CHEMBL1836351;"""";"""";""Small molecule"";"""";""429.9..."
3,"CHEMBL218226;"""";"""";""Small molecule"";"""";""412.40..."
4,"CHEMBL1098726;"""";"""";""Small molecule"";"""";""536.3..."


From the data frame, it is clear that we need to specify the delimiter `;` in order to parse the table into separate, readable columns.

In [25]:
# Re-read the same file, this time splitting fields on ';' so each field gets its own column
df = pd.read_csv(first_csv_path, sep=';')

# Preview the first five rows
df.head(5)

Unnamed: 0,ChEMBL ID,Name,Synonyms,Type,Max Phase,Molecular Weight,Targets,Bioactivities,AlogP,Polar Surface Area,...,Heavy Atoms,HBA (Lipinski),HBD (Lipinski),#RO5 Violations (Lipinski),Molecular Weight (Monoisotopic),Np Likeness Score,Molecular Species,Molecular Formula,Smiles,Inchi Key
0,CHEMBL218200,,,Small molecule,,488.5,2,2,3.44,150.67,...,36,10,4,0,488.1696,-1.0,ZWITTERION,C26H24N4O6,O=C(O)CC(NC(=O)c1cc(C(=O)Nc2ccc3c(c2)CNCC3)cc(...,JCOVTLVBRHTAIH-UHFFFAOYSA-N
1,CHEMBL218963,,,Small molecule,,473.53,3,4,3.54,116.76,...,35,8,4,0,473.1951,-0.74,ZWITTERION,C27H27N3O5,COc1ccc(C(CC(=O)O)NC(=O)c2cccc(C(=O)Nc3ccc4c(c...,VHKHAQYPTPVIAP-UHFFFAOYSA-N
2,CHEMBL1836351,,,Small molecule,,429.9,3,4,2.4,98.74,...,29,7,3,0,393.1689,-0.6,ACID,C22H24ClN3O4,CC(CC(=O)O)N1Cc2ccc(NC(=O)c3ccc4c(c3)CNCC4)cc2...,PLPQUBBLGBDIKT-UHFFFAOYSA-N
3,CHEMBL218226,,,Small molecule,,412.4,2,2,1.7,150.67,...,30,10,4,0,412.1383,-1.15,ZWITTERION,C20H20N4O6,O=C(O)CCNC(=O)c1cc(C(=O)Nc2ccc3c(c2)CNCC3)cc([...,VGPKJRDGGNBIGH-UHFFFAOYSA-N
4,CHEMBL1098726,,,Small molecule,,536.38,21,56,4.4,117.84,...,37,9,1,1,535.0814,-1.06,ACID,C26H19Cl2N5O4,CN1C(=O)N(c2cc(Cl)cc(Cl)c2)C(=O)[C@]12CN(c1ccc...,LILGMDXLRPEBNH-HFZDXXHNSA-N


Of all the columns in this data frame, we want to keep only the `Smiles` column, so that we can later *join* it with the data frame from the previous project (i.e. project 3) and merge in the new data (i.e. target preferred names, UniProt IDs (1 & 2) and organism name).

## Reading and Dropping Data from CSV Files ##

The next steps include:

- Reading the csv file into data frames with specification of delimiter equal to ';'
- Dropping all of the columns except the column which contains the molecules SMILES in each and every data frame

In [3]:
# Folder with the full compound CSV files (input of the SMILES extraction)
source_directory_path = 'Integrins - common Compounds csv'
# Folder that receives the reduced, SMILES-only CSV files (output)
target_directory_path = "Integrins - common Compounds csv - SMILES only"

In [4]:
# Show the current working directory (pure Python, so it works in any shell/OS)
os.getcwd()

/cygdrive/c/Users/gavvi/Desktop/Programming/GitHub/DeepLearningResearchStarship/Project 3 Web Scraping and Data Analysis


In [6]:
def _numeric_suffix_key(file_name):
    """Sort key that puts 'compounds_csv_2.csv' before 'compounds_csv_10.csv'.

    Names whose stem ends in '_<digits>' are ordered by that number; any other
    name is placed after them, in alphabetical order.
    """
    stem = os.path.splitext(file_name)[0]
    suffix = stem.rsplit('_', 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), file_name)
    return (1, 0, file_name)


# Make sure the output folder exists before writing into it
os.makedirs(target_directory_path, exist_ok=True)

# Plain listing order is arbitrary/lexicographic ('..._10' before '..._2'), which would
# give compounds_SMILES_<n> a different number than its source file; sort numerically.
source_csv_files = sorted(
    (name for name in os.listdir(source_directory_path) if name.endswith('.csv')),
    key=_numeric_suffix_key,
)

# The loop variable is `csv_name` (not `csv`) so the imported `csv` module is not shadowed
for indx, csv_name in enumerate(source_csv_files, start=1):

    # Read only the 'Smiles' column, with the correct delimiter (i.e. ';').
    # pandas raises a ValueError if the file has no 'Smiles' column.
    curr_df = pd.read_csv(os.path.join(source_directory_path, csv_name), sep=';', usecols=['Smiles'])

    # Save the reduced csv file to the target directory
    curr_df.to_csv(os.path.join(target_directory_path, f'compounds_SMILES_{indx}.csv'), index=False)
    

## Extract the Main Integrins Table and Merge with Previous CSVs ##

The corresponding step involves extracting the main Integrins table, which includes data such as the name (i.e., the target preferred name that we want to extract), UniProts 1 and 2 (which can be multiple), organism, and merging that with the CSV files.

The merging step: Each row in the main data frame corresponds to a CSV file containing *SMILES* (i.e. simplified molecular-input line-entry system) values for that row. This implies that each row in the main data frame will appear a number of times equal to the number of rows in its corresponding CSV file. The only data that will change is the unique SMILES value for that current row.

### Open chEMBL Database Site ###

First, we need to reopen the *chEMBL* database website and navigate to the correct filtered *Integrins* table, as in the steps before. To achieve that, we can simply rerun the cells above, which use the *selenium* scraping library to navigate to the correct page and apply all of the wanted filters.

### Extract the Main Data Frame Table ###

We can easily extract the data frame by pressing on the *csv* button and then on the *here* button in order to download the csv file.

Note that we don't need to perform any data scraping steps here, because downloading the file manually is much easier and takes only a few simple steps.

### Merge Step ###

Finally, we can merge the main data frame with all of the other data frames which contains the *SMILES* of the molecules as mentioned before. 

In [55]:
# CSV file holding the main Integrins targets table
main_csv = "Integrins_main_data_frame.csv"
# Folder holding the SMILES-only compound CSV files
compounds_SMILES_dir = "Integrins - common Compounds csv - SMILES only"

In [56]:
# Load the main targets table; fields are split on ';'
df = pd.read_csv(main_csv, sep=';')

# Preview the first ten rows
df.head(10)

Unnamed: 0,ChEMBL ID,Name,UniProt Accessions,Type,Organism,Compounds,Activities,Tax ID,Species Group Flag
0,CHEMBL3430893,Integrin alpha-V/alpha-5,P06756|P08648,PROTEIN COMPLEX,Homo sapiens,6,6,9606,False
1,CHEMBL3137268,Integrin alpha2/beta1,P05556|P17301,PROTEIN COMPLEX,Homo sapiens,57,62,9606,False
2,CHEMBL3137286,EZH2/SUZ12/EED complex,Q15910|O75530|Q15022,PROTEIN COMPLEX,Homo sapiens,10,16,9606,False
3,CHEMBL2111481,Integrin alpha-4/beta-7,Q00651|P26011,PROTEIN COMPLEX,Mus musculus,22,55,10090,False
4,CHEMBL4106121,MAC1-CD40L,P05107|P11215|P29965,PROTEIN-PROTEIN INTERACTION,Homo sapiens,6,6,9606,False
5,CHEMBL3137278,Integrin alpha1/beta1 complex,P05556|P56199,PROTEIN COMPLEX,Homo sapiens,6,6,9606,False
6,CHEMBL3883284,Integrin alpha-3/beta-3,P05106|P26006,PROTEIN COMPLEX,Homo sapiens,25,47,9606,False
7,CHEMBL2111443,Integrin alpha-V/beta-3 and alpha-IIb/beta 3,P05106|P06756|P08514,SELECTIVITY GROUP,Homo sapiens,49,49,9606,False
8,CHEMBL4748218,VHL/Polycomb protein EED,O75530|P40337,PROTEIN-PROTEIN INTERACTION,Homo sapiens,2,6,9606,False
9,CHEMBL3301388,EZH2/SUZ12/EED/RBBP7/RBBP4,Q15910|O75530|Q15022|Q09028|Q16576,PROTEIN COMPLEX,Homo sapiens,7,11,9606,False


We have observed an issue regarding the sequence of lines in the main data frame. The arrangement of the rows does not align with the order on the chEMBL site. This misalignment may stem from a hiccup during the website download step, causing a reshuffling of the rows.

As a result, we must rearrange the rows to match the sequence on the chEMBL website. This adjustment is crucial because all SMILES CSV files are organized in correlation with the rows in the main data frame. To ensure a seamless merge, it is imperative to reorder the rows in the accurate sequence.

#### Reordering the Rows of the Main Data Frame ####

In [57]:
# ChEMBL target IDs, listed in the row order the main data frame must follow
cmEMBL_right_order = """
    CHEMBL2111481 CHEMBL2095184 CHEMBL3430894 CHEMBL3430895 CHEMBL3137278
    CHEMBL2095226 CHEMBL1907599 CHEMBL3885597 CHEMBL3137268 CHEMBL3883284
    CHEMBL2093869 CHEMBL3885596 CHEMBL2111461 CHEMBL2096675 CHEMBL3430893
    CHEMBL2111416 CHEMBL2111425 CHEMBL4106150 CHEMBL2111362 CHEMBL2364172
    CHEMBL2111443 CHEMBL3430891 CHEMBL2111407 CHEMBL1907598 CHEMBL2096661
    CHEMBL4106121 CHEMBL3885595 CHEMBL4523628 CHEMBL3430892 CHEMBL4296069
    CHEMBL3883325 CHEMBL4748218 CHEMBL3137286 CHEMBL3137287 CHEMBL3301388
""".split()

In [58]:
# Verify we got all of the 35 values, each listed exactly once.
# The assertions make the cell fail loudly instead of relying on reading the printed number.
assert len(cmEMBL_right_order) == 35, f'expected 35 chEMBL ids, got {len(cmEMBL_right_order)}'
assert len(set(cmEMBL_right_order)) == len(cmEMBL_right_order), 'duplicate chEMBL ids in the order list'

PRINT()
print(f'The number of unique chEMBL ids are 35, and we got -> {len(cmEMBL_right_order)}')
PRINT()

--------------------------------------------------------------------------------
The number of unique chEMBL ids are 35, and we got -> 35
--------------------------------------------------------------------------------


In [59]:
# Position of every ChEMBL ID inside the wanted ordering
rank_by_id = dict(zip(cmEMBL_right_order, range(len(cmEMBL_right_order))))

# Keep only the listed targets, sort them by that position, then renumber the rows from 0
is_listed = df['ChEMBL ID'].isin(cmEMBL_right_order)
df = (
    df.loc[is_listed]
    .sort_values(by=['ChEMBL ID'], key=lambda ids: ids.map(rank_by_id))
    .reset_index(drop=True)
)

In [60]:
# Preview the first rows of the reordered frame instead of dumping all of it
df.head(10)

Unnamed: 0,ChEMBL ID,Name,UniProt Accessions,Type,Organism,Compounds,Activities,Tax ID,Species Group Flag
0,CHEMBL2111481,Integrin alpha-4/beta-7,Q00651|P26011,PROTEIN COMPLEX,Mus musculus,22,55,10090,False
1,CHEMBL2095184,Integrin alpha-4/beta-7,P13612|P26010,PROTEIN COMPLEX,Homo sapiens,522,610,9606,False
2,CHEMBL3430894,Integrin alpha-IIb/beta-3,Q9QUM0|O54890,PROTEIN COMPLEX,Mus musculus,4,4,10090,False
3,CHEMBL3430895,Integrin alpha-10/Integrin beta-1,P05556|O75578,PROTEIN COMPLEX,Homo sapiens,1,1,9606,False
4,CHEMBL3137278,Integrin alpha1/beta1 complex,P05556|P56199,PROTEIN COMPLEX,Homo sapiens,6,6,9606,False
5,CHEMBL2095226,Integrin alpha-5/beta-1,P05556|P08648,PROTEIN COMPLEX,Homo sapiens,463,685,9606,False
6,CHEMBL1907599,Integrin alpha-4/beta-1,P05556|P13612,PROTEIN COMPLEX,Homo sapiens,1452,2269,9606,False
7,CHEMBL3885597,ITGB1-ITGA9 complex,P05556|Q13797,PROTEIN COMPLEX,Homo sapiens,10,10,9606,False
8,CHEMBL3137268,Integrin alpha2/beta1,P05556|P17301,PROTEIN COMPLEX,Homo sapiens,57,62,9606,False
9,CHEMBL3883284,Integrin alpha-3/beta-3,P05106|P26006,PROTEIN COMPLEX,Homo sapiens,25,47,9606,False


#### Continue the Merge Step ####

In [61]:
# Columns needed for the merge; every other column is discarded
columns_to_keep = ['Name', 'UniProt Accessions', 'Organism','Compounds']
df = df.drop(columns=df.columns.difference(columns_to_keep))

df.head()

Unnamed: 0,Name,UniProt Accessions,Organism,Compounds
0,Integrin alpha-4/beta-7,Q00651|P26011,Mus musculus,22
1,Integrin alpha-4/beta-7,P13612|P26010,Homo sapiens,522
2,Integrin alpha-IIb/beta-3,Q9QUM0|O54890,Mus musculus,4
3,Integrin alpha-10/Integrin beta-1,P05556|O75578,Homo sapiens,1
4,Integrin alpha1/beta1 complex,P05556|P56199,Homo sapiens,6


In [62]:
# Break the '|'-separated accession string into one column per accession
uniprot_columns = df['UniProt Accessions'].str.split('|', expand=True)

# Label the new columns UniProt1, UniProt2, ... by their position
uniprot_columns.columns = [
    f'UniProt{position}' for position, _ in enumerate(uniprot_columns.columns, start=1)
]

# Remove every existing column whose name contains 'UniProt', then append the split columns
columns_with_uniprot = [label for label in df.columns if 'UniProt' in str(label)]
df = pd.concat([df.drop(columns=columns_with_uniprot), uniprot_columns], axis=1)

df.head(10)

Unnamed: 0,Name,Organism,Compounds,UniProt1,UniProt2,UniProt3,UniProt4,UniProt5
0,Integrin alpha-4/beta-7,Mus musculus,22,Q00651,P26011,,,
1,Integrin alpha-4/beta-7,Homo sapiens,522,P13612,P26010,,,
2,Integrin alpha-IIb/beta-3,Mus musculus,4,Q9QUM0,O54890,,,
3,Integrin alpha-10/Integrin beta-1,Homo sapiens,1,P05556,O75578,,,
4,Integrin alpha1/beta1 complex,Homo sapiens,6,P05556,P56199,,,
5,Integrin alpha-5/beta-1,Homo sapiens,463,P05556,P08648,,,
6,Integrin alpha-4/beta-1,Homo sapiens,1452,P05556,P13612,,,
7,ITGB1-ITGA9 complex,Homo sapiens,10,P05556,Q13797,,,
8,Integrin alpha2/beta1,Homo sapiens,57,P05556,P17301,,,
9,Integrin alpha-3/beta-3,Homo sapiens,25,P05106,P26006,,,


In [64]:
# Show the last five rows of the frame
df.tail(5)

Unnamed: 0,Name,Organism,Compounds,UniProt1,UniProt2,UniProt3,UniProt4,UniProt5
30,Integrin alpha-IIb/beta-3,Oryctolagus cuniculus,34,Q9TUN4,Q9TUN3,,,
31,VHL/Polycomb protein EED,Homo sapiens,2,O75530,P40337,,,
32,EZH2/SUZ12/EED complex,Homo sapiens,10,Q15910,O75530,Q15022,,
33,EZH1/SUZ12/EED/AEBP2/RBBP4 complex,Homo sapiens,29,Q92800,O75530,Q15022,Q6ZN18,Q09028
34,EZH2/SUZ12/EED/RBBP7/RBBP4,Homo sapiens,7,Q15910,O75530,Q15022,Q09028,Q16576
