**HuRI database conversion**

Convert the protein identifiers in the HuRI interaction list from Ensembl IDs to UniProt accessions.

In [None]:
import pandas as pd

# Step 1: read the two raw inputs.
#   * HuRI interaction list: headerless, tab-separated, two ID columns per row.
#   * ID-mapping workbook: the 'From' and 'Entry' columns are used to build the lookup.

# HuRI interaction pairs (no header row in the file, so column names are supplied here)
tsv_file = '/content/drive/MyDrive/Colab_Notebooks/GNN/GraphsClassification - PPIGraphNet/data/raw_data/H-I-05.tsv'
tsv_df = pd.read_csv(
    tsv_file,
    sep='\t',
    header=None,
    names=['Protein1', 'Protein2'],
)

# ID-mapping workbook
excel_file = '/content/drive/MyDrive/Colab_Notebooks/GNN/GraphsClassification - PPIGraphNet/data/raw_data/idmapping_reviewed_true_2024_06_23.xlsx'
excel_df = pd.read_excel(excel_file, sheet_name='Sheet0')  # change sheet_name if the workbook differs

# Lookup table 'From' -> 'Entry'.
# NOTE: a dict holds one value per key, so a 'From' ID that appears on several
# rows ends up with a single 'Entry' in result_dict.
from_to_entry = excel_df.set_index('From')['Entry']
result_dict = from_to_entry.to_dict()

# Quick look at what was loaded
print(tsv_df.head(), tsv_df.shape)
print(excel_df.head(), excel_df.shape)

# Preview the first few lookup entries
num_entries_to_print = 5
print(f"First {num_entries_to_print} entries of the dictionary:")
count = 0
for count, (key, value) in enumerate(result_dict.items(), start=1):
    print(f"{key}: {value}")
    if count >= num_entries_to_print:
        break

          Protein1         Protein2
0  ENSG00000001167  ENSG00000062725
1  ENSG00000002330  ENSG00000175793
2  ENSG00000002330  ENSG00000182944
3  ENSG00000002586  ENSG00000135018
4  ENSG00000002822  ENSG00000002822 (2709, 2)
              From   Entry   Entry Name               Gene Names
0  ENSG00000105793  A4D1E9  GTPBA_HUMAN  GTPBP10 OBGH2 UG0751c10
1  ENSG00000125457  A9UHW6  MI4GD_HUMAN             MIF4GD SLIP1
2  ENSG00000188763  O00144   FZD9_HUMAN                FZD9 FZD3
3  ENSG00000123136  O00148  DX39A_HUMAN             DDX39A DDX39
4  ENSG00000107438  O00151  PDLI1_HUMAN       PDLIM1 CLIM1 CLP36 (1512, 4)
First 5 entries of the dictionary:
ENSG00000105793: A4D1E9
ENSG00000125457: A9UHW6
ENSG00000188763: O00144
ENSG00000123136: O00148
ENSG00000107438: O00151


In [None]:
# Translate both interaction partners through the result_dict lookup.
# IDs that are not keys of result_dict come back as NaN from Series.map.
for ensembl_col, uniprot_col in (
    ('Protein1', 'protein1_uniprot'),
    ('Protein2', 'protein2_uniprot'),
):
    tsv_df[uniprot_col] = tsv_df[ensembl_col].map(result_dict)

# Every row of this table is labelled as a positive interaction
tsv_df['interaction_type'] = 1

print(tsv_df.head(), tsv_df.shape)

# Report how many IDs could not be mapped, per column ...
nan_count_per_column = tsv_df.isna().sum()

print("\nNaN Count per Column:")
print(nan_count_per_column)

# ... and over the whole DataFrame (sum of the per-column counts)
total_nan_count = nan_count_per_column.sum()
print(f"\nTotal NaN Count: {total_nan_count}")

          Protein1         Protein2 protein1_uniprot protein2_uniprot  \
0  ENSG00000001167  ENSG00000062725           P23511           Q92624   
1  ENSG00000002330  ENSG00000175793           Q92934           P31947   
2  ENSG00000002330  ENSG00000182944           Q92934           Q01844   
3  ENSG00000002586  ENSG00000135018           P14209           Q9UMX0   
4  ENSG00000002822  ENSG00000002822           Q9Y6D9           Q9Y6D9   

   interaction_type  
0                 1  
1                 1  
2                 1  
3                 1  
4                 1   (2709, 5)

NaN Count per Column:
Protein1             0
Protein2             0
protein1_uniprot    10
protein2_uniprot    31
interaction_type     0
dtype: int64

Total NaN Count: 41


In [None]:
# Keep only the rows without any NaN, i.e. pairs where both partners were mapped.
# tsv_df itself is left untouched; the filtered table is bound to tsv_df1.
tsv_df1 = tsv_df.dropna(axis=0, how='any')
print(tsv_df1.head(), tsv_df1.shape)

# Re-run the NaN report on the filtered table: per column ...
nan_count_per_column = tsv_df1.isna().sum()

print("\nNaN Count per Column:")
print(nan_count_per_column)

# ... and in total (sum of the per-column counts)
total_nan_count = nan_count_per_column.sum()
print(f"\nTotal NaN Count: {total_nan_count}")

          Protein1         Protein2 protein1_uniprot protein2_uniprot  \
0  ENSG00000001167  ENSG00000062725           P23511           Q92624   
1  ENSG00000002330  ENSG00000175793           Q92934           P31947   
2  ENSG00000002330  ENSG00000182944           Q92934           Q01844   
3  ENSG00000002586  ENSG00000135018           P14209           Q9UMX0   
4  ENSG00000002822  ENSG00000002822           Q9Y6D9           Q9Y6D9   

   interaction_type  
0                 1  
1                 1  
2                 1  
3                 1  
4                 1   (2670, 5)

NaN Count per Column:
Protein1            0
Protein2            0
protein1_uniprot    0
protein2_uniprot    0
interaction_type    0
dtype: int64

Total NaN Count: 0


In [None]:
# Keep only the UniProt pair and its label, then persist the table (without the index) as CSV.
tsv_df1 = tsv_df1.loc[:, ['protein1_uniprot', 'protein2_uniprot', 'interaction_type']]
print(tsv_df1.head(), tsv_df1.shape)

huri_csv_path = '/content/drive/MyDrive/Colab_Notebooks/GNN/GraphsClassification - PPIGraphNet/data/processed_csv/HuRI.csv'
tsv_df1.to_csv(huri_csv_path, index=False)

  protein1_uniprot protein2_uniprot  interaction_type
0           P23511           Q92624                 1
1           Q92934           P31947                 1
2           Q92934           Q01844                 1
3           P14209           Q9UMX0                 1
4           Q9Y6D9           Q9Y6D9                 1 (2670, 3)


**Negatome data**

The Negatome data already uses UniProt accessions, so no ID conversion is needed.

In [None]:
# Read the tab-separated Negatome file, rename its two protein columns to the
# names used by the HuRI table, label every row as a negative interaction (0),
# and keep only the three shared columns.
# (An earlier variant of this cell also kept a 'PDB_Code' column; it is not kept here.)
negatome_file = '/content/drive/MyDrive/Colab_Notebooks/GNN/GraphsClassification - PPIGraphNet/data/raw_data/pdb_stringent.txt'
negatome_data = (
    pd.read_csv(negatome_file, sep='\t')
    .rename(columns={'#ProteinA': 'protein1_uniprot', 'ProteinB': 'protein2_uniprot'})
    .assign(interaction_type=0)
    .loc[:, ['protein1_uniprot', 'protein2_uniprot', 'interaction_type']]
)

# Preview the result
print("Negatome Data:")
print(negatome_data.head(), negatome_data.shape)

# Persist the table (without the index) as CSV
negatome_csv_path = '/content/drive/MyDrive/Colab_Notebooks/GNN/GraphsClassification - PPIGraphNet/data/processed_csv/Negatome.csv'
negatome_data.to_csv(negatome_csv_path, index=False)

Negatome Data:
  protein1_uniprot protein2_uniprot  interaction_type
0           A0A5B9           P01887                 0
1           A0A5B9           P61769                 0
2           A0N6Y3           P00127                 0
3           A0N6Y3           P00128                 0
4           A0N6Y3           P00163                 0 (4161, 3)


**Combine the two files**

In [6]:
import pandas as pd

# Inputs: the two processed CSVs; output: the merged interaction table.
file1_path = '/content/drive/MyDrive/Colab_Notebooks/GNN/GraphsClassification - PPIGraphNet/data/processed_csv/Negatome.csv'
file2_path = '/content/drive/MyDrive/Colab_Notebooks/GNN/GraphsClassification - PPIGraphNet/data/processed_csv/HuRI.csv'
combined_csv_path = '/content/drive/MyDrive/Colab_Notebooks/GNN/GraphsClassification - PPIGraphNet/data/processed_csv/interaction_db.csv'

# Load both tables (df1 = Negatome, df2 = HuRI)
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1_path, file2_path))

# Stack the rows of df1 on top of df2 and renumber the index from 0
combined_df = pd.concat([df1, df2], axis=0, ignore_index=True)

# Show the combined table (pandas abbreviates long frames when printing)
print(combined_df)

# Write the combined table (without the index) to a new CSV file
combined_df.to_csv(combined_csv_path, index=False)

# Show the first few rows of the combined table
print(combined_df.head())

     protein1_uniprot protein2_uniprot  interaction_type
0              A0A5B9           P01887                 0
1              A0A5B9           P61769                 0
2              A0N6Y3           P00127                 0
3              A0N6Y3           P00128                 0
4              A0N6Y3           P00163                 0
...               ...              ...               ...
6826           Q03154           Q03154                 1
6827           O75340           O75340                 1
6828           O95073           O95073                 1
6829           O95073           Q96JP2                 1
6830           O95073           Q5U045                 1

[6831 rows x 3 columns]
  protein1_uniprot protein2_uniprot  interaction_type
0           A0A5B9           P01887                 0
1           A0A5B9           P61769                 0
2           A0N6Y3           P00127                 0
3           A0N6Y3           P00128                 0
4           A0N6Y3   