<a href="https://colab.research.google.com/github/JiHoonPark96/practice/blob/main/PPI_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install TDC library for accessing bioinformatics datasets
!pip install PyTDC

# Install TensorFlow for building and training the deep learning model
!pip install tensorflow

# Install Scikit-learn for evaluation metrics and other utilities
!pip install scikit-learn

# Install Matplotlib for plotting and visualizing the results
!pip install matplotlib


Collecting PyTDC
  Downloading pytdc-1.1.1.tar.gz (146 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/146.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.8/146.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate==0.33.0 (from PyTDC)
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting biopython<2.0,>=1.78 (from PyTDC)
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting dataclasses<1.0,>=0.6 (from PyTDC)
  Downloading dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting datasets==2.20.0 (from PyTDC)
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate==0.4.2 (from PyTDC)
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting fuzzywuzzy<1.0,>=0.18.0 (from PyTDC)
  Downloading fuzz



In [2]:
# Import the necessary libraries
from tdc.multi_pred import PPI

# Load the HuRI protein-protein interaction dataset through the TDC library
data = PPI(name='HuRI')

# Generate negative samples; frac=1 requests as many negatives as there are positives
data = data.neg_sample(frac=1)

# Split the dataset ONCE and index into the result. Calling get_split() separately
# for each subset recomputes the split every time and only yields matching
# train/valid/test frames if every call happens to partition identically.
splits = data.get_split()
train_data = splits['train']
valid_data = splits['valid']
test_data = splits['test']

# Check the distribution of negative (0) and positive (1) samples in the training set
print(train_data['Y'].value_counts())


Downloading...
100%|██████████| 139M/139M [00:07<00:00, 18.4MiB/s]
Loading...
Done!


Y
0    36758
1    36558
Name: count, dtype: int64


In [1]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Character-level tokenizer: each amino-acid letter is treated as its own token.
# The vocabulary is fitted on the training proteins only (both partners of every pair).
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_data['Protein1'].tolist() + train_data['Protein2'].tolist())

# Fixed per-protein length after padding/truncation (arbitrary; adjust to the dataset).
# NOTE(review): pad_sequences is called without `truncating`, so over-long proteins
# are cut with the library default while padding is applied at the end — confirm
# which end of the sequence is meant to be kept.
max_length = 100


def _encode(sequences):
    """Convert a column of protein strings into a padded integer array of width ``max_length``."""
    token_ids = tokenizer.texts_to_sequences(sequences.tolist())
    return pad_sequences(token_ids, maxlen=max_length, padding='post')


# Encode both interaction partners for every split
X1_train, X2_train = _encode(train_data['Protein1']), _encode(train_data['Protein2'])
X1_valid, X2_valid = _encode(valid_data['Protein1']), _encode(valid_data['Protein2'])
X1_test, X2_test = _encode(test_data['Protein1']), _encode(test_data['Protein2'])

# Place the two encoded proteins side by side so each row is one model input
# of width 2 * max_length
X_train = np.concatenate((X1_train, X2_train), axis=1)
X_valid = np.concatenate((X1_valid, X2_valid), axis=1)
X_test = np.concatenate((X1_test, X2_test), axis=1)

# Target labels (interaction: 1, no interaction: 0)
y_train, y_valid, y_test = train_data['Y'], valid_data['Y'], test_data['Y']


NameError: name 'train_data' is not defined

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

# Size of the embedding table: every token the tokenizer learned, plus one slot
# because the token indices start at 1 and index 0 is what the padding uses.
vocab_size = len(tokenizer.word_index) + 1

# Build the neural network model.
# The input width is declared with an explicit Input layer instead of the
# deprecated `Embedding(input_length=...)` argument; this keeps the shape static
# so Flatten/Dense can be built on both older and current Keras releases.
model = Sequential([
    Input(shape=(2 * max_length,)),                    # Two padded proteins concatenated
    Embedding(input_dim=vocab_size, output_dim=128),   # Learned 128-d vector per token
    Flatten(),                                         # Flatten the output of the embedding layer
    Dense(128, activation='relu'),                     # Hidden layer
    Dense(64, activation='relu'),                      # Hidden layer
    Dense(1, activation='sigmoid'),                    # Binary output: interaction or no interaction
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on the training data, tracking the validation split after each epoch
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10, batch_size=32)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Score the test pairs with the trained model and threshold the sigmoid outputs
# at 0.5 to obtain hard 0/1 predictions
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Evaluate the hard predictions against the held-out labels
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the performance metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


In [None]:
import matplotlib.pyplot as plt

# Draw one figure per tracked quantity (accuracy first, then loss), each showing
# the training curve and the validation curve over the epochs.
curves = history.history
for key, ylabel in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(curves[key], label=f'Train {ylabel}')
    plt.plot(curves[f'val_{key}'], label=f'Validation {ylabel}')
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()
