## This notebook serves two trained models through a Gradio web interface: the best deep learning model found by the Grid Search CV and the best Gradient Boosting model found by the Randomized Search CV.

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored under MyDrive can be read by path from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Load the two previously trained models from Google Drive.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load model files that come from a trusted source.
from joblib import load

# Deep learning model; called with a [sequence, tissue] input pair at prediction time.
DL_model = load("/content/drive/MyDrive/ML_FinalProject/deep_learning_model.pkl")

# Gradient boosting model; called with a single flattened feature vector at prediction time.
xgb_model = load("/content/drive/MyDrive/ML_FinalProject/best_xgb_model.pkl")



In [None]:
# Install Gradio into the current runtime; --quiet suppresses pip's progress output.
!pip install gradio --quiet

In [22]:
import gradio as gr
import numpy as np

# Define a mapping for one-hot encoding nucleotides (using same mapping scheme as before)
# Each nucleotide maps to a 4-channel one-hot row in the fixed order A, T, G, C.
NUCLEOTIDE_MAP = {
    "A": [1, 0, 0, 0],
    "T": [0, 1, 0, 0],
    "G": [0, 0, 1, 0],
    "C": [0, 0, 0, 1],
}

def one_hot_encode_sequence(seq, target_length=1000):
    """
    One-hot encode a DNA sequence into a fixed-size array.

    - seq: String containing the nucleotide sequence (e.g., "AATTCG").
      An empty string (or None) yields an all-zero array.
    - target_length: Number of positions in the output (default: 1000).
      Shorter sequences are zero-padded at the end; longer sequences are
      truncated to their first ``target_length`` characters.

    Characters that are not keys of NUCLEOTIDE_MAP (e.g. "N", or lowercase
    letters) are encoded as an all-zero row.

    Returns:
        - A float64 NumPy array of shape (target_length, 4).
    """
    # Start from the fully padded result and fill in the known positions.
    # This keeps the shape and dtype identical for empty, short, exact-length
    # and over-long inputs (stacking an empty encoding onto the padding used
    # to fail because the empty array was 1-D).
    one_hot_encoded = np.zeros((target_length, 4))
    if not seq:
        return one_hot_encoded

    for position, nucleotide in enumerate(seq[:target_length]):
        channels = NUCLEOTIDE_MAP.get(nucleotide)
        if channels is not None:
            one_hot_encoded[position] = channels

    return one_hot_encoded

# One-hot encode tissue input
def one_hot_encode_tissue(tissue, num_classes=4):
    """
    One-hot encode a tissue identifier.

    - tissue: Tissue ID as an integer or integer-like string (e.g., "2").
    - num_classes: Number of tissue classes (default: 4); valid IDs are
      0 .. num_classes - 1.

    Returns:
        - A NumPy array of shape (1, num_classes) with a single 1 at the
          tissue's index.

    Raises:
        - ValueError: if ``tissue`` cannot be parsed as an integer, or if the
          parsed ID is negative or not smaller than ``num_classes``.
    """
    tissue_id = int(tissue)
    # Reject negative IDs explicitly: NumPy would otherwise interpret them as
    # indices counted from the end and silently select the wrong class.
    if not 0 <= tissue_id < num_classes:
        raise ValueError(f"Tissue ID {tissue_id} is out of range for {num_classes} classes.")
    one_hot_encoded = np.zeros((1, num_classes))
    one_hot_encoded[0, tissue_id] = 1
    return one_hot_encoded

def preprocess_inputs(seq, tissue):
    """
    Turn the raw user inputs into the arrays shared by both models.

    - seq: Sequence input as a regular string.
    - tissue: Tissue input as an integer string.
    Returns:
        - A tuple (seq_array, tissue_array): the one-hot encoded sequence with
          a leading batch axis of size 1, and the one-hot encoded tissue ID.
    """
    encoded_sequence = one_hot_encode_sequence(seq)
    # Add a leading batch dimension: (length, 4) -> (1, length, 4).
    batched_sequence = encoded_sequence.reshape(1, -1, 4)
    encoded_tissue = one_hot_encode_tissue(tissue)
    return batched_sequence, encoded_tissue

# Prediction functions
def predict_with_deep_model(seq, tissue):
    """
    Run the deep learning model on one sequence/tissue pair.

    - seq: Sequence input as a regular string.
    - tissue: Tissue input as an integer string.
    Returns:
        - The first value of the first prediction row, as a Python float.
    """
    # The model receives its two inputs as a list: [sequence batch, tissue batch].
    model_inputs = list(preprocess_inputs(seq, tissue))
    raw_output = DL_model.predict(model_inputs)
    first_row = raw_output[0]
    return float(first_row[0])

def predict_with_xgb(seq, tissue):
    """
    Predict using the XGB model.

    - seq: Sequence input as a regular string.
    - tissue: Tissue input as an integer string.
    Returns:
        - The prediction for the single input row. NumPy scalars are converted
          to the equivalent native Python number so the value serializes
          cleanly in the JSON output, in line with the deep learning
          prediction function, which also returns a native float.
    """
    seq_array, tissue_array = preprocess_inputs(seq, tissue)
    # Flatten both encodings and join them into one feature vector.
    combined_features = np.hstack([seq_array.flatten(), tissue_array.flatten()])
    # Pass an explicit single-row 2-D batch (1, n_features) to the model.
    prediction = xgb_model.predict(combined_features.reshape(1, -1))

    first_prediction = prediction[0]
    if isinstance(first_prediction, np.generic):
        # NumPy scalar -> native Python scalar of the same kind.
        return first_prediction.item()
    return first_prediction

# Gradio interface
def gradio_interface(seq, tissue):
    """
    Run both models on the same inputs and collect their predictions.

    - seq: Regular DNA sequence string.
    - tissue: Integer tissue identifier.
    Returns:
        - A dict mapping a display label for each model to its prediction.
    """
    results = {}
    results["Deep Learning Model Prediction"] = predict_with_deep_model(seq, tissue)
    results["XGBoost Model Prediction"] = predict_with_xgb(seq, tissue)
    return results

# Define Gradio inputs.
# Both inputs are free-text boxes, so the tissue ID reaches gradio_interface
# as a string and is converted to an integer during preprocessing.
seq_input = gr.Textbox(label="Sequence Input (Regular DNA sequence)")
tissue_input = gr.Textbox(label="Tissue Input (Integer 0-3)")

# Define Gradio output: the dict returned by gradio_interface is shown as JSON.
output = gr.JSON(label="Model Predictions")

# Build the interface around gradio_interface and launch the app.
# The inputs list is passed positionally to gradio_interface as (seq, tissue).
gr.Interface(
    fn=gradio_interface,
    inputs=[seq_input, tissue_input],
    outputs=output,
    title="Gene Expression Prediction",
    description="Enter a DNA sequence and a tissue identifier to get predictions from both the Deep Learning model and the Gradient Boosting model."
).launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://54212779f63c139cda.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7864 <> https://54212779f63c139cda.gradio.live


