In [5]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
from joblib import parallel_backend

# Load the dataset into a DataFrame
df = pd.read_csv('ML-PROJECT.csv')

# Report missing values per column and basic summary statistics.
# Note: isna() only counts NaN/None; infinite values are NOT detected here.
print(df.isna().sum())
print(df.describe())

# Work on a reproducible 10% sample to keep training fast while iterating
df_sample = df.sample(frac=0.1, random_state=42)

# Column the model is trained to predict
target_column = 'sst8'

# Every remaining column is used as a feature.
# NOTE(review): this includes the identifiers (pdb_id, chain_code), the raw
# sequence and sst3, all of which end up one-hot encoded. If sst3 is derived
# from sst8 this is target leakage -- confirm before trusting the model.
X = df_sample.drop(target_column, axis=1)
y = df_sample[target_column]

# Split the feature columns by dtype so each group gets its own preprocessing
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the column mean, then standardise.
# add_indicator=True appends a "was missing" flag for every column that has
# gaps at fit time. Without it, has_nonstd_aa (almost entirely missing, and
# constant where present) is imputed to a single constant value and its
# signal is lost.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean', add_indicator=True)),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' lets prediction accept unseen categories.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Random forest with a fixed seed; n_jobs=2 requests two parallel workers
classifier = RandomForestClassifier(random_state=42, n_jobs=2)

# Chain preprocessing and the classifier so both are fitted and saved together
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])

# Hold out 20% of the sample (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training split using joblib's thread-based backend
with parallel_backend('threading'):
    pipeline.fit(X_train, y_train)

# Persist the fitted pipeline (preprocessing + model) to disk
joblib.dump(pipeline, 'sst8_predictor_model Juw.joblib')

pdb_id              0
chain_code          0
seq                 0
sst8                0
sst3                0
len                 0
has_nonstd_aa    8994
Exptl.              0
resolution          0
R-factor            0
FreeRvalue          0
dtype: int64
               len  has_nonstd_aa   resolution     R-factor   FreeRvalue
count  9078.000000           84.0  9078.000000  9078.000000  9078.000000
mean    243.168539            1.0     1.614917     0.176362     0.213723
std     156.566596            0.0     0.281776     0.029329     0.081340
min      20.000000            1.0     0.480000     0.070000     0.080000
25%     132.000000            1.0     1.440000     0.160000     0.180000
50%     208.000000            1.0     1.650000     0.180000     0.210000
75%     323.000000            1.0     1.850000     0.200000     0.230000
max    1632.000000            1.0     2.000000     0.250000     1.000000


['sst8_predictor_model Juw.joblib']

In [2]:
pip install gradio

Collecting gradio
  Obtaining dependency information for gradio from https://files.pythonhosted.org/packages/22/4f/a7fb0dbb7d7ae3e8669b2cdb69beadcd0bf769f7c0cdb5f3485bce66ea54/gradio-4.36.0-py3-none-any.whl.metadata
  Using cached gradio-4.36.0-py3-none-any.whl.metadata (15 kB)
Collecting altair<6.0,>=4.2.0 (from gradio)
  Obtaining dependency information for altair<6.0,>=4.2.0 from https://files.pythonhosted.org/packages/46/30/2118537233fa72c1d91a81f5908a7e843a6601ccc68b76838ebc4951505f/altair-5.3.0-py3-none-any.whl.metadata
  Using cached altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting fastapi (from gradio)
  Obtaining dependency information for fastapi from https://files.pythonhosted.org/packages/e6/33/de41e554e5a187d583906e10d53bfae5fd6c07e98cbf4fe5262bd37e739a/fastapi-0.111.0-py3-none-any.whl.metadata
  Using cached fastapi-0.111.0-py3-none-any.whl.metadata (25 kB)
Collecting gradio-client==1.0.1 (from gradio)
  Obtaining dependency information for gradio-client==1.0.1 

In [6]:

import gradio as gr
import joblib
import pandas as pd

# Load the fitted pipeline written by the training cell. The file name must
# match that cell's joblib.dump(...) target exactly (including " Juw").
# NOTE: joblib.load unpickles arbitrary objects -- only load files you created.
model = joblib.load('sst8_predictor_model Juw.joblib')

# Define the prediction function
def predict_sst8(pdb_id, chain_code, seq, sst3, length, has_nonstd_aa, exptl, resolution, r_factor, free_rvalue):
    """Predict the sst8 label for a single record entered in the form.

    The arguments are assembled into a one-row DataFrame whose column names
    match the training data, and passed to the module-level ``model``.

    Args:
        pdb_id, chain_code, seq, sst3, exptl: Text fields, passed through
            unchanged.
        length, has_nonstd_aa, resolution, r_factor, free_rvalue: Numeric
            fields. ``None`` (an empty number box) is accepted and treated
            as a missing value.

    Returns:
        The first (only) element of ``model.predict`` for the one-row frame.
    """
    def _to_float(value):
        # An empty numeric field arrives as None. Left as-is it would create
        # an object-dtype column that numeric preprocessing cannot handle,
        # so map it to NaN and let the pipeline's imputer fill it in.
        return float('nan') if value is None else float(value)

    features = pd.DataFrame({
        'pdb_id': [pdb_id],
        'chain_code': [chain_code],
        'seq': [seq],
        'sst3': [sst3],
        'len': [_to_float(length)],
        'has_nonstd_aa': [_to_float(has_nonstd_aa)],
        'Exptl.': [exptl],
        'resolution': [_to_float(resolution)],
        'R-factor': [_to_float(r_factor)],
        'FreeRvalue': [_to_float(free_rvalue)]
    })
    prediction = model.predict(features)
    return prediction[0]

# Build the web form: one input widget per argument of predict_sst8, listed
# in the same order as its parameters, with a plain-text result.
iface = gr.Interface(
    fn=predict_sst8,
    inputs=[
        widget(label=caption)
        for widget, caption in (
            (gr.Textbox, "PDB ID"),
            (gr.Textbox, "Chain Code"),
            (gr.Textbox, "Sequence"),
            (gr.Textbox, "SST3"),
            (gr.Number, "Length"),
            (gr.Number, "Has Non-Standard AA"),
            (gr.Textbox, "Experimental"),
            (gr.Number, "Resolution"),
            (gr.Number, "R-Factor"),
            (gr.Number, "Free R-Value"),
        )
    ],
    outputs="text",
    title="SST8 Predictor",
    description="Predict the SST8 value based on the input features."
)

# Start serving the form
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


