In [31]:
import pandas as pd
import numpy as np
import os
import time

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler

# Load Dataset

In [4]:
# Show the kernel's current working directory (handy for checking where relative paths would resolve).
os.getcwd()

'/home/jana/code/jfschulz/project-brain-proteomics/brain_proteomics/api/jfschulz_notebooks'

In [2]:
# Read the raw glioma table (clinical columns plus protein-expression columns) into `df`.
path = "/home/jana/code/jfschulz/project-brain-proteomics/raw_data"
file = "Glioma-clinic-TCGA-proteins.csv"
df = pd.read_csv("/".join((path, file)), header=0)

# Preview the first rows
df.head()

Unnamed: 0,Case,years_to_birth,gender,histological_type,race,ethnicity,radiation_therapy,Grade,Mutation.Count,Percent.aneuploidy,...,p27_p,p27_pT157_p,p27_pT198_p,p38_pT180_Y182_p,p53_p,p62-LCK-ligand_p,p70S6K_p,p70S6K_pT389_p,p90RSK_p,p90RSK_pT359_S363_p
0,TCGA-CS-4938,31,female,astrocytoma,white,not hispanic or latino,no,G2,15,0.069412,...,-0.425127,-0.033398,0.289192,1.060163,-0.407456,-0.470354,-0.107559,-0.060441,-0.053104,-0.011132
1,TCGA-CS-6665,51,female,astrocytoma,white,not hispanic or latino,yes,G3,75,0.524814,...,0.076536,0.011809,-0.047973,-0.054275,-0.24402,0.106282,-0.034369,0.585072,0.43172,-0.201412
2,TCGA-CS-6666,22,male,astrocytoma,white,not hispanic or latino,yes,G3,18,0.403165,...,0.110268,0.066886,-0.06615,0.250434,0.432187,-0.210412,0.290949,-0.893383,-0.693677,-0.05525
3,TCGA-DB-5270,38,female,oligoastrocytoma,white,not hispanic or latino,no,G3,16,0.061382,...,-0.235321,0.015372,-0.127422,-1.190789,0.105396,0.218569,-0.099136,1.990618,0.166788,-0.23538
4,TCGA-DB-5273,33,male,astrocytoma,white,not hispanic or latino,yes,G3,16,0.017349,...,-0.343212,-0.250564,-0.234582,0.648598,-0.071851,-0.041811,-0.0993,-0.920359,-0.243159,-0.128841


In [43]:
# Distinct tumour histologies present in the data set
pd.unique(df["histological_type"])

array(['astrocytoma', 'oligoastrocytoma', 'oligodendroglioma'],
      dtype=object)

# Preprocess

# Build Model

In [3]:
# Target: the "outcome" column. Features: everything that is left after removing
# the clinical/metadata columns and the target itself.
cols_to_drop = [
    'Case', 'years_to_birth', 'gender', 'histological_type', 'race',
    'ethnicity', 'radiation_therapy', 'Grade', 'Mutation.Count',
    'Percent.aneuploidy', 'IDH.status', 'outcome',
]
y = df["outcome"]
X = df.drop(columns=cols_to_drop)

In [4]:
# List every feature column with its non-null count and dtype (verbose=True prevents pandas from truncating the column list).
X.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 174 columns):
 #    Column                     Non-Null Count  Dtype  
---   ------                     --------------  -----  
 0    14-3-3_beta_p              306 non-null    float64
 1    14-3-3_epsilon_p           306 non-null    float64
 2    14-3-3_zeta_p              306 non-null    float64
 3    4E-BP1_p                   306 non-null    float64
 4    4E-BP1_pS65_p              306 non-null    float64
 5    4E-BP1_pT37_T46_p          306 non-null    float64
 6    4E-BP1_pT70_p              306 non-null    float64
 7    53BP1_p                    306 non-null    float64
 8    A-Raf_pS299_p              306 non-null    float64
 9    ACC1_p                     306 non-null    float64
 10   ACC_pS79_p                 306 non-null    float64
 11   ACVRL1_p                   306 non-null    float64
 12   AMPK_alpha_p               306 non-null    float64
 13   AMPK_pT172_p               306 no

In [69]:
# Create train and test data.
# A fixed random_state keeps the split -- and therefore the exported test file
# written below -- identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the X_test data set to use for API testing.
# The original row index of each sample is written as the "Identifier" column.
path = "/home/jana/code/jfschulz/project-brain-proteomics/raw_data"
file = "Glioma-clinic-TCGA-proteins-test-with-identifier.csv"

X_test.to_csv(path_or_buf=f"{path}/{file}", header=True, index=True, index_label="Identifier")

In [79]:
# Read the exported file back in and preview it
tmp = pd.read_csv("/".join((path, file)), header=0)
tmp.head(3)

Unnamed: 0,Identifier,14-3-3_beta_p,14-3-3_epsilon_p,14-3-3_zeta_p,4E-BP1_p,4E-BP1_pS65_p,4E-BP1_pT37_T46_p,4E-BP1_pT70_p,53BP1_p,A-Raf_pS299_p,...,p27_p,p27_pT157_p,p27_pT198_p,p38_pT180_Y182_p,p53_p,p62-LCK-ligand_p,p70S6K_p,p70S6K_pT389_p,p90RSK_p,p90RSK_pT359_S363_p
0,32,-0.275971,0.081714,0.069275,0.15531,-0.160804,0.904401,-0.118147,0.077331,-0.142488,...,-0.042561,-0.068607,0.067322,0.375667,0.217139,-0.146036,0.09653,-0.510186,0.177621,0.131869
1,232,0.132971,0.052626,-0.398325,0.212736,-0.015099,0.280765,0.13543,0.050657,0.098807,...,0.196228,0.001162,-0.089271,-0.023013,-0.114137,-0.176301,-0.047656,0.402459,-0.040872,0.117412
2,62,-0.136615,-0.034012,0.367918,-0.258728,0.39832,0.46107,-0.126436,0.013747,0.011313,...,-0.390495,-0.138529,-0.224501,-0.118712,-0.082139,-0.128102,-0.078334,1.549336,-0.076482,-0.106585


In [87]:
# Strip the sample-identifier column when present; otherwise use the frame as it is.
X_pred = tmp.drop(["Identifier"], axis = 1) if 'Identifier' in tmp.columns else tmp

In [88]:
# Preview the prediction features after the Identifier column has been removed.
X_pred.head(3)

Unnamed: 0,14-3-3_beta_p,14-3-3_epsilon_p,14-3-3_zeta_p,4E-BP1_p,4E-BP1_pS65_p,4E-BP1_pT37_T46_p,4E-BP1_pT70_p,53BP1_p,A-Raf_pS299_p,ACC1_p,...,p27_p,p27_pT157_p,p27_pT198_p,p38_pT180_Y182_p,p53_p,p62-LCK-ligand_p,p70S6K_p,p70S6K_pT389_p,p90RSK_p,p90RSK_pT359_S363_p
0,-0.275971,0.081714,0.069275,0.15531,-0.160804,0.904401,-0.118147,0.077331,-0.142488,0.074746,...,-0.042561,-0.068607,0.067322,0.375667,0.217139,-0.146036,0.09653,-0.510186,0.177621,0.131869
1,0.132971,0.052626,-0.398325,0.212736,-0.015099,0.280765,0.13543,0.050657,0.098807,0.142678,...,0.196228,0.001162,-0.089271,-0.023013,-0.114137,-0.176301,-0.047656,0.402459,-0.040872,0.117412
2,-0.136615,-0.034012,0.367918,-0.258728,0.39832,0.46107,-0.126436,0.013747,0.011313,0.189464,...,-0.390495,-0.138529,-0.224501,-0.118712,-0.082139,-0.128102,-0.078334,1.549336,-0.076482,-0.106585


In [94]:
# Scale: learn the min/max of each feature from the training split only,
# then apply that same transformation to both splits.
min_max_scaler = MinMaxScaler().fit(X_train)
X_train_proc, X_test_proc = (
    min_max_scaler.transform(split) for split in (X_train, X_test)
)


In [None]:
# Initialize model (fixed seed so the stochastic optimiser gives the same result on every run)
model = SGDClassifier(loss = "log_loss", random_state = 42)

# Cross-validate model performance on the training split.
# The scores are kept in a variable and printed: a bare expression in the middle
# of a cell is evaluated but never displayed.
# NOTE(review): the scaler was fitted on all of X_train before the folds were made,
# so each validation fold has influenced the scaling -- consider a Pipeline if this matters.
cv_scores = cross_val_score(model, X_train_proc, y_train, cv=5)
print(f"Mean CV score over {len(cv_scores)} folds: {np.mean(cv_scores):.3f} (std {np.std(cv_scores):.3f})")

# Train the final model on the full training split.
# fit() returns the estimator itself, so `history` refers to the same object as `model`.
history = model.fit(X_train_proc, y_train)

## Functions to save and load model

In [55]:
# Save model

from joblib import dump, load
import time


def save_model(model_to_save = None, 
               model_type = None, 
               path_to_save = "/home/jana/code/jfschulz/project-brain-proteomics/brain_proteomics/api/saved_models"
              ):
    """
    Persist a trained model on the local disk as
    "<path_to_save>/<model_type>_<timestamp>.joblib".

    Parameters
    ----------
    model_to_save : object
        The fitted model to serialise. Required; passing None raises ValueError.
    model_type : str, optional
        Prefix for the file name. Defaults to the class name of `model_to_save`.
    path_to_save : str
        Target directory. It is created if it does not exist yet.

    Raises
    ------
    ValueError
        If `model_to_save` is None.
    """
    if model_to_save is None:
        raise ValueError("save_model() needs the model to persist passed as `model_to_save`.")

    if model_type is None:
        model_type = type(model_to_save).__name__

    timestamp = time.strftime("%Y%m%d-%H%M%S")

    # Make sure the target directory exists before writing into it
    os.makedirs(path_to_save, exist_ok=True)
    model_path_file = os.path.join(path_to_save, f"{model_type}_{timestamp}.joblib")

    # Dump the model that was passed in, not whatever `model` is in the notebook namespace
    dump(model_to_save, model_path_file)

    print(f"✅ Model saved locally at {model_path_file}")

In [56]:
# Persist the trained classifier; "SGDclassifier" becomes the prefix of the saved file name.
save_model(model_to_save = model, model_type = "SGDclassifier")

✅ Model saved locally at /home/jana/code/jfschulz/project-brain-proteomics/brain_proteomics/api/saved_models/SGDclassifier_20240305-135742.joblib


In [59]:
# Load model
def load_model(path = '/home/jana/code/jfschulz/project-brain-proteomics/brain_proteomics/api/saved_models',
               file = 'SGDclassifier_20240305-135742.joblib'
              ):
    """
    Read a persisted model from "<path>/<file>" and return it.

    Both arguments default to one specific, previously saved SGD classifier file.
    """
    # NOTE(review): joblib files are pickle-based -- only load files from a trusted source.
    return load("/".join((path, file)))

In [60]:
# Reload the persisted model from load_model's default path/file and display it.
model3 = load_model()
model3

'/home/jana/code/jfschulz/project-brain-proteomics/brain_proteomics/api/saved_models/SGDclassifier_20240305-121210.joblib'

In [22]:
# Class probabilities and hard labels for the scaled test split
prob = pd.DataFrame(model.predict_proba(X_test_proc), columns=["Probability_0", "Probability_1"])
outcome = pd.DataFrame({"Outcome": model.predict(X_test_proc)})

# Align both frames on their row index to get a single results table
result = prob.merge(outcome, left_index=True, right_index=True)

'/home/jana/code/jfschulz/project-brain-proteomics/brain_proteomics/api/saved_models/SGDclassifier/_20240305-121210.h5'

# Create prediction endpoint for API

In [95]:
def predict(
            path: str, #"/home/jana/code/jfschulz/project-brain-proteomics/raw_data"
            file: str, # "Glioma-clinic-TCGA-proteins-test-with-identifier.csv"
            model = None,
            scaler = None,
            ):
    """
    Make a prediction for every row in your dataset.
    Input needs to be a csv file with rows = samples and columns = proteins.
    The first row has to contain the protein names or any other identifier that will serve as a header.
    If your file has a sample identifier column, name this column "Identifier"

    Parameters
    ----------
    path : str
        Directory that contains the csv file.
    file : str
        Name of the csv file inside `path`.
    model : optional
        Fitted classifier exposing `predict` and `predict_proba`.
        Defaults to the model returned by `load_model()`.
    scaler : optional
        Fitted transformer exposing `transform`.
        Defaults to the notebook-level `min_max_scaler`, which must have been
        fitted in the current kernel session.

    Returns
    -------
    pandas.DataFrame
        One row per input sample, in input order, with the columns
        "Probability_0", "Probability_1" and "Outcome". The "Identifier"
        column of the input is not carried over.
    """
    result_columns = ["Probability_0", "Probability_1", "Outcome"]

    samples = pd.read_csv(os.path.join(path, file), header=0)

    # The identifier is not a feature; drop it if the file has one
    X_pred = samples.drop(columns=["Identifier"], errors="ignore")

    # A file with a header but no samples has nothing to predict
    if len(X_pred) == 0:
        return pd.DataFrame(columns=result_columns)

    if model is None:
        model = load_model()
    if scaler is None:
        # Falls back to the scaler fitted earlier in the notebook session;
        # pass `scaler=` explicitly when calling from outside the notebook.
        scaler = min_max_scaler

    X_pred_proc = scaler.transform(X_pred)

    # Predict data
    outcome = pd.DataFrame(model.predict(X_pred_proc), columns=result_columns[2:])
    prob = pd.DataFrame(model.predict_proba(X_pred_proc), columns=result_columns[:2])

    # Merge results into dataframe
    return pd.merge(prob, outcome, left_index=True, right_index=True)

In [96]:
# Run the prediction helper on the exported test-feature file
result = predict(
    file = "Glioma-clinic-TCGA-proteins-test-with-identifier.csv",
    path = "/home/jana/code/jfschulz/project-brain-proteomics/raw_data",
)

In [98]:
# Show the predictions.
# NOTE(review): 62 presumably equals the number of rows in the 20% test split of the 306 samples -- confirm.
result.head(62)

Unnamed: 0,Probability_0,Probability_1,Outcome
0,6.844969e-10,1.000000,1
1,7.141553e-02,0.928584,1
2,0.000000e+00,1.000000,1
3,0.000000e+00,1.000000,1
4,1.132427e-14,1.000000,1
...,...,...,...
57,2.220446e-16,1.000000,1
58,0.000000e+00,1.000000,1
59,6.661338e-16,1.000000,1
60,1.289211e-10,1.000000,1
