In [31]:
import pandas as pd
import numpy as np
import os
import time

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler

# Load Dataset

In [2]:
os.getcwd()

'/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/api/jfschulz_notebooks'

In [32]:
path = "/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/raw_data"
file = "Glioma-clinic-TCGA-proteins.csv" 
df = pd.read_csv(f"{path}/{file}", header=0)
#df = pd.read_csv(path)
df.head(2)

Unnamed: 0,Case,years_to_birth,gender,histological_type,race,ethnicity,radiation_therapy,Grade,Mutation.Count,Percent.aneuploidy,...,p27_p,p27_pT157_p,p27_pT198_p,p38_pT180_Y182_p,p53_p,p62-LCK-ligand_p,p70S6K_p,p70S6K_pT389_p,p90RSK_p,p90RSK_pT359_S363_p
0,TCGA-CS-4938,31,female,astrocytoma,white,not hispanic or latino,no,G2,15,0.069412,...,-0.425127,-0.033398,0.289192,1.060163,-0.407456,-0.470354,-0.107559,-0.060441,-0.053104,-0.011132
1,TCGA-CS-6665,51,female,astrocytoma,white,not hispanic or latino,yes,G3,75,0.524814,...,0.076536,0.011809,-0.047973,-0.054275,-0.24402,0.106282,-0.034369,0.585072,0.43172,-0.201412


In [4]:
#df.info(verbose=True, show_counts=True)
df["histological_type"].unique()

array(['astrocytoma', 'oligoastrocytoma', 'oligodendroglioma'],
      dtype=object)

# Define Features and Target

In [5]:
# Define target and features
cols_to_drop = ['Case', 'years_to_birth', 'gender', 'histological_type', 'race',
       'ethnicity', 'radiation_therapy', 'Grade', 'Mutation.Count',
       'Percent.aneuploidy', 'IDH.status', 'outcome']
X = df.drop(cols_to_drop, axis = 1)
y = df["outcome"]

X.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 174 columns):
 #    Column                     Non-Null Count  Dtype  
---   ------                     --------------  -----  
 0    14-3-3_beta_p              306 non-null    float64
 1    14-3-3_epsilon_p           306 non-null    float64
 2    14-3-3_zeta_p              306 non-null    float64
 3    4E-BP1_p                   306 non-null    float64
 4    4E-BP1_pS65_p              306 non-null    float64
 5    4E-BP1_pT37_T46_p          306 non-null    float64
 6    4E-BP1_pT70_p              306 non-null    float64
 7    53BP1_p                    306 non-null    float64
 8    A-Raf_pS299_p              306 non-null    float64
 9    ACC1_p                     306 non-null    float64
 10   ACC_pS79_p                 306 non-null    float64
 11   ACVRL1_p                   306 non-null    float64
 12   AMPK_alpha_p               306 non-null    float64
 13   AMPK_pT172_p               306 no

In [6]:
# Create train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# save X_test data set to use for API testing:
#path = "/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/raw_data"
#file = "Glioma-clinic-TCGA-proteins-test-with-identifier.csv" 

#X_test.to_csv(path_or_buf=f"{path}/{file}", header=True, index=True, index_label="Identifier")

In [37]:
# save one row of df data set to use for API testing: row 85 --> outcome = 1
path = "/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/raw_data"
file = "Glioma-clinic-TCGA-proteins-test-with-identifier-outcome1.csv" 
cols_to_drop = ['Case', 'years_to_birth', 'gender', 'histological_type', 'race',
       'ethnicity', 'radiation_therapy', 'Grade', 'Mutation.Count',
       'Percent.aneuploidy', 'IDH.status', 'outcome']

tmp1 = df[85:86].drop(cols_to_drop, axis = 1)
tmp1.to_csv(path_or_buf=f"{path}/{file}", header=True, index=True, index_label="Identifier")

In [41]:
# save one row of df data set to use for API testing: row 274 --> outcome = 0
path = "/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/raw_data"
file = "Glioma-clinic-TCGA-proteins-test-with-identifier-outcome0.csv" 
cols_to_drop = ['Case', 'years_to_birth', 'gender', 'histological_type', 'race',
       'ethnicity', 'radiation_therapy', 'Grade', 'Mutation.Count',
       'Percent.aneuploidy', 'IDH.status', 'outcome']

tmp1 = df[274:275].drop(cols_to_drop, axis = 1)
tmp1.to_csv(path_or_buf=f"{path}/{file}", header=True, index=True, index_label="Identifier")

In [40]:
df[274:275]["outcome"]

274    0
Name: outcome, dtype: int64

## Code to create feature table based on uploaded csv file for api

In [7]:
# Code to create 
tmp = pd.read_csv(f"{path}/{file}", header=0)
tmp.head(3)

if 'Identifier' in tmp.columns:
    X_pred = tmp.drop(["Identifier"], axis = 1)
else:
    X_pred = tmp

# Preprocess

In [8]:
# Scale
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train)
X_train_proc = min_max_scaler.transform(X_train)
X_test_proc = min_max_scaler.transform(X_test)

## Functions to save and load scaler for api

In [9]:
# Save scaler

from joblib import dump, load
import time


def save_scaler(scaler_to_save = None, 
               scaler_type = None, 
               path_to_save = "/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/api/saved_scalers"
              ):
    """
    Persist trained model locally on the hard drive at f"{path_to_save/scaler_type/f"{timestamp}.joblib"
    """

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    
    # Save scaler locally
    scaler_path_file = os.path.join(f"{path_to_save}/{scaler_type}_{timestamp}.joblib")
   
    dump(scaler_to_save, scaler_path_file) 

    print(f"✅ Scaler saved locally at {scaler_path_file}")
    
# Load scaler
def load_scaler(path = '/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/api/saved_scalers',
               file = 'MinMax_20240306-102844.joblib'
              ):
    path_file = f"{path}/{file}"

    scaler = load(path_file) 
    return scaler

In [14]:
save_scaler(scaler_to_save= min_max_scaler,
           scaler_type="MinMax"
           )

✅ Scaler saved locally at /home/jana/code/jfschulz/project-brain-proteomics/brain_proteomics/api/saved_scalers/MinMax_20240306-102844.joblib


In [15]:
load_scaler(file ="MinMax_20240306-102844.joblib")

# Build Model

In [10]:
# Initialize model
model = SGDClassifier(loss = "log_loss")

# Cross validte model performance
np.mean(cross_val_score(model, X_train_proc, y_train, cv=5))

# Train model
history = model.fit(X_train_proc, y_train)

## Functions to save and load model

In [11]:
# Save model

from joblib import dump, load
import time


def save_model(model_to_save = None, 
               model_type = None, 
               path_to_save = "/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/api/saved_models"
              ):
    """
    Persist trained model locally on the hard drive at f"{path_to_save/model_type/f"{timestamp}.joblib"
    """

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    
    # Save model locally
    model_path_file = os.path.join(f"{path_to_save}/{model_type}_{timestamp}.joblib")
   
    dump(model_to_save, model_path_file) 

    print(f"✅ Model saved locally at {model_path_file}")

In [24]:
save_model(model_to_save = model, model_type = "SGDclassifier")

✅ Model saved locally at /home/jana/code/jfschulz/project-brain-proteomics/brain_proteomics/api/saved_models/SGDclassifier_20240305-154413.joblib


In [12]:
# Load model
def load_model(path = '/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/api/saved_models',
               file = 'SGDclassifier_20240305-135742.joblib'
              ):
    path_file = f"{path}/{file}"

    model = load(path_file) 
    return model

In [113]:
model3 = load_model()
scaler = load_scaler()


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [114]:
X_test.shape
X_test_proc = scaler.transform(X_test)

In [115]:
X_test_proc.shape

(62, 174)

In [117]:
model3.predict(X_test_proc).shape

(62,)

In [119]:
pd.DataFrame(model.predict(X_test_proc), columns=["Outcome"], dtype = int)

Unnamed: 0,Outcome
0,1
1,1
2,1
3,1
4,1
...,...
57,1
58,1
59,1
60,1


In [120]:
pd.DataFrame(model.predict_proba(X_test_proc), columns=["Probability_0", "Probability_1"], dtype = float)


Unnamed: 0,Probability_0,Probability_1
0,1.558753e-13,1.000000
1,3.374888e-09,1.000000
2,0.000000e+00,1.000000
3,0.000000e+00,1.000000
4,0.000000e+00,1.000000
...,...,...
57,1.428438e-06,0.999999
58,0.000000e+00,1.000000
59,6.966472e-06,0.999993
60,4.025669e-13,1.000000


In [125]:
outcome = pd.DataFrame(model.predict(X_test_proc), columns=["Outcome"], dtype = int)
prob = pd.DataFrame(model.predict_proba(X_test_proc), columns=["Probability_0", "Probability_1"], dtype = float)

    # Merge results into dataframe
result = pd.merge(prob,outcome, left_index=True, right_index=True)


In [141]:
{
    "Outcome": result["Outcome"].tolist,
    "Probability_0": result["Probability_0"].tolist()
    "Probability_1": result["Probability_1"].tolist()
}

{'Probability_0': [1.5587531265737198e-13,
  3.374887702634055e-09,
  0.0,
  0.0,
  0.0,
  8.881784197001252e-16,
  0.0,
  0.0,
  0.0,
  0.0,
  6.689112819202592e-09,
  0.0,
  4.341000806040718e-06,
  5.825695481576076e-11,
  0.0002600417144659195,
  0.0,
  4.5078696331302126e-10,
  0.020643838388880953,
  0.9999999941233826,
  0.0,
  0.0,
  1.8030021919912542e-13,
  0.0,
  1.5232259897857148e-13,
  1.616484723854228e-13,
  9.048919255971732e-05,
  1.0658141036401503e-14,
  0.9947669696187157,
  0.0,
  1.13464793116691e-13,
  0.0,
  1.2894849675815223e-06,
  0.9988133138366917,
  1.7696044629644803e-10,
  0.0,
  0.0071481991554653845,
  0.9976831049104847,
  0.0,
  0.0,
  4.218847493575595e-15,
  8.822206694958368e-06,
  1.84297022087776e-14,
  0.0,
  2.1871393585115584e-12,
  1.2745360322696797e-11,
  0.0,
  1.6608072026746967e-05,
  0.9999696244012928,
  4.400225961376236e-09,
  8.442357923854615e-12,
  0.0,
  7.216627295747458e-11,
  3.740341369962152e-12,
  0.0,
  0.999999992883269

In [146]:
{k: v.tolist() for k, v in result.iterrows()}


{0: [1.5587531265737198e-13, 0.9999999999998441, 1.0],
 1: [3.374887702634055e-09, 0.9999999966251123, 1.0],
 2: [0.0, 1.0, 1.0],
 3: [0.0, 1.0, 1.0],
 4: [0.0, 1.0, 1.0],
 5: [8.881784197001252e-16, 0.9999999999999991, 1.0],
 6: [0.0, 1.0, 1.0],
 7: [0.0, 1.0, 1.0],
 8: [0.0, 1.0, 1.0],
 9: [0.0, 1.0, 1.0],
 10: [6.689112819202592e-09, 0.9999999933108872, 1.0],
 11: [0.0, 1.0, 1.0],
 12: [4.341000806040718e-06, 0.999995658999194, 1.0],
 13: [5.825695481576076e-11, 0.999999999941743, 1.0],
 14: [0.0002600417144659195, 0.9997399582855341, 1.0],
 15: [0.0, 1.0, 1.0],
 16: [4.5078696331302126e-10, 0.999999999549213, 1.0],
 17: [0.020643838388880953, 0.979356161611119, 1.0],
 18: [0.9999999941233826, 5.87661740132661e-09, 0.0],
 19: [0.0, 1.0, 1.0],
 20: [0.0, 1.0, 1.0],
 21: [1.8030021919912542e-13, 0.9999999999998197, 1.0],
 22: [0.0, 1.0, 1.0],
 23: [1.5232259897857148e-13, 0.9999999999998477, 1.0],
 24: [1.616484723854228e-13, 0.9999999999998384, 1.0],
 25: [9.048919255971732e-05, 0.99

In [147]:
result

Unnamed: 0,Probability_0,Probability_1,Outcome
0,1.558753e-13,1.000000,1
1,3.374888e-09,1.000000,1
2,0.000000e+00,1.000000,1
3,0.000000e+00,1.000000,1
4,0.000000e+00,1.000000,1
...,...,...,...
57,1.428438e-06,0.999999,1
58,0.000000e+00,1.000000,1
59,6.966472e-06,0.999993,1
60,4.025669e-13,1.000000,1


In [128]:
result_dict = result.to_dict('series')
type(result_dict["Probability_0"])

pandas.core.series.Series

In [133]:
type(list(model.predict(X_test_proc)))

list

In [13]:
# Predict test data
outcome = pd.DataFrame(model.predict(X_test_proc), columns=["Outcome"])
prob = pd.DataFrame(model.predict_proba(X_test_proc), columns=["Probability_0", "Probability_1"])

# Merge results into dataframe
result = pd.merge(prob,outcome, left_index=True, right_index=True)

# Create prediction endpoint for API

## Prediction endpoint for one line of prediction

In [152]:
def predict_one(
            path: str, #"/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/raw_data"
            file: str, # "Glioma-clinic-TCGA-proteins-test-with-identifier.csv"
            ):
    """
    Make a prediction for every row in your dataset.
    Input needs to be a csv file with rows = samples and columns = proteins.
    The first row has to contain the protein names or any other identifier that will serve as a header.
    If your file has a sample identifier column, name this column "Identifier"
    """

    df = pd.read_csv(f"{path}/{file}", header=0)

    if 'Identifier' in df.columns:
        X_pred = df.drop(["Identifier"], axis = 1)
    else:
        X_pred = df


    # Preprocess
    # Load scaler
    scaler = load_scaler(path = '/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/api/saved_scalers',
                         file = 'MinMax_20240306-102844.joblib'
                        )

    X_pred_proc = scaler.transform(X_pred)

    # Predict data
    model = load_model()

    outcome_num = int(model.predict(X_pred_proc)[0])
    if outcome_num == 0:
        outcome = "good cancer"
        probability = round(float(model.predict_proba(X_pred_proc)[0][0]), 10)
    else:
        outcome = "bad cancer"
        probability = round(float(model.predict_proba(X_pred_proc)[0][1]), 10)

    return {
                "Outcome": outcome,
                "Probability": probability
            }

In [154]:
predict_one(path = "/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/raw_data",
       file = "Glioma-clinic-TCGA-proteins-test-with-identifier-outcome1.csv")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


{'Outcome': 'bad cancer', 'Probability': 1.0}

In [50]:
model = load_model()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [158]:
tmp1 = df[274:275].drop(cols_to_drop, axis = 1)
outcome_num = int(model.predict(tmp1)[0])
if outcome_num == 0:
    outcome = "good cancer"
    probability = round(float(model.predict_proba(tmp1)[0][0]), 4)
else:
    outcome = "bad cancer"
    probability = round(float(model.predict_proba(tmp1)[0][1]), 4)



In [159]:
{
    "Outcome": outcome,
    "Probability": probability
}

{'Outcome': 'good cancer', 'Probability': 1.0}

In [79]:
prob = float(model.predict_proba(tmp1)[0][1])
prob



1.180455213004863e-08

##  Prediction endpoint that should return a dataframe in case the user wants to do several predictions

In [25]:

def predict(
            path: str, #"/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/raw_data"
            file: str, # "Glioma-clinic-TCGA-proteins-test-with-identifier.csv"
            ):
    """
    Make a prediction for every row in your dataset.
    Input needs to be a csv file with rows = samples and columns = proteins.
    The first row has to contain the protein names or any other identifier that will serve as a header.
    If your file has a sample identifier column, name this column "Identifier"
    """

    df = pd.read_csv(f"{path}/{file}", header=0)

    if 'Identifier' in df.columns:
        X_pred = df.drop(["Identifier"], axis = 1)
    else:
        X_pred = df


    # Preprocess
    # Load scaler
    scaler = load_scaler(path = '/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/api/saved_scalers',
                         file = 'MinMax_20240306-102844.joblib'
                        )

    X_pred_proc = scaler.transform(X_pred)

    # Predict data
    model = load_model()

    outcome = pd.DataFrame(model.predict(X_pred_proc), columns=["Outcome"], dtype= int)
    prob = pd.DataFrame(model.predict_proba(X_pred_proc), columns=["Probability_0", "Probability_1"], dtype=float)

    # Merge results into dataframe
    result = pd.merge(prob,outcome, left_index=True, right_index=True)
    result_dict = result.to_dict('series')

    return result_dict


In [26]:
result = predict(path = "/home/jana/code/Klara-haas/brain_proteomics_project/brain_proteomics/raw_data",
       file = "Glioma-clinic-TCGA-proteins-test-with-identifier.csv")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [27]:
type(result)

dict

In [28]:
result

{'Probability_0': 0     1.660807e-05
 1     0.000000e+00
 2     0.000000e+00
 3     0.000000e+00
 4     0.000000e+00
           ...     
 57    0.000000e+00
 58    0.000000e+00
 59    1.000000e+00
 60    0.000000e+00
 61    1.680323e-11
 Name: Probability_0, Length: 62, dtype: float64,
 'Probability_1': 0     9.999834e-01
 1     1.000000e+00
 2     1.000000e+00
 3     1.000000e+00
 4     1.000000e+00
           ...     
 57    1.000000e+00
 58    1.000000e+00
 59    7.116731e-09
 60    1.000000e+00
 61    1.000000e+00
 Name: Probability_1, Length: 62, dtype: float64,
 'Outcome': 0     1
 1     1
 2     1
 3     1
 4     1
      ..
 57    1
 58    1
 59    0
 60    1
 61    1
 Name: Outcome, Length: 62, dtype: int64}