In [1]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pickle
from sklearn.svm import LinearSVR
import mlflow

Configure MLflow tracking

In [17]:
mlflow.set_experiment("/Shared/standard")

mlflow.set_tracking_uri("databricks")
mlflow.start_run()
mlflow.sklearn.autolog()

Load  data into a  DataFrame

In [18]:
mb_raw_data = pd.read_csv('/Users/mad_hatter/Desktop/Bioinfo/PBL/data/mann_bruker.txt', sep='\t')
mb_data_frame = pd.DataFrame(mb_raw_data)
# Delete all columns except Sequence, m/z, and CCS
mb_clean_frame = mb_data_frame[['Sequence', 'm/z', 'CCS']]
# Log transform CCS values
mb_clean_frame['CCS'] = mb_clean_frame['CCS'].apply(lambda x: np.log(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mb_clean_frame['CCS'] = mb_clean_frame['CCS'].apply(lambda x: np.log(x))


Train model

In [19]:
# Split the data into input (m/z) and output (CCS) variables
X = mb_clean_frame[['m/z']]
y = mb_clean_frame['CCS']
# Define the number of folds
k = 4

Perform k-fold cross validation


In [20]:
# Initialize the cross-validation object
kf = KFold(n_splits=k)

# Initialize a list to store the MSE for each fold
mse_scores = []
median_relative_errors = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create a random forest regressor model
    model = LinearSVR(epsilon=0, random_state=1, max_iter=3000)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate the MSE for the current fold
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

    relative_errors = np.abs((y_pred - y_test) / y_test)
    median_relative_error = np.median(relative_errors)
    median_relative_errors.append(median_relative_error)

# Find the index of the fold with the lowest MSE
best_fold_index = mse_scores.index(min(mse_scores))
# Split the data into training and testing sets for the best fold
X_train_best, X_test_best = X.iloc[kf.split(X).__next__()[0]], X.iloc[kf.split(X).__next__()[1]]
y_train_best, y_test_best = y.iloc[kf.split(y).__next__()[0]], y.iloc[kf.split(y).__next__()[1]]

# Create the final model using the best fold
final_model = LinearSVR(random_state=1)
final_model.fit(X_train_best, y_train_best)

# Make predictions on the test set using the final model
y_pred_best = final_model.predict(X_test_best)

# Calculate the MSE for the best fold
best_mse = mean_squared_error(y_test_best, y_pred_best)
print("Best Mean Squared Error:", best_mse)
# Print all mse scores
print("All Mean Squared Errors:", mse_scores)
# Print the median of the relative errors for each fold
print("Median Relative Errors:", median_relative_errors)



Best Mean Squared Error: 0.2685347600499791
All Mean Squared Errors: [0.8419782732481226, 0.796652052882215, 0.2954254713004413, 0.29587782497202025]
Median Relative Errors: [0.009100448830507679, 0.008439782665500353, 0.0026004371084238477, 0.0027098946185706804]


In [16]:
mlflow.end_run()

In [6]:
# Input the m/z value of the peptide you want to predict the CCS of
mz = 1074.04135
# Predict the CCS of the peptide
ccs = final_model.predict([[mz]])
# Reverse the log transformation
ccs = np.exp(ccs)
print("Predicted CCS:", ccs)

Predicted CCS: [8.95680537e-36]




Save the model

In [8]:
# Save the model to a file
filename = '/Users/mad_hatter/Desktop/Bioinfo/PBL/models/SVR/LinearSVR.pkl'
pickle.dump(model, open(filename, 'wb'))