**Overview**

In this notebook, we apply an Elastic Net model to the proteomics data to predict the treatment group (Placebo vs. GRF6021) at each time point: V3, V4a, V4b, and V5.

**Elastic Net Prediction at Each Timepoint (V3, V4a, V4b, V5)**

In [None]:
#data processing
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
import canopy
#EN model
from statannot import add_stat_annotation
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
#k-means and pathway analysis
from sklearn.manifold import TSNE
import networkx as nx
import plotly.graph_objects as go
from sklearn.cluster import KMeans
import colorcet as cc
import gseapy as gp

In [None]:
#process the data for EN model

def process_time_point(df, time_point):
    """Build the standardized feature matrix and numeric target for one visit.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'Time point' column and a 'Treatment' column.
        Columns are otherwise addressed by position: the first column is
        discarded, and of the remaining columns position 1 is returned as the
        target and positions 2 onward are used as features.
        NOTE(review): this presumes 'Treatment' sits at position 1 once the
        first column has been dropped -- confirm against the spreadsheet.
    time_point : str
        Value of 'Time point' to select (e.g. 'V3').

    Returns
    -------
    X_df_stan : pandas.DataFrame
        Features after a log2 transform followed by StandardScaler, indexed
        0..n-1 with the original feature column names.
    y_pro : pandas.Series
        The column at position 1, taken after 'Treatment' has been recoded as
        GRF6021 -> 1 and Placebo -> 2. Any other treatment label becomes NaN.
    """
    # Work on an independent, re-indexed frame. The previous version reset the
    # index and assigned a column in place on a filtered slice, which is the
    # pattern pandas flags with SettingWithCopyWarning.
    visit_rows = (
        df.loc[df['Time point'] == time_point]
        .iloc[:, 1:]
        .reset_index(drop=True)
    )

    # Features: everything from position 2 onward, on a log2 scale.
    log_features = np.log2(visit_rows.iloc[:, 2:])

    # Standardize each feature column, keeping the index and column labels.
    X_df_stan = pd.DataFrame(
        StandardScaler().fit_transform(log_features),
        index=log_features.index,
        columns=log_features.columns,
    )

    # Recode the treatment labels as numbers (unmapped labels become NaN).
    visit_rows['Treatment'] = visit_rows['Treatment'].map({'GRF6021': 1, 'Placebo': 2})

    # Target is taken by position, after the recoding above.
    y_pro = visit_rows.iloc[:, 1]

    return X_df_stan, y_pro

# Load the full proteomics table once; every time point is sliced from it.
all_pro = pd.read_excel("Proteomics_all.xlsx")

# Time points handled one at a time below
time_points = ['V3', 'V4a', 'V4b', 'V5']

# Outputs of process_time_point, keyed by time point label:
# standardized feature frames and their matching target series.
X_pro_stan, y_pro = {}, {}
for tp in time_points:
    X_pro_stan[tp], y_pro[tp] = process_time_point(all_pro, tp)


In [None]:
import re
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression

def make_predictions(X, y, n_iter=100, train_frac=0.5, l1_ratio=0.9, random_state=None):
    """Repeatedly fit an elastic-net logistic regression on random splits.

    In each of `n_iter` experiments a random `train_frac` share of the rows of
    `X` is used to fit the model and the remaining rows are predicted.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Its index must consist of non-negative integers,
        because index labels are used directly as output row positions.
        NOTE: the column names of the passed-in frame are rewritten in place
        ('[', ']' and '<' are replaced by '_'); this side effect is kept from
        the original implementation.
    y : pandas.Series
        Labels, indexed like `X`.
    n_iter : int, default 100
        Number of random-split experiments.
    train_frac : float, default 0.5
        Fraction of rows sampled for training in each experiment.
    l1_ratio : float, default 0.9
        Elastic-net mixing parameter passed to LogisticRegression.
    random_state : None, int or numpy.random.RandomState, default None
        Seed for both the row sampling and the 'saga' solver. None keeps the
        previous behaviour of drawing from NumPy's global random state, so
        results differ between runs.

    Returns
    -------
    df_ans : pandas.DataFrame
        Object-dtype frame with max(X.index) + 1 rows and `n_iter` columns.
        An entry holds the predicted label when that row was in the test part
        of that experiment and None when it was used for training.
    feature_importance : dict[int, list]
        Maps each feature position to the list of its coefficients
        (`coef_[0]`), one per experiment.
    """
    # Replace '[', ']' and '<' in column names with '_' (mutates caller's X).
    bad_chars = re.compile(r"\[|\]|<", re.IGNORECASE)
    X.columns = [
        bad_chars.sub("_", col) if any(ch in str(col) for ch in ('[', ']', '<')) else col
        for col in X.columns.values
    ]

    # One output row per integer index label 0..max(X.index).
    rows = max(X.index) + 1

    # A single generator shared by all experiments: seeded runs are repeatable
    # while each experiment still draws a different split.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # None marks "row was in the training part of this experiment".
    predictions = np.full((rows, n_iter), None, dtype=object)
    feature_importance = {}

    for run in range(n_iter):
        df_train = X.sample(frac=train_frac, random_state=rng)
        df_test = X.drop(df_train.index)

        reg = LogisticRegression(
            penalty='elasticnet', solver='saga', l1_ratio=l1_ratio, random_state=rng
        )
        reg.fit(df_train, y[df_train.index])

        # Index labels double as row positions in the output matrix.
        predictions[df_test.index.to_numpy(), run] = reg.predict(df_test)

        # Record this experiment's coefficient for every feature position.
        for position, coefficient in enumerate(reg.coef_[0]):
            feature_importance.setdefault(position, []).append(coefficient)

    df_ans = pd.DataFrame(predictions)

    return df_ans, feature_importance

In [None]:
# Containers keyed by time point:
#   ans       - prediction matrix from make_predictions (plus a 'mean' column)
#   fi        - per-feature coefficient lists from make_predictions
#   df_withid - rows of the full table belonging to that time point
#   df_result - two-column table: 'Treatment' and the mean prediction
time_points = ['V3', 'V4a', 'V4b', 'V5']
ans, fi = {}, {}
df_withid, df_result = {}, {}

for tp in time_points:
    # Repeated random-split predictions for this time point
    ans[tp], fi[tp] = make_predictions(X_pro_stan[tp], y_pro[tp])
    # Row-wise average over the experiment columns
    ans[tp]['mean'] = ans[tp].mean(axis=1)

    # Re-index the time point's rows from 0 so they line up with ans[tp]
    df_withid[tp] = all_pro[all_pro['Time point'] == tp].reset_index(drop=True)
    df_result[tp] = pd.concat([df_withid[tp]['Treatment'], ans[tp]['mean']], axis=1)


In [None]:
# Visualize the prediction results: one boxplot panel per time point.
#
# Fixes relative to the previous version of this cell:
#   * every panel now reads from df_result[tp]; the old code referenced
#     df_result_V3 / df_result_V4a / df_result_V4b / df_result_V5, which are
#     never defined and raise NameError on a fresh kernel;
#   * the seaborn style is set before the axes are created, so the first run
#     and a re-run of the cell render the same figure;
#   * the four copy-pasted panels are replaced by a single loop.

sns.set(style="whitegrid")

fig, axs = plt.subplots(2, 2, figsize=(20, 20))
fig.suptitle('Elastic Net Model of Proteomics Data Prediction Results', fontsize=25)

treatment_order = ["GRF6021", "Placebo"]

# axs.flat walks the grid row by row: V3, V4a on top, V4b, V5 below.
for ax, tp in zip(axs.flat, time_points):
    sns.boxplot(ax=ax, x="Treatment", y="mean", data=df_result[tp], order=treatment_order)
    ax.set_title(tp, fontsize=32)
    # Treatments are coded 1 and 2, so mean predictions fall within [1, 2];
    # the extra head-room leaves space for the annotation.
    ax.set_ylim(0.8, 2.4)
    ax.set_xlabel('Treatment', fontsize=20)
    ax.tick_params(axis='both', labelsize=20)
    ax.set_ylabel('Prediction value', fontsize=20)
    # Independent-samples t-test between the two arms, drawn inside the axes.
    add_stat_annotation(ax=ax, data=df_result[tp], x="Treatment", y="mean",
                        order=treatment_order,
                        box_pairs=[("GRF6021", "Placebo")],
                        test='t-test_ind', text_format='simple', loc='inside', verbose=2)

plt.show()