# Step 1
Your Python environment needs at least the following packages installed: joblib, numpy, pandas, and scikit-learn. We generally recommend installing the Anaconda3 distribution, as it includes these commonly used data science packages. **Please ensure that the model file "SVM_model.joblib" is located in the same folder as the code you intend to execute.**

# Step 2 Load package

In [None]:
import joblib
import numpy as np
import pandas as pd
import os

# Step 3 Run the following code

Please run the following code as-is. It defines two functions: "data_processing", which prepares the input data, and "model_prediction", which runs the prediction and saves the results.

In [None]:
def data_processing(filepath,
                    log2=True):
    """Load an expression matrix and z-score the 10 classification genes.

    Parameters
    ----------
    filepath : str or path-like
        CSV file whose first column is used as the row index (gene symbols)
        and whose remaining columns are samples.
    log2 : bool, default True
        When true, ``log2(x + 1)`` is applied before scaling. Pass ``False``
        if the values are already log2-transformed.

    Returns
    -------
    pandas.DataFrame
        One row per sample and one column per classification gene, in the
        fixed order of the gene list below. Each gene is standardised across
        samples (NaN-aware mean and population standard deviation).

    Raises
    ------
    ValueError
        If any classification gene is missing from the row index, or if a
        classification gene appears in more than one row.
    """
    print('Please note that the input data should be for these 10 genes \n'
          '(ARHGAP11A, CHAF1B, DEPDC1B, ECT2, GINS1, GTSE1, LMNB1, MYBL2, RFC4, TTK), \n'
          'RNA expression TPM values. By default, it will undergo log2 processing. \n'
          'If the data you input has already undergone log2 processing,\n'
          ' please select the parameter log2=False')
    # read CSV file
    df = pd.read_csv(filepath, index_col=0)
    Gene = ['ARHGAP11A',
            'CHAF1B',
            'DEPDC1B',
            'ECT2',
            'GINS1',
            'GTSE1',
            'LMNB1',
            'MYBL2',
            'RFC4',
            'TTK']
    available = set(df.index.map(str))

    if not all(gene in available for gene in Gene):
        raise ValueError("Classification genes were not detected in the data.\n"
                         " Please check if the data contains classification genes")

    # Selecting by label also puts the rows into the fixed gene order.
    df = df.loc[Gene]
    if df.index.has_duplicates:
        # A repeated gene symbol would yield more than 10 feature columns.
        raise ValueError("Classification genes appear more than once in the data.\n"
                         " Please make sure each gene symbol occurs in a single row")

    if log2:
        df = np.log2(df + 1)

    # Standardise every gene (row) across samples, ignoring missing values.
    # A gene with identical values in all samples has a standard deviation of
    # zero, so its scaled values are NaN/inf.
    gene_mean = np.nanmean(df, axis=1)
    gene_std = np.nanstd(df, axis=1)
    df = df.sub(gene_mean, axis=0).div(gene_std, axis=0)

    # Transpose so that samples are rows and genes are columns.
    return df.T

def model_prediction(data,outputpath,outpultfilename):
    """Predict a class for every sample and write the results to a CSV file.

    Parameters
    ----------
    data
        Feature table passed to the model as-is; its ``index`` supplies the
        values of the ``Sample`` column.
    outputpath
        Directory in which the result file is written.
    outpultfilename
        Name of the result file inside ``outputpath``.

    The model is loaded from 'SVM_model.joblib', a path relative to the
    current working directory. Nothing is returned; the CSV has the columns
    Sample, Probability_ClassA, Probability_ClassB and Prediction.
    """
    classifier = joblib.load('SVM_model.joblib')

    # Per-sample class probabilities, then the predicted labels.
    probabilities = classifier.predict_proba(data)
    predicted_labels = classifier.predict(data)

    # NOTE(review): columns 0 and 1 of predict_proba are taken to be class A
    # and class B respectively -- confirm against the model's class order.
    columns = {}
    columns['Sample'] = data.index
    columns['Probability_ClassA'] = probabilities[:, 0]
    columns['Probability_ClassB'] = probabilities[:, 1]
    columns['Prediction'] = predicted_labels
    table = pd.DataFrame(columns)

    # Write the table without the positional row index.
    destination = os.path.join(outputpath, outpultfilename)
    table.to_csv(destination, index=False)


# Step 4 Define file input and output paths
Please convert your expression matrix to .csv format, where rows represent gene symbols and columns represent samples. Assign the file path of this .csv file to "input_data_path". Specify the folder where you want to save the result file in "output_path". Define the name of the result file in "output_filename". Please note that the data used for model training were in log2(TPM+1) format. To ensure more accurate results, your data should ideally be TPM values transformed using log2. If your data has not been log2-transformed, please set log2=True in the "data_processing" function so that the log2(TPM+1) transformation is applied; if it has already been log2-transformed, set log2=False.


In [None]:
if __name__ == "__main__":
    # Path of the gene expression matrix (.csv) to classify.
    # NOTE(review): '~test_data.csv' is a placeholder -- replace it with the
    # real path of your file.
    input_data_path = '~test_data.csv'
    # Folder in which the result file is saved
    output_path = '~'
    # Name of the result file
    output_filename = 'Subtype_result.csv'
    # log2=False: no log2 transformation is applied to the input values
    data = data_processing(filepath=input_data_path, log2=False)
    print(data)
    model_prediction(data, outputpath=output_path, outpultfilename=output_filename)