# This is a quick tutorial that tells you how to build an ACP prediction model using Deep Forest and perform cross-validation.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from deepforest import CascadeForestClassifier #import deep forest package
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,roc_auc_score

## Open your dataset

In [3]:
df_train = pd.read_csv("train_shuffle.csv")
df_test = pd.read_csv("test_shuffle.csv") #open your dataset

In [5]:
train_label = df_train.iloc[:,769].values
test_label = df_test.iloc[:,769].values # get the label

In [6]:
train_label

array([1, 1, 0, ..., 0, 0, 1])

## Get graphic features based on FEGS
### For information on how to obtain FEGS features please refer to https://sourceforge.net/projects/transcriptomeassembly/files/Feature%20Extraction/.

In [7]:
train_FEGS = df_train.iloc[:,770:1348] # get graphic features based on FEGS

In [8]:
train_FEGS

Unnamed: 0,FEGS1,FEGS2,FEGS3,FEGS4,FEGS5,FEGS6,FEGS7,FEGS8,FEGS9,FEGS10,...,FEGS569,FEGS570,FEGS571,FEGS572,FEGS573,FEGS574,FEGS575,FEGS576,FEGS577,FEGS578
0,0.862472,0.858559,0.861212,0.866835,0.859242,0.876654,0.864643,0.855288,0.855970,0.870847,...,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000
1,0.899889,0.897707,0.887703,0.894444,0.893668,0.906833,0.890952,0.890924,0.892923,0.901288,...,0.0,0.000000,0.0,0.024390,0.0,0.02439,0.0,0.0,0.000000,0.000
2,0.892754,0.897437,0.914393,0.902558,0.884704,0.892978,0.887523,0.916524,0.882252,0.886334,...,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000
3,0.862965,0.865415,0.856147,0.872687,0.858080,0.856213,0.858489,0.853601,0.852435,0.882787,...,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000
4,0.889551,0.892271,0.891683,0.889754,0.887587,0.888321,0.896057,0.885174,0.884923,0.891887,...,0.0,0.035714,0.0,0.035714,0.0,0.00000,0.0,0.0,0.000000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,0.908932,0.909511,0.904631,0.902442,0.895008,0.904105,0.908441,0.909818,0.902922,0.888404,...,0.0,0.000000,0.0,0.026316,0.0,0.00000,0.0,0.0,0.000000,0.000
1269,0.907216,0.909844,0.878660,0.885865,0.902149,0.886450,0.891369,0.880731,0.901164,0.887667,...,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000
1270,0.894292,0.892213,0.896257,0.889793,0.896890,0.893404,0.890391,0.899486,0.899784,0.889607,...,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.0,0.021739,0.000
1271,0.890522,0.891893,0.886431,0.890087,0.891972,0.878039,0.892679,0.875260,0.882717,0.898271,...,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000


In [9]:
train_FEGS = df_train.iloc[:,770:1348].values

## Get evolutionary information based on BLOSUM62

### For the code to obtain evolutionary information please refer to: https://github.com/Martinyao1998/GRDF/blob/main/BLOSUM62.ipynb

In [10]:
train_BLOSUM = df_train.iloc[:,1348:2348] ## get evolutionary information based on BLOSUM62

In [11]:
train_BLOSUM

Unnamed: 0,BLOSUM1,BLOSUM2,BLOSUM3,BLOSUM4,BLOSUM5,BLOSUM6,BLOSUM7,BLOSUM8,BLOSUM9,BLOSUM10,...,BLOSUM991,BLOSUM992,BLOSUM993,BLOSUM994,BLOSUM995,BLOSUM996,BLOSUM997,BLOSUM998,BLOSUM999,BLOSUM1000
0,0,-2,0,-1,-3,-2,-2,6,-2,-4,...,0,0,0,0,0,0,0,0,0,0
1,4,-1,-2,-2,0,-1,-1,0,-2,-1,...,0,0,0,0,0,0,0,0,0,0
2,-2,-2,-2,-3,-2,-1,-2,-3,2,-1,...,0,0,0,0,0,0,0,0,0,0
3,0,-2,0,-1,-3,-2,-2,6,-2,-4,...,0,0,0,0,0,0,0,0,0,0
4,0,-2,0,-1,-3,-2,-2,6,-2,-4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,-2,-3,-3,-3,-2,-3,-3,-3,-1,0,...,0,0,0,0,0,0,0,0,0,0
1269,-1,5,0,-2,-3,1,0,-2,0,-3,...,0,0,0,0,0,0,0,0,0,0
1270,0,-2,0,-1,-3,-2,-2,6,-2,-4,...,0,0,0,0,0,0,0,0,0,0
1271,-1,2,0,-1,-3,1,1,-2,-1,-3,...,0,0,0,0,0,0,0,0,0,0


In [12]:
train_BLOSUM = df_train.iloc[:,1348:2348].values

## Get binary profile features

### For the code to obtain evolutionary information please refer to: https://github.com/Martinyao1998/GRDF/blob/main/Binary%20Feature.ipynb

In [13]:
train_Binary = df_train.iloc[:,2348:3348] # get binary profile features

In [14]:
train_Binary

Unnamed: 0,Binary1,Binary2,Binary3,Binary4,Binary5,Binary6,Binary7,Binary8,Binary9,Binary10,...,Binary991,Binary992,Binary993,Binary994,Binary995,Binary996,Binary997,Binary998,Binary999,Binary1000
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1269,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1270,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1271,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
train_Binary = df_train.iloc[:,2348:3348].values

## Construction of a combination of graphical features, evolutionary information and binary profiles

In [16]:
train_FEGS_BLOSUM_Binary = np.concatenate((train_FEGS,train_BLOSUM,train_Binary),axis=1)

## Output feature dimensions

In [18]:
train_FEGS_BLOSUM_Binary.shape

(1273, 2578)

## Cross validation

In [20]:
from sklearn.model_selection import KFold

In [21]:
kf = KFold(n_splits=5, shuffle=True)
param_distribution = {
    "n_trees" : [50,100,300,500,700],
    "n_estimators": [2,3]
}

best_score, best_n_trees, best_n_estimators = 0,0,0
for n_trees in param_distribution["n_trees"]:
    for n_estimators in param_distribution["n_estimators"]:
        print("Begin to search n_trees = {}, n_estimators = {}\n".format(n_trees,n_estimators))
        all_f1_scores = []
        for i,(train_index,val_index) in enumerate(kf.split(train_FEGS_BLOSUM_Binary)):
            xx_train, xx_val = train_FEGS_BLOSUM_Binary[train_index], train_FEGS_BLOSUM_Binary[val_index]
            yy_train, yy_val = train_label[train_index], train_label[val_index]            
            standardScaler = StandardScaler()
            standardScaler.fit(xx_train)
            xx_train_std = standardScaler.transform(xx_train)
            xx_val_std = standardScaler.transform(xx_val)
            
            model = CascadeForestClassifier(random_state=800,n_trees=n_trees, n_estimators=n_estimators) #construct deep forest-based model
            model.fit(xx_train_std,yy_train) # fit your model 
            all_f1_scores.append(f1_score(yy_val, model.predict(xx_val_std))) 
        score = np.mean(all_f1_scores)
        if score > best_score:
            best_score, best_n_trees, best_n_estimators = score, n_trees, n_estimators
            print("Update, best_score = {}, best_n_trees={}, best_n_estimators={}".format(best_score, best_n_trees, best_n_estimators))


Begin to search n_trees = 50, n_estimators = 2

[2022-09-01 19:33:20.209] Start to fit the model:
[2022-09-01 19:33:20.209] Fitting cascade layer = 0 
[2022-09-01 19:33:21.176] layer = 0  | Val Acc = 75.933 % | Elapsed = 0.967 s
[2022-09-01 19:33:21.179] Fitting cascade layer = 1 
[2022-09-01 19:33:21.903] layer = 1  | Val Acc = 75.639 % | Elapsed = 0.724 s
[2022-09-01 19:33:21.903] Early stopping counter: 1 out of 2
[2022-09-01 19:33:21.905] Fitting cascade layer = 2 
[2022-09-01 19:33:22.597] layer = 2  | Val Acc = 75.835 % | Elapsed = 0.692 s
[2022-09-01 19:33:22.597] Early stopping counter: 2 out of 2
[2022-09-01 19:33:22.597] Handling early stopping
[2022-09-01 19:33:22.598] The optimal number of layers: 1
[2022-09-01 19:33:22.598] Start to evalute the model:
[2022-09-01 19:33:22.612] Evaluating cascade layer = 0 
[2022-09-01 19:33:22.866] Start to fit the model:
[2022-09-01 19:33:22.866] Fitting cascade layer = 0 
[2022-09-01 19:33:23.832] layer = 0  | Val Acc = 74.460 % | Elapse

[2022-09-01 19:33:53.508] layer = 7  | Val Acc = 76.228 % | Elapsed = 0.972 s
[2022-09-01 19:33:53.510] Fitting cascade layer = 8 
[2022-09-01 19:33:54.488] layer = 8  | Val Acc = 75.442 % | Elapsed = 0.978 s
[2022-09-01 19:33:54.488] Early stopping counter: 1 out of 2
[2022-09-01 19:33:54.490] Fitting cascade layer = 9 
[2022-09-01 19:33:55.471] layer = 9  | Val Acc = 75.049 % | Elapsed = 0.981 s
[2022-09-01 19:33:55.471] Early stopping counter: 2 out of 2
[2022-09-01 19:33:55.471] Handling early stopping
[2022-09-01 19:33:55.472] The optimal number of layers: 8
[2022-09-01 19:33:55.473] Start to evalute the model:
[2022-09-01 19:33:55.486] Evaluating cascade layer = 0 
[2022-09-01 19:33:55.505] Evaluating cascade layer = 1 
[2022-09-01 19:33:55.525] Evaluating cascade layer = 2 
[2022-09-01 19:33:55.545] Evaluating cascade layer = 3 
[2022-09-01 19:33:55.566] Evaluating cascade layer = 4 
[2022-09-01 19:33:55.586] Evaluating cascade layer = 5 
[2022-09-01 19:33:55.605] Evaluating cas

[2022-09-01 19:34:36.257] Start to fit the model:
[2022-09-01 19:34:36.257] Fitting cascade layer = 0 
[2022-09-01 19:34:38.017] layer = 0  | Val Acc = 76.742 % | Elapsed = 1.760 s
[2022-09-01 19:34:38.025] Fitting cascade layer = 1 
[2022-09-01 19:34:39.311] layer = 1  | Val Acc = 76.546 % | Elapsed = 1.286 s
[2022-09-01 19:34:39.311] Early stopping counter: 1 out of 2
[2022-09-01 19:34:39.317] Fitting cascade layer = 2 
[2022-09-01 19:34:40.572] layer = 2  | Val Acc = 76.840 % | Elapsed = 1.255 s
[2022-09-01 19:34:40.578] Fitting cascade layer = 3 
[2022-09-01 19:34:41.849] layer = 3  | Val Acc = 75.074 % | Elapsed = 1.272 s
[2022-09-01 19:34:41.850] Early stopping counter: 1 out of 2
[2022-09-01 19:34:41.856] Fitting cascade layer = 4 
[2022-09-01 19:34:43.118] layer = 4  | Val Acc = 76.349 % | Elapsed = 1.262 s
[2022-09-01 19:34:43.118] Early stopping counter: 2 out of 2
[2022-09-01 19:34:43.118] Handling early stopping
[2022-09-01 19:34:43.119] The optimal number of layers: 3
[202

[2022-09-01 19:35:58.392] layer = 2  | Val Acc = 78.094 % | Elapsed = 4.129 s
[2022-09-01 19:35:58.399] Fitting cascade layer = 3 
[2022-09-01 19:36:02.791] layer = 3  | Val Acc = 77.505 % | Elapsed = 4.392 s
[2022-09-01 19:36:02.791] Early stopping counter: 1 out of 2
[2022-09-01 19:36:02.798] Fitting cascade layer = 4 
[2022-09-01 19:36:06.941] layer = 4  | Val Acc = 77.996 % | Elapsed = 4.143 s
[2022-09-01 19:36:06.942] Early stopping counter: 2 out of 2
[2022-09-01 19:36:06.942] Handling early stopping
[2022-09-01 19:36:06.946] The optimal number of layers: 3
[2022-09-01 19:36:06.950] Start to evalute the model:
[2022-09-01 19:36:06.964] Evaluating cascade layer = 0 
[2022-09-01 19:36:07.054] Evaluating cascade layer = 1 
[2022-09-01 19:36:07.142] Evaluating cascade layer = 2 
[2022-09-01 19:36:07.483] Start to fit the model:
[2022-09-01 19:36:07.483] Fitting cascade layer = 0 
[2022-09-01 19:36:13.078] layer = 0  | Val Acc = 75.737 % | Elapsed = 5.596 s
[2022-09-01 19:36:13.086] F

[2022-09-01 19:39:07.493] layer = 1  | Val Acc = 76.840 % | Elapsed = 6.090 s
[2022-09-01 19:39:07.502] Fitting cascade layer = 2 
[2022-09-01 19:39:13.609] layer = 2  | Val Acc = 77.036 % | Elapsed = 6.107 s
[2022-09-01 19:39:13.618] Fitting cascade layer = 3 
[2022-09-01 19:39:19.730] layer = 3  | Val Acc = 76.742 % | Elapsed = 6.112 s
[2022-09-01 19:39:19.730] Early stopping counter: 1 out of 2
[2022-09-01 19:39:19.738] Fitting cascade layer = 4 
[2022-09-01 19:39:25.648] layer = 4  | Val Acc = 76.644 % | Elapsed = 5.909 s
[2022-09-01 19:39:25.648] Early stopping counter: 2 out of 2
[2022-09-01 19:39:25.648] Handling early stopping
[2022-09-01 19:39:25.654] The optimal number of layers: 3
[2022-09-01 19:39:25.660] Start to evalute the model:
[2022-09-01 19:39:25.689] Evaluating cascade layer = 0 
[2022-09-01 19:39:25.819] Evaluating cascade layer = 1 
[2022-09-01 19:39:25.923] Evaluating cascade layer = 2 
[2022-09-01 19:39:26.254] Start to fit the model:
[2022-09-01 19:39:26.254] F

[2022-09-01 19:43:06.473] layer = 2  | Val Acc = 76.719 % | Elapsed = 9.223 s
[2022-09-01 19:43:06.474] Early stopping counter: 1 out of 2
[2022-09-01 19:43:06.481] Fitting cascade layer = 3 
[2022-09-01 19:43:15.829] layer = 3  | Val Acc = 77.112 % | Elapsed = 9.348 s
[2022-09-01 19:43:15.829] Early stopping counter: 2 out of 2
[2022-09-01 19:43:15.829] Handling early stopping
[2022-09-01 19:43:15.840] The optimal number of layers: 2
[2022-09-01 19:43:15.858] Start to evalute the model:
[2022-09-01 19:43:15.876] Evaluating cascade layer = 0 
[2022-09-01 19:43:16.051] Evaluating cascade layer = 1 
[2022-09-01 19:43:16.454] Start to fit the model:
[2022-09-01 19:43:16.454] Fitting cascade layer = 0 
[2022-09-01 19:43:29.206] layer = 0  | Val Acc = 74.951 % | Elapsed = 12.752 s
[2022-09-01 19:43:29.215] Fitting cascade layer = 1 
[2022-09-01 19:43:38.465] layer = 1  | Val Acc = 76.424 % | Elapsed = 9.250 s
[2022-09-01 19:43:38.473] Fitting cascade layer = 2 
[2022-09-01 19:43:47.929] lay

[2022-09-01 19:48:32.826] Start to fit the model:
[2022-09-01 19:48:32.827] Fitting cascade layer = 0 
[2022-09-01 19:48:44.952] layer = 0  | Val Acc = 74.681 % | Elapsed = 12.125 s
[2022-09-01 19:48:44.959] Fitting cascade layer = 1 
[2022-09-01 19:48:53.754] layer = 1  | Val Acc = 75.564 % | Elapsed = 8.795 s
[2022-09-01 19:48:53.759] Fitting cascade layer = 2 
[2022-09-01 19:49:02.566] layer = 2  | Val Acc = 75.172 % | Elapsed = 8.807 s
[2022-09-01 19:49:02.566] Early stopping counter: 1 out of 2
[2022-09-01 19:49:02.572] Fitting cascade layer = 3 
[2022-09-01 19:49:11.407] layer = 3  | Val Acc = 75.172 % | Elapsed = 8.836 s
[2022-09-01 19:49:11.408] Early stopping counter: 2 out of 2
[2022-09-01 19:49:11.408] Handling early stopping
[2022-09-01 19:49:11.417] The optimal number of layers: 2
[2022-09-01 19:49:11.427] Start to evalute the model:
[2022-09-01 19:49:11.440] Evaluating cascade layer = 0 
[2022-09-01 19:49:11.611] Evaluating cascade layer = 1 
[2022-09-01 19:49:12.033] Sta

[2022-09-01 19:56:16.650] Evaluating cascade layer = 1 
Update, best_score = 0.7628760863534467, best_n_trees=700, best_n_estimators=3
