<a href="https://colab.research.google.com/github/Mike-Skehan/AntibodyFvTm50Predictor/blob/main/07112022_RF_Jain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install biovec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biovec
  Downloading biovec-0.2.7.tar.gz (5.9 kB)
Collecting gensim==3.4.0
  Downloading gensim-3.4.0.tar.gz (22.2 MB)
[K     |████████████████████████████████| 22.2 MB 88.4 MB/s 
Collecting pyfasta==0.5.2
  Downloading pyfasta-0.5.2.tar.gz (19 kB)
Building wheels for collected packages: biovec, gensim, pyfasta
  Building wheel for biovec (setup.py) ... [?25l[?25hdone
  Created wheel for biovec: filename=biovec-0.2.7-py3-none-any.whl size=3464 sha256=ccd964ff426d6dfb194a30e9d3b7cb1f378cd4939e3edb300922adb807f74ef5
  Stored in directory: /root/.cache/pip/wheels/5d/34/9e/8154c1f2d0999ba86d3e1ad839791a48065c083fcce033b0fc
  Building wheel for gensim (setup.py) ... [?25l[?25hdone
  Created wheel for gensim: filename=gensim-3.4.0-cp37-cp37m-linux_x86_64.whl size=23316706 sha256=73761ac31965ba5c5754ec1b7ce7d7f03ab6148be3458fb30c8fa697eea0fe78
  Stored in directory: /root/.cache/

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import biovec
import numpy as np
from sklearn.manifold import TSNE

In [5]:
def data_extract_Jain(data_file):
    df = pd.read_csv(data_file)
    df.drop([0])
    df.rename(columns={'VL': 'Light'}, inplace=True)
    df.rename(columns={'VH': 'Heavy'}, inplace=True)
    df.rename(columns={"Fab Tm by DSF (°C)": 'Temp'}, inplace=True)
    df.rename(columns={"LC Class": 'lc_class'}, inplace=True)

    light_seq = df['Light'].values.tolist()
    heavy_seq = df['Heavy'].values.tolist()
    temp = df['Temp'].values.tolist()
    lc_class = df['lc_class'].values.tolist()

    light_seq = remove_special_chars(light_seq)
    heavy_seq = remove_special_chars(heavy_seq)

    return light_seq, heavy_seq, temp, lc_class

def remove_special_chars(seq_list):
        chars = ' -?BJOUXZ'
        new_list = []
        for seq in seq_list:
            for char in chars:
                seq = seq.replace(char,'')
            new_list.append(seq)

        return new_list

In [10]:
pv = biovec.models.load_protvec('/content/swissprot-reviewed-protvec.model')

In [21]:
light, heavy, temp, lc_class = data_extract_Jain("/content/Jain_Ab_dataset.csv")

comb = [m+str(n) for m,n in zip(light,heavy)]

vec_list_input = []

for seq in comb:
  vec = sum(pv.to_vecs(seq))
  vec_list_input.append(vec)


protvec_array = np.vstack(vec_list_input)



In [22]:
protvec_50 = TSNE(n_components=50,learning_rate='auto', init = 'random', perplexity = 30,method = 'exact').fit_transform(protvec_array)
protvec_25 = TSNE(n_components=25,learning_rate='auto', init = 'random', perplexity = 30,method = 'exact').fit_transform(protvec_array)
protvec_10 = TSNE(n_components=10,learning_rate='auto', init = 'random', perplexity = 30,method = 'exact').fit_transform(protvec_array)
protvec_5 = TSNE(n_components=5,learning_rate='auto', init = 'random', perplexity = 30,method = 'exact').fit_transform(protvec_array)

protvec_list = [protvec_50, protvec_25, protvec_10, protvec_5]



In [25]:
protvec_50.shape

(137, 50)

In [26]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [27]:
X = protvec_50
Y = temp

regr = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator = regr, 
                               param_distributions = random_grid, n_iter = 100, 
                               cv = 3, verbose=2, random_state=42, n_jobs = -1)

#regr.fit(X, y) will fit with standard parameters, no tuning.

# Fit the random search model
rf_random.fit(X, Y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [28]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X, Y)

random_accuracy



Model Performance
Average Error: 2.7499 degrees.
Accuracy = 96.13%.


96.12938621028582

In [33]:
protvec_50[1].reshape(1,-1)

array([[-8.2572684e-02,  1.3472493e+00, -2.5662796e+00,  3.6033824e-02,
        -7.6064795e-01,  1.1897956e+00, -7.1855474e-01, -1.2654673e-01,
         1.8794019e-03,  1.2601583e+00, -1.0268879e+00,  2.4121989e-01,
         1.1374704e+00,  7.8242183e-02, -1.5872706e-01, -9.3702734e-02,
        -1.7599037e+00, -2.9061165e+00, -4.2036226e-01, -1.9472915e+00,
        -8.5803509e-01, -1.4444261e+00, -2.1139173e-01, -8.8170201e-01,
        -4.3523222e-01, -1.1697849e+00, -1.6267255e-01, -4.4361144e-01,
        -4.8899060e-01,  6.0397232e-01, -1.1974762e-02,  3.3696363e+00,
        -2.5157781e+00, -3.8568184e-01,  3.2800910e-01, -1.4929995e-01,
        -4.8556663e-02, -2.6304170e-01,  4.7838572e-02, -9.1399544e-01,
        -1.0257664e+00, -5.8157825e-01,  4.0187892e-01, -3.9126489e-01,
         1.8891828e-01,  2.6599461e-01, -3.8999507e-01,  9.3776095e-01,
        -1.1975046e-01,  2.0998290e+00]], dtype=float32)

In [34]:
best_random.predict(protvec_50[1].reshape(1,-1))

array([71.19683743])