# Predictions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn import metrics
from sklearn import dummy
import optuna

In [2]:
train1 = pd.read_csv("Data/train_version1_for_modelisation.csv")
train2 = pd.read_csv("Data/train_version2_for_modelisation.csv")

In [3]:
train1

Unnamed: 0,seq_id,protein_sequence,pH,tm,protein_length,groupe,A,C,D,E,...,M,N,P,Q,R,S,T,V,W,Y
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,341,0,45,1,13,30,...,8,5,18,6,25,11,14,37,4,3
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,286,1,28,0,10,52,...,2,6,8,22,30,14,12,13,3,3
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,497,2,50,9,27,32,...,6,15,20,25,31,33,30,30,3,16
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,265,3,20,5,19,29,...,2,9,16,9,10,16,19,14,3,4
4,5,AACFWRRTVIPKPPFRGISTTSARSTVMPAWVIDKYGKNEVLRFTQ...,7.0,48.4,380,4,33,4,16,19,...,11,13,19,8,16,22,25,41,10,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25172,31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8,549,21788,33,12,38,31,...,13,24,25,24,42,33,18,42,13,18
25173,31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2,469,21789,37,5,21,29,...,14,19,19,16,25,37,26,34,5,14
25174,31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6,128,21790,13,1,7,7,...,7,5,6,8,3,10,6,7,4,4
25175,31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7,593,21791,47,5,34,36,...,26,25,31,12,25,51,32,48,3,18


In [4]:
train2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,pH
0,1,1,1,1,9,1,1,1,10,1,...,0,0,0,0,0,0,0,0,0,7.0
1,1,1,1,3,6,4,13,10,7,12,...,0,0,0,0,0,0,0,0,0,7.0
2,1,1,1,5,16,17,13,15,1,17,...,0,0,0,0,0,0,0,0,0,7.0
3,1,1,1,16,6,10,15,17,1,8,...,0,0,0,0,0,0,0,0,0,7.0
4,1,1,2,5,19,15,15,17,18,8,...,0,0,0,0,0,0,0,0,0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25172,20,20,11,20,16,6,6,6,16,1,...,0,0,0,0,0,0,0,0,0,7.0
25173,20,20,12,3,14,7,15,10,16,16,...,0,0,0,0,0,0,0,0,0,7.0
25174,20,20,14,15,17,10,6,1,4,10,...,0,0,0,0,0,0,0,0,0,7.0
25175,20,20,16,5,16,3,12,8,17,17,...,0,0,0,0,0,0,0,0,0,7.0


In [5]:
X1 = train1.drop(['seq_id', 'protein_sequence', 'tm', 'protein_length'], axis=1).values
X2 = train2.values
y = train1['tm'].values

# Train1

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=21, train_size=0.7, shuffle=True)

## Preprocessing

In [31]:
std = StandardScaler()

In [32]:
std.fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

## Dummy regressor

In [9]:
model = dummy.DummyRegressor(strategy='mean')

In [10]:
model.fit(X_train_std, y_train)

In [11]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

8.855098499714646

In [12]:
model = dummy.DummyRegressor(strategy='median')

In [13]:
model.fit(X_train_std, y_train)

In [14]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

8.466706380725444

## XGboost

In [15]:
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = xgb.XGBRegressor(**param)
    model.fit(X_train_std, y_train)
    y_pred = model.predict(X_test_std)
    return metrics.mean_absolute_error(y_test, y_pred)

In [16]:
study = optuna.create_study(direction='minimize', study_name='regression')

[32m[I 2022-10-18 11:04:29,868][0m A new study created in memory with name: regression[0m


In [17]:
study.optimize(objective, n_trials=100)

[32m[I 2022-10-18 11:04:32,031][0m Trial 0 finished with value: 7.502060834901308 and parameters: {'max_depth': 1, 'learning_rate': 0.4484394831824747, 'n_estimators': 211, 'min_child_weight': 3, 'gamma': 0.7096761145233623, 'subsample': 0.9794064218915138, 'colsample_bytree': 0.5295995414225496, 'reg_alpha': 0.5484597577871899, 'reg_lambda': 0.8594224806981428, 'random_state': 502}. Best is trial 0 with value: 7.502060834901308.[0m
[32m[I 2022-10-18 11:04:34,053][0m Trial 1 finished with value: 7.756699773360725 and parameters: {'max_depth': 8, 'learning_rate': 0.3152747009038597, 'n_estimators': 782, 'min_child_weight': 1, 'gamma': 0.03190979270082277, 'subsample': 0.41150332362286973, 'colsample_bytree': 0.15217264859984606, 'reg_alpha': 0.85181894546101, 'reg_lambda': 0.44618567119642055, 'random_state': 970}. Best is trial 0 with value: 7.502060834901308.[0m
[32m[I 2022-10-18 11:04:34,500][0m Trial 2 finished with value: 7.785766776895158 and parameters: {'max_depth': 10, 

[32m[I 2022-10-18 11:04:56,788][0m Trial 19 finished with value: 6.25336059548093 and parameters: {'max_depth': 3, 'learning_rate': 0.2096887994622868, 'n_estimators': 881, 'min_child_weight': 10, 'gamma': 0.6563403078854129, 'subsample': 0.6149013474303258, 'colsample_bytree': 0.6534640746689604, 'reg_alpha': 0.22124209199855485, 'reg_lambda': 0.9968413406787375, 'random_state': 800}. Best is trial 3 with value: 6.167040492505475.[0m
[32m[I 2022-10-18 11:04:57,823][0m Trial 20 finished with value: 6.097280360720036 and parameters: {'max_depth': 7, 'learning_rate': 0.02347321303200387, 'n_estimators': 514, 'min_child_weight': 4, 'gamma': 0.8500716822367524, 'subsample': 0.08561959434071628, 'colsample_bytree': 0.8466756484600221, 'reg_alpha': 0.9715586828219526, 'reg_lambda': 0.21946780931057078, 'random_state': 955}. Best is trial 20 with value: 6.097280360720036.[0m
[32m[I 2022-10-18 11:04:58,820][0m Trial 21 finished with value: 6.281313554502715 and parameters: {'max_depth'

[32m[I 2022-10-18 11:05:19,749][0m Trial 38 finished with value: 6.310922188136449 and parameters: {'max_depth': 10, 'learning_rate': 0.3000670839029721, 'n_estimators': 191, 'min_child_weight': 1, 'gamma': 0.9367009078337224, 'subsample': 0.9784122590401708, 'colsample_bytree': 0.5127125192763956, 'reg_alpha': 0.7618398549521279, 'reg_lambda': 0.5077438329061438, 'random_state': 178}. Best is trial 36 with value: 5.855549211479351.[0m
[32m[I 2022-10-18 11:05:20,526][0m Trial 39 finished with value: 8.727371221469893 and parameters: {'max_depth': 10, 'learning_rate': 0.9795877838440779, 'n_estimators': 116, 'min_child_weight': 3, 'gamma': 0.7310761101649105, 'subsample': 0.8652722168844323, 'colsample_bytree': 0.6038944537198735, 'reg_alpha': 0.7874389596295335, 'reg_lambda': 0.732131166311675, 'random_state': 25}. Best is trial 36 with value: 5.855549211479351.[0m
[32m[I 2022-10-18 11:05:20,808][0m Trial 40 finished with value: 7.868496953667804 and parameters: {'max_depth': 1

[32m[I 2022-10-18 11:05:34,744][0m Trial 57 finished with value: 5.874165091086608 and parameters: {'max_depth': 8, 'learning_rate': 0.09960316625050604, 'n_estimators': 346, 'min_child_weight': 1, 'gamma': 0.5144909803747577, 'subsample': 0.8351556345047582, 'colsample_bytree': 0.6472206425442142, 'reg_alpha': 0.7981634607660694, 'reg_lambda': 0.6018813149180237, 'random_state': 144}. Best is trial 36 with value: 5.855549211479351.[0m
[32m[I 2022-10-18 11:05:36,133][0m Trial 58 finished with value: 5.952331297795925 and parameters: {'max_depth': 7, 'learning_rate': 0.1020843200368591, 'n_estimators': 360, 'min_child_weight': 3, 'gamma': 0.4845054155623307, 'subsample': 0.832173541229229, 'colsample_bytree': 0.656078573714278, 'reg_alpha': 0.8082349985965146, 'reg_lambda': 0.5862061140530462, 'random_state': 204}. Best is trial 36 with value: 5.855549211479351.[0m
[32m[I 2022-10-18 11:05:36,358][0m Trial 59 finished with value: 24.293771410076776 and parameters: {'max_depth': 8

[32m[I 2022-10-18 11:05:56,996][0m Trial 76 finished with value: 5.9697882057151554 and parameters: {'max_depth': 7, 'learning_rate': 0.11859263343224635, 'n_estimators': 288, 'min_child_weight': 2, 'gamma': 0.05650779030383705, 'subsample': 0.8342260144058542, 'colsample_bytree': 0.48761563416470755, 'reg_alpha': 0.6290887680197885, 'reg_lambda': 0.537902672581155, 'random_state': 520}. Best is trial 62 with value: 5.828553739956267.[0m
[32m[I 2022-10-18 11:05:58,205][0m Trial 77 finished with value: 6.033267761798078 and parameters: {'max_depth': 8, 'learning_rate': 0.014447453604304446, 'n_estimators': 303, 'min_child_weight': 1, 'gamma': 0.0854997841454471, 'subsample': 0.9430310566577887, 'colsample_bytree': 0.45934028812118594, 'reg_alpha': 0.4680783866820778, 'reg_lambda': 0.46642640967881, 'random_state': 474}. Best is trial 62 with value: 5.828553739956267.[0m
[32m[I 2022-10-18 11:05:59,260][0m Trial 78 finished with value: 5.938527543903943 and parameters: {'max_depth

[32m[I 2022-10-18 11:06:26,140][0m Trial 95 finished with value: 5.81920833299802 and parameters: {'max_depth': 9, 'learning_rate': 0.06200780840076763, 'n_estimators': 444, 'min_child_weight': 1, 'gamma': 0.4599454517831607, 'subsample': 0.8136852021168836, 'colsample_bytree': 0.667809404059085, 'reg_alpha': 0.2990643641110577, 'reg_lambda': 0.4702351842000482, 'random_state': 727}. Best is trial 82 with value: 5.80613901225669.[0m
[32m[I 2022-10-18 11:06:28,944][0m Trial 96 finished with value: 5.9667880012081955 and parameters: {'max_depth': 9, 'learning_rate': 0.10977856482418985, 'n_estimators': 514, 'min_child_weight': 2, 'gamma': 0.4698391601887956, 'subsample': 0.812853473376808, 'colsample_bytree': 0.6656884643642964, 'reg_alpha': 0.24846612187147532, 'reg_lambda': 0.4639358648556637, 'random_state': 755}. Best is trial 82 with value: 5.80613901225669.[0m
[32m[I 2022-10-18 11:06:31,200][0m Trial 97 finished with value: 6.2541048240813115 and parameters: {'max_depth': 9

In [18]:
print('Best parameters', study.best_params)
print('Best value', study.best_value)

Best parameters {'max_depth': 8, 'learning_rate': 0.04713713722022711, 'n_estimators': 389, 'min_child_weight': 1, 'gamma': 0.1100406244545197, 'subsample': 0.9346110357674399, 'colsample_bytree': 0.5317297888481654, 'reg_alpha': 0.5338393483166887, 'reg_lambda': 0.5720506850472663, 'random_state': 550}
Best value 5.80613901225669


In [35]:
b = study.best_params

In [40]:
model = xgb.XGBRegressor(
    max_depth=b['max_depth'],
    learning_rate=b['learning_rate'],
    n_estimators=b['n_estimators'],
    min_child_weight=b['min_child_weight'],
    gamma=b['gamma'],
    subsample=b['subsample'],
    colsample_bytree=b['colsample_bytree'],
    reg_alpha=b['reg_alpha'],
    reg_lambda=b['reg_lambda'],
    random_state=b['random_state']
            )


In [41]:
model.fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

5.80613901225669

In [43]:
metrics.r2_score(y_test, y_pred)

0.5906031032021519

## Random Forest

In [19]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
model = RandomForestRegressor(n_estimators=1000)
model.fit(X_train_std, y_train)

In [21]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

5.854546746352565

In [None]:
metrics.r2_score(y_test, y_pred)

# Train2

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X2, y, random_state=21, train_size=0.7, shuffle=True)

## Preprocessing

In [23]:
std = StandardScaler()

In [24]:
std.fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

## XGboost

In [25]:
study_2 = optuna.create_study(direction='minimize', study_name='regression_2')

[32m[I 2022-10-18 11:11:54,236][0m A new study created in memory with name: regression_2[0m


In [26]:
study_2.optimize(objective, n_trials=100)

[32m[I 2022-10-18 11:12:33,349][0m Trial 0 finished with value: 7.938099741209807 and parameters: {'max_depth': 9, 'learning_rate': 0.01636243900761896, 'n_estimators': 922, 'min_child_weight': 6, 'gamma': 0.5248812293199041, 'subsample': 0.4155742080746963, 'colsample_bytree': 0.5729519931411882, 'reg_alpha': 0.12676899878911654, 'reg_lambda': 0.23795141761757876, 'random_state': 762}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:12:34,441][0m Trial 1 finished with value: 8.507346477412597 and parameters: {'max_depth': 1, 'learning_rate': 0.48293675113069107, 'n_estimators': 69, 'min_child_weight': 10, 'gamma': 0.6098217946540436, 'subsample': 0.31063049279172306, 'colsample_bytree': 0.38559842301063124, 'reg_alpha': 0.8278418885945431, 'reg_lambda': 0.6384916280327305, 'random_state': 518}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:12:35,687][0m Trial 2 finished with value: 10.906788186620458 and parameters: {'max_depth':

[32m[I 2022-10-18 11:18:53,970][0m Trial 19 finished with value: 33.898718563082205 and parameters: {'max_depth': 7, 'learning_rate': 0.462429101543111, 'n_estimators': 998, 'min_child_weight': 5, 'gamma': 0.4119796018875464, 'subsample': 0.23203321250271192, 'colsample_bytree': 0.866467300631279, 'reg_alpha': 0.6749307744519907, 'reg_lambda': 0.11585900527881238, 'random_state': 194}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:19:05,872][0m Trial 20 finished with value: 11.609653253180658 and parameters: {'max_depth': 2, 'learning_rate': 0.7647111503383774, 'n_estimators': 814, 'min_child_weight': 3, 'gamma': 0.7631907152137491, 'subsample': 0.422036133381956, 'colsample_bytree': 0.5100537541227164, 'reg_alpha': 0.4286563445640358, 'reg_lambda': 0.5741503794132414, 'random_state': 991}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:19:25,650][0m Trial 21 finished with value: 8.160274246504676 and parameters: {'max_depth': 7,

[32m[I 2022-10-18 11:25:47,776][0m Trial 38 finished with value: 8.123805188034337 and parameters: {'max_depth': 8, 'learning_rate': 0.06689202654468124, 'n_estimators': 230, 'min_child_weight': 10, 'gamma': 0.6614783014876277, 'subsample': 0.5810163090903219, 'colsample_bytree': 0.22511135719191697, 'reg_alpha': 0.7291441789351244, 'reg_lambda': 0.4503795289904036, 'random_state': 58}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:25:50,393][0m Trial 39 finished with value: 8.325568732023555 and parameters: {'max_depth': 6, 'learning_rate': 0.16727335126774812, 'n_estimators': 149, 'min_child_weight': 10, 'gamma': 0.732517978730217, 'subsample': 0.7477680560726249, 'colsample_bytree': 0.18863538702881857, 'reg_alpha': 0.9392014979385213, 'reg_lambda': 0.13448749185671027, 'random_state': 140}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:25:51,887][0m Trial 40 finished with value: 8.478683214740586 and parameters: {'max_depth'

[32m[I 2022-10-18 11:27:46,400][0m Trial 57 finished with value: 7.878467202028775 and parameters: {'max_depth': 10, 'learning_rate': 0.011494741369394244, 'n_estimators': 360, 'min_child_weight': 7, 'gamma': 0.04082952417735919, 'subsample': 0.30648408812264505, 'colsample_bytree': 0.64442689719172, 'reg_alpha': 0.5808069129761744, 'reg_lambda': 0.6150262433409881, 'random_state': 420}. Best is trial 55 with value: 7.851610139175669.[0m
[32m[I 2022-10-18 11:28:06,728][0m Trial 58 finished with value: 7.920663229195188 and parameters: {'max_depth': 10, 'learning_rate': 0.012697891970720838, 'n_estimators': 373, 'min_child_weight': 7, 'gamma': 0.032998329407936546, 'subsample': 0.2937742860896217, 'colsample_bytree': 0.7952699775294012, 'reg_alpha': 0.5120481661034986, 'reg_lambda': 0.6786421775097196, 'random_state': 291}. Best is trial 55 with value: 7.851610139175669.[0m
[32m[I 2022-10-18 11:28:26,885][0m Trial 59 finished with value: 3.3015683796938244e+16 and parameters: {'

[32m[I 2022-10-18 11:34:01,297][0m Trial 76 finished with value: 10.106240961429839 and parameters: {'max_depth': 10, 'learning_rate': 0.15796192881639579, 'n_estimators': 311, 'min_child_weight': 6, 'gamma': 0.19852147397664568, 'subsample': 0.20835609004789118, 'colsample_bytree': 0.604802947018276, 'reg_alpha': 0.30991942108679854, 'reg_lambda': 0.7476997504786214, 'random_state': 279}. Best is trial 71 with value: 7.845934655914427.[0m
[32m[I 2022-10-18 11:34:14,937][0m Trial 77 finished with value: 8.685231701924367 and parameters: {'max_depth': 9, 'learning_rate': 0.1879100911371729, 'n_estimators': 262, 'min_child_weight': 5, 'gamma': 0.07043133015036848, 'subsample': 0.5988286077140954, 'colsample_bytree': 0.7004156249977397, 'reg_alpha': 0.38537685785789727, 'reg_lambda': 0.5527904369607696, 'random_state': 921}. Best is trial 71 with value: 7.845934655914427.[0m
[32m[I 2022-10-18 11:34:33,687][0m Trial 78 finished with value: 8.961023353312045 and parameters: {'max_de

[32m[I 2022-10-18 11:39:56,734][0m Trial 95 finished with value: 8.190758527584546 and parameters: {'max_depth': 10, 'learning_rate': 0.07839926844608688, 'n_estimators': 238, 'min_child_weight': 8, 'gamma': 0.11153034392838804, 'subsample': 0.49236203301178094, 'colsample_bytree': 0.5636973851256997, 'reg_alpha': 0.2060799518010527, 'reg_lambda': 0.8242924772452809, 'random_state': 407}. Best is trial 92 with value: 7.789057683704832.[0m
[32m[I 2022-10-18 11:40:17,308][0m Trial 96 finished with value: 8.268164255507319 and parameters: {'max_depth': 10, 'learning_rate': 0.10916470755066335, 'n_estimators': 325, 'min_child_weight': 7, 'gamma': 0.0408024892375404, 'subsample': 0.6425317508174647, 'colsample_bytree': 0.7994342670328294, 'reg_alpha': 0.4577718711588507, 'reg_lambda': 0.7984085369736394, 'random_state': 329}. Best is trial 92 with value: 7.789057683704832.[0m
[32m[I 2022-10-18 11:40:34,201][0m Trial 97 finished with value: 7.996607050442463 and parameters: {'max_dep

In [27]:
print('Best parameters', study_2.best_params)
print('Best value', study_2.best_value)

Best parameters {'max_depth': 10, 'learning_rate': 0.011938901196820607, 'n_estimators': 291, 'min_child_weight': 8, 'gamma': 0.1742812850916403, 'subsample': 0.5691026102586522, 'colsample_bytree': 0.8531643967447635, 'reg_alpha': 0.4724367703165903, 'reg_lambda': 0.7725122886280066, 'random_state': 348}
Best value 7.789057683704832
