# Modélisation

In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import xgboost as xgb
from sklearn import dummy
from sklearn.ensemble import RandomForestRegressor

import optuna
import timeit

A la fin de l'analyse exploratoire nous avions créé deux jeux de données différents. Nous allons tester des models de gradient boosting sur ces deux jeux de données afin de comparer les deux approches. Nous testerons un model de forêts aléatoires mais le temps de calcul étant beaucoup plus long que pour le gradient boosting nous ne chercherons pas à l'améliorer.

## Chargement des données

In [81]:
train1 = pd.read_csv("Data/train_version1_for_modelisation.csv")
train2 = pd.read_csv("Data/train_version2_for_modelisation.csv")

In [82]:
train1

Unnamed: 0,seq_id,protein_sequence,pH,tm,protein_length,groupe,A,C,D,E,...,M,N,P,Q,R,S,T,V,W,Y
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,341,0,45,1,13,30,...,8,5,18,6,25,11,14,37,4,3
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,286,1,28,0,10,52,...,2,6,8,22,30,14,12,13,3,3
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,497,2,50,9,27,32,...,6,15,20,25,31,33,30,30,3,16
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,265,3,20,5,19,29,...,2,9,16,9,10,16,19,14,3,4
4,5,AACFWRRTVIPKPPFRGISTTSARSTVMPAWVIDKYGKNEVLRFTQ...,7.0,48.4,380,4,33,4,16,19,...,11,13,19,8,16,22,25,41,10,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25172,31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8,549,21788,33,12,38,31,...,13,24,25,24,42,33,18,42,13,18
25173,31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2,469,21789,37,5,21,29,...,14,19,19,16,25,37,26,34,5,14
25174,31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6,128,21790,13,1,7,7,...,7,5,6,8,3,10,6,7,4,4
25175,31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7,593,21791,47,5,34,36,...,26,25,31,12,25,51,32,48,3,18


In [83]:
train2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,pH
0,1,1,1,1,9,1,1,1,10,1,...,0,0,0,0,0,0,0,0,0,7.0
1,1,1,1,3,6,4,13,10,7,12,...,0,0,0,0,0,0,0,0,0,7.0
2,1,1,1,5,16,17,13,15,1,17,...,0,0,0,0,0,0,0,0,0,7.0
3,1,1,1,16,6,10,15,17,1,8,...,0,0,0,0,0,0,0,0,0,7.0
4,1,1,2,5,19,15,15,17,18,8,...,0,0,0,0,0,0,0,0,0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25172,20,20,11,20,16,6,6,6,16,1,...,0,0,0,0,0,0,0,0,0,7.0
25173,20,20,12,3,14,7,15,10,16,16,...,0,0,0,0,0,0,0,0,0,7.0
25174,20,20,14,15,17,10,6,1,4,10,...,0,0,0,0,0,0,0,0,0,7.0
25175,20,20,16,5,16,3,12,8,17,17,...,0,0,0,0,0,0,0,0,0,7.0


In [84]:
X1 = train1.drop(['seq_id', 'protein_sequence', 'tm', 'protein_length'], axis=1).values
X2 = train2.values
y = train1['tm'].values

Nous testerons l'influence de la variable 'groupe' dans la première approche.

# Train1 avec la variable 'groupe'

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=21, train_size=0.7, shuffle=True)

## Preprocessing

In [86]:
std = StandardScaler()

In [87]:
std.fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

## Dummy regressor

Dans cette section nous allons tester deux approches naïves qui nous servirons de référence pour évaluer les performances des prochains models.

In [15]:
model = dummy.DummyRegressor(strategy='mean')

In [16]:
model.fit(X_train_std, y_train)

In [17]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

8.855098499714646

In [18]:
model = dummy.DummyRegressor(strategy='median')

In [19]:
model.fit(X_train_std, y_train)

In [20]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

8.466706380725444

On pourrait aussi utiliser le r2_score, mais le choix de la valeur absolue de l'erreur est pertinent puisqu'on prédit une température et que l'on peut comprendre facilement ce qu'est un écart de 8 degrés.

In [21]:
metrics.r2_score(y_test, y_pred)

-0.04997271922533386

Nous allons maintenant tester des models plus développés. 

## Gradient boosting avecXGboost

Nous allons effectuer une optimisation des hyperparamètres à l'aide du module optuna.

In [23]:
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = xgb.XGBRegressor(**param)
    model.fit(X_train_std, y_train)
    y_pred = model.predict(X_test_std)
    return metrics.mean_absolute_error(y_test, y_pred)

In [24]:
study_1 = optuna.create_study(direction='minimize', study_name='regression_1')

[32m[I 2022-10-22 14:50:17,529][0m A new study created in memory with name: regression_1[0m


In [25]:
study_1.optimize(objective, n_trials=100)

[32m[I 2022-10-22 14:50:24,196][0m Trial 0 finished with value: 8.05181732298932 and parameters: {'max_depth': 1, 'learning_rate': 0.5661277628179602, 'n_estimators': 617, 'min_child_weight': 8, 'gamma': 0.6827213218781594, 'subsample': 0.0610731683678508, 'colsample_bytree': 0.9030403149254735, 'reg_alpha': 0.5003649944189089, 'reg_lambda': 0.4001964992653163, 'random_state': 261}. Best is trial 0 with value: 8.05181732298932.[0m
[32m[I 2022-10-22 14:50:24,573][0m Trial 1 finished with value: 8.487653034541985 and parameters: {'max_depth': 2, 'learning_rate': 0.7048830212358432, 'n_estimators': 460, 'min_child_weight': 8, 'gamma': 0.15736246612403393, 'subsample': 0.09411488287562692, 'colsample_bytree': 0.09932574744288897, 'reg_alpha': 0.09759547483334483, 'reg_lambda': 0.08481731404774559, 'random_state': 731}. Best is trial 0 with value: 8.05181732298932.[0m
[32m[I 2022-10-22 14:50:26,721][0m Trial 2 finished with value: 7.924160047293277 and parameters: {'max_depth': 6, '

[32m[I 2022-10-22 14:51:06,830][0m Trial 19 finished with value: 8.703893674982266 and parameters: {'max_depth': 6, 'learning_rate': 0.47201125825952095, 'n_estimators': 851, 'min_child_weight': 5, 'gamma': 0.7933461581147757, 'subsample': 0.3886983143106607, 'colsample_bytree': 0.8001760122867223, 'reg_alpha': 0.6258720198765539, 'reg_lambda': 0.12721192466753284, 'random_state': 815}. Best is trial 17 with value: 5.7626888506937055.[0m
[32m[I 2022-10-22 14:51:10,046][0m Trial 20 finished with value: 6.043108645260634 and parameters: {'max_depth': 9, 'learning_rate': 0.162472441031655, 'n_estimators': 701, 'min_child_weight': 3, 'gamma': 0.84722917530906, 'subsample': 0.8977302213765488, 'colsample_bytree': 0.5386883977160246, 'reg_alpha': 0.8316544761679406, 'reg_lambda': 0.8660292392386144, 'random_state': 555}. Best is trial 17 with value: 5.7626888506937055.[0m
[32m[I 2022-10-22 14:51:14,450][0m Trial 21 finished with value: 5.7918317195376225 and parameters: {'max_depth':

[32m[I 2022-10-22 14:52:18,498][0m Trial 38 finished with value: 6.019550740254881 and parameters: {'max_depth': 7, 'learning_rate': 0.12855287552952155, 'n_estimators': 988, 'min_child_weight': 2, 'gamma': 0.7480037579438767, 'subsample': 0.9329646358897822, 'colsample_bytree': 0.9328785631573564, 'reg_alpha': 0.42633817148138686, 'reg_lambda': 0.30950202401132204, 'random_state': 408}. Best is trial 22 with value: 5.7470506496394105.[0m
[32m[I 2022-10-22 14:52:22,028][0m Trial 39 finished with value: 7.544362844479031 and parameters: {'max_depth': 8, 'learning_rate': 0.6292913054268986, 'n_estimators': 765, 'min_child_weight': 5, 'gamma': 0.5426692500692604, 'subsample': 0.8218765921310107, 'colsample_bytree': 0.7318083092621976, 'reg_alpha': 0.9621832011150018, 'reg_lambda': 0.38349232619492607, 'random_state': 606}. Best is trial 22 with value: 5.7470506496394105.[0m
[32m[I 2022-10-22 14:52:24,333][0m Trial 40 finished with value: 6.18752885479633 and parameters: {'max_dept

[32m[I 2022-10-22 14:53:26,276][0m Trial 57 finished with value: 5.768820358591481 and parameters: {'max_depth': 8, 'learning_rate': 0.043613514502769915, 'n_estimators': 818, 'min_child_weight': 4, 'gamma': 0.09197325388027286, 'subsample': 0.9562511161338351, 'colsample_bytree': 0.8870193834374572, 'reg_alpha': 0.596560896833479, 'reg_lambda': 0.9062999801104642, 'random_state': 219}. Best is trial 52 with value: 5.724417304033194.[0m
[32m[I 2022-10-22 14:53:30,635][0m Trial 58 finished with value: 11.59952828148338 and parameters: {'max_depth': 9, 'learning_rate': 0.746406036856221, 'n_estimators': 943, 'min_child_weight': 2, 'gamma': 0.5818513150114009, 'subsample': 0.45368527699316624, 'colsample_bytree': 0.8043145360382847, 'reg_alpha': 0.19848982732088066, 'reg_lambda': 0.7946972129663479, 'random_state': 80}. Best is trial 52 with value: 5.724417304033194.[0m
[32m[I 2022-10-22 14:53:35,768][0m Trial 59 finished with value: 5.995050138817409 and parameters: {'max_depth':

[32m[I 2022-10-22 14:54:54,496][0m Trial 76 finished with value: 5.9631365480390155 and parameters: {'max_depth': 9, 'learning_rate': 0.1481244356825711, 'n_estimators': 911, 'min_child_weight': 5, 'gamma': 0.1683200315740254, 'subsample': 0.964942637090319, 'colsample_bytree': 0.9356136754707088, 'reg_alpha': 0.5815640701627223, 'reg_lambda': 0.8670808691637272, 'random_state': 290}. Best is trial 52 with value: 5.724417304033194.[0m
[32m[I 2022-10-22 14:54:58,367][0m Trial 77 finished with value: 5.792672592841793 and parameters: {'max_depth': 8, 'learning_rate': 0.07076022282741226, 'n_estimators': 721, 'min_child_weight': 4, 'gamma': 0.23170902829885576, 'subsample': 0.9121637127162817, 'colsample_bytree': 0.8953182652264678, 'reg_alpha': 0.701241217241548, 'reg_lambda': 0.823089480672095, 'random_state': 140}. Best is trial 52 with value: 5.724417304033194.[0m
[32m[I 2022-10-22 14:55:01,263][0m Trial 78 finished with value: 5.905322800914794 and parameters: {'max_depth': 6

[32m[I 2022-10-22 14:56:38,635][0m Trial 95 finished with value: 5.736019125583154 and parameters: {'max_depth': 10, 'learning_rate': 0.011358467207813505, 'n_estimators': 931, 'min_child_weight': 7, 'gamma': 0.21313327493625167, 'subsample': 0.7163955346289127, 'colsample_bytree': 0.8556866892935733, 'reg_alpha': 0.5791288557678719, 'reg_lambda': 0.727132725016316, 'random_state': 504}. Best is trial 87 with value: 5.691336594607612.[0m
[32m[I 2022-10-22 14:56:44,604][0m Trial 96 finished with value: 5.862104021092082 and parameters: {'max_depth': 10, 'learning_rate': 0.07054844126523842, 'n_estimators': 973, 'min_child_weight': 7, 'gamma': 0.23414006916878002, 'subsample': 0.7122310170590411, 'colsample_bytree': 0.8413827641674559, 'reg_alpha': 0.5158567866690974, 'reg_lambda': 0.7087688364171442, 'random_state': 505}. Best is trial 87 with value: 5.691336594607612.[0m
[32m[I 2022-10-22 14:56:50,752][0m Trial 97 finished with value: 5.730493496548161 and parameters: {'max_dep

In [26]:
print('Best parameters', study_1.best_params)
print('Best value', study_1.best_value)

Best parameters {'max_depth': 10, 'learning_rate': 0.015409066931963497, 'n_estimators': 904, 'min_child_weight': 5, 'gamma': 0.16312188477330009, 'subsample': 0.8139398493653771, 'colsample_bytree': 0.9101968234237474, 'reg_alpha': 0.549700482542533, 'reg_lambda': 0.93819549261081, 'random_state': 631}
Best value 5.691336594607612


## Random Forest

L'entraînement d'un model de forêt aléatoire avec beaucoup d'estimateurs est extrêmement long donc nous n'allons pas faire d'optimisation des hyperparamètres.

In [27]:
start_time = timeit.default_timer()

model = RandomForestRegressor(n_estimators=1000)
model.fit(X_train_std, y_train)

temps = timeit.default_timer() - start_time
temps

120.63957860000028

In [28]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

5.865982025911021

On est très proche de notre meilleur model avec XGboost.

# Train1 sans la variable 'groupe'

In [42]:
X1_sans_groupe = np.delete(X1, np.s_[1], axis=1)
X1_sans_groupe

array([[ 7., 45.,  1., ..., 37.,  4.,  3.],
       [ 7., 28.,  0., ..., 13.,  3.,  3.],
       [ 7., 50.,  9., ..., 30.,  3., 16.],
       ...,
       [ 7., 13.,  1., ...,  7.,  4.,  4.],
       [ 7., 47.,  5., ..., 48.,  3., 18.],
       [ 7., 34.,  5., ..., 38., 18., 29.]])

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X1_sans_groupe, y, random_state=21, train_size=0.7, shuffle=True)

In [46]:
std = StandardScaler()
std.fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

In [47]:
study_1_ss_grp = optuna.create_study(direction='minimize', study_name='regression_1_ss_grp')

[32m[I 2022-10-22 15:14:57,524][0m A new study created in memory with name: regression_1_ss_grp[0m


In [48]:
study_1_ss_grp.optimize(objective, n_trials=100)

[32m[I 2022-10-22 15:15:15,951][0m Trial 0 finished with value: 10.205928891785158 and parameters: {'max_depth': 6, 'learning_rate': 0.4279611619248781, 'n_estimators': 748, 'min_child_weight': 5, 'gamma': 0.786218932936145, 'subsample': 0.266904233150719, 'colsample_bytree': 0.7739236832681218, 'reg_alpha': 0.8878607374587574, 'reg_lambda': 0.10600750432472929, 'random_state': 269}. Best is trial 0 with value: 10.205928891785158.[0m
[32m[I 2022-10-22 15:15:16,378][0m Trial 1 finished with value: 6.602919100376862 and parameters: {'max_depth': 2, 'learning_rate': 0.16842077584574638, 'n_estimators': 333, 'min_child_weight': 10, 'gamma': 0.9897199556939676, 'subsample': 0.48262854847997627, 'colsample_bytree': 0.9564192314114903, 'reg_alpha': 0.4041638896595316, 'reg_lambda': 0.4445489619824295, 'random_state': 555}. Best is trial 1 with value: 6.602919100376862.[0m
[32m[I 2022-10-22 15:15:16,604][0m Trial 2 finished with value: 9.197832475880766 and parameters: {'max_depth': 5,

[32m[I 2022-10-22 15:15:32,020][0m Trial 19 finished with value: 6.327227741591919 and parameters: {'max_depth': 4, 'learning_rate': 0.26963404114763534, 'n_estimators': 764, 'min_child_weight': 9, 'gamma': 0.4364687244407948, 'subsample': 0.5741802256483555, 'colsample_bytree': 0.29365967973740975, 'reg_alpha': 0.5125652056854367, 'reg_lambda': 0.9975119470540967, 'random_state': 393}. Best is trial 8 with value: 6.017301548489951.[0m
[32m[I 2022-10-22 15:15:32,556][0m Trial 20 finished with value: 6.421912487556229 and parameters: {'max_depth': 7, 'learning_rate': 0.015093750927992994, 'n_estimators': 215, 'min_child_weight': 3, 'gamma': 0.9348018837968444, 'subsample': 0.2942476776933976, 'colsample_bytree': 0.5165448740343411, 'reg_alpha': 0.11888788906478404, 'reg_lambda': 0.5344666251855454, 'random_state': 110}. Best is trial 8 with value: 6.017301548489951.[0m
[32m[I 2022-10-22 15:15:33,000][0m Trial 21 finished with value: 5.983475334100821 and parameters: {'max_depth'

[32m[I 2022-10-22 15:15:49,148][0m Trial 38 finished with value: 9.800466104825086 and parameters: {'max_depth': 6, 'learning_rate': 0.9025328972068472, 'n_estimators': 175, 'min_child_weight': 5, 'gamma': 0.6418028944401086, 'subsample': 0.537157082788484, 'colsample_bytree': 0.2527104283488889, 'reg_alpha': 0.43676886361592737, 'reg_lambda': 0.023056309494599525, 'random_state': 937}. Best is trial 31 with value: 5.945157943909402.[0m
[32m[I 2022-10-22 15:15:49,803][0m Trial 39 finished with value: 6.925753256010258 and parameters: {'max_depth': 4, 'learning_rate': 0.5829029784875048, 'n_estimators': 386, 'min_child_weight': 9, 'gamma': 0.9009777060778429, 'subsample': 0.6402661558591335, 'colsample_bytree': 0.4831467731582743, 'reg_alpha': 0.21180238658381334, 'reg_lambda': 0.6278177595174695, 'random_state': 595}. Best is trial 31 with value: 5.945157943909402.[0m
[32m[I 2022-10-22 15:15:50,165][0m Trial 40 finished with value: 7.435697533423414 and parameters: {'max_depth'

[32m[I 2022-10-22 15:16:16,957][0m Trial 57 finished with value: 6.094918632810319 and parameters: {'max_depth': 9, 'learning_rate': 0.13956896762571774, 'n_estimators': 722, 'min_child_weight': 8, 'gamma': 0.8683580108812057, 'subsample': 0.7325243041113124, 'colsample_bytree': 0.8216785699767141, 'reg_alpha': 0.4923096513378735, 'reg_lambda': 0.9997129100728556, 'random_state': 965}. Best is trial 54 with value: 5.780154951558658.[0m
[32m[I 2022-10-22 15:16:20,902][0m Trial 58 finished with value: 5.82637216887652 and parameters: {'max_depth': 10, 'learning_rate': 0.05415589232959987, 'n_estimators': 887, 'min_child_weight': 7, 'gamma': 0.9033557196041093, 'subsample': 0.578328610215158, 'colsample_bytree': 0.9372263504751193, 'reg_alpha': 0.5527459887016556, 'reg_lambda': 0.9693203290216276, 'random_state': 803}. Best is trial 54 with value: 5.780154951558658.[0m
[32m[I 2022-10-22 15:16:23,223][0m Trial 59 finished with value: 6.105133791603612 and parameters: {'max_depth': 

[32m[I 2022-10-22 15:17:07,976][0m Trial 76 finished with value: 5.8227266625244525 and parameters: {'max_depth': 7, 'learning_rate': 0.035225979512257366, 'n_estimators': 610, 'min_child_weight': 7, 'gamma': 0.8632536621574654, 'subsample': 0.7364099082444766, 'colsample_bytree': 0.9642006316131296, 'reg_alpha': 0.7543750757152966, 'reg_lambda': 0.8544340055019191, 'random_state': 843}. Best is trial 75 with value: 5.7739825599434065.[0m
[32m[I 2022-10-22 15:17:09,764][0m Trial 77 finished with value: 5.835138033835825 and parameters: {'max_depth': 7, 'learning_rate': 0.03392706631614008, 'n_estimators': 607, 'min_child_weight': 8, 'gamma': 0.8389771736873265, 'subsample': 0.6845361469507558, 'colsample_bytree': 0.9984938519783607, 'reg_alpha': 0.7088063271713231, 'reg_lambda': 0.8035305625971119, 'random_state': 848}. Best is trial 75 with value: 5.7739825599434065.[0m
[32m[I 2022-10-22 15:17:11,736][0m Trial 78 finished with value: 7.936210723979971 and parameters: {'max_dep

[32m[I 2022-10-22 15:17:49,243][0m Trial 95 finished with value: 5.761254379236102 and parameters: {'max_depth': 9, 'learning_rate': 0.03072043287414317, 'n_estimators': 629, 'min_child_weight': 5, 'gamma': 0.45218218507130675, 'subsample': 0.6312118385728676, 'colsample_bytree': 0.979648353016922, 'reg_alpha': 0.9497080339944705, 'reg_lambda': 0.8267562273058602, 'random_state': 796}. Best is trial 93 with value: 5.754464013986182.[0m
[32m[I 2022-10-22 15:17:52,131][0m Trial 96 finished with value: 6.021286486796611 and parameters: {'max_depth': 9, 'learning_rate': 0.1061288593184952, 'n_estimators': 700, 'min_child_weight': 5, 'gamma': 0.43608145607617543, 'subsample': 0.6299448052984474, 'colsample_bytree': 0.9999037960429937, 'reg_alpha': 0.31975528469663783, 'reg_lambda': 0.8863095307861649, 'random_state': 714}. Best is trial 93 with value: 5.754464013986182.[0m
[32m[I 2022-10-22 15:17:54,957][0m Trial 97 finished with value: 5.802743239398984 and parameters: {'max_depth'

In [49]:
print('Best value', study_1_ss_grp.best_value)

Best value 5.754464013986182


C'est à peine moins bien qu'avec la variable 'groupe', mais ce n'est pas significatif.

# Train2

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X2, y, random_state=21, train_size=0.7, shuffle=True)

## Preprocessing

In [23]:
std = StandardScaler()

In [24]:
std.fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

## XGboost

In [25]:
study_2 = optuna.create_study(direction='minimize', study_name='regression_2')

[32m[I 2022-10-18 11:11:54,236][0m A new study created in memory with name: regression_2[0m


In [26]:
study_2.optimize(objective, n_trials=100)

[32m[I 2022-10-18 11:12:33,349][0m Trial 0 finished with value: 7.938099741209807 and parameters: {'max_depth': 9, 'learning_rate': 0.01636243900761896, 'n_estimators': 922, 'min_child_weight': 6, 'gamma': 0.5248812293199041, 'subsample': 0.4155742080746963, 'colsample_bytree': 0.5729519931411882, 'reg_alpha': 0.12676899878911654, 'reg_lambda': 0.23795141761757876, 'random_state': 762}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:12:34,441][0m Trial 1 finished with value: 8.507346477412597 and parameters: {'max_depth': 1, 'learning_rate': 0.48293675113069107, 'n_estimators': 69, 'min_child_weight': 10, 'gamma': 0.6098217946540436, 'subsample': 0.31063049279172306, 'colsample_bytree': 0.38559842301063124, 'reg_alpha': 0.8278418885945431, 'reg_lambda': 0.6384916280327305, 'random_state': 518}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:12:35,687][0m Trial 2 finished with value: 10.906788186620458 and parameters: {'max_depth':

[32m[I 2022-10-18 11:18:53,970][0m Trial 19 finished with value: 33.898718563082205 and parameters: {'max_depth': 7, 'learning_rate': 0.462429101543111, 'n_estimators': 998, 'min_child_weight': 5, 'gamma': 0.4119796018875464, 'subsample': 0.23203321250271192, 'colsample_bytree': 0.866467300631279, 'reg_alpha': 0.6749307744519907, 'reg_lambda': 0.11585900527881238, 'random_state': 194}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:19:05,872][0m Trial 20 finished with value: 11.609653253180658 and parameters: {'max_depth': 2, 'learning_rate': 0.7647111503383774, 'n_estimators': 814, 'min_child_weight': 3, 'gamma': 0.7631907152137491, 'subsample': 0.422036133381956, 'colsample_bytree': 0.5100537541227164, 'reg_alpha': 0.4286563445640358, 'reg_lambda': 0.5741503794132414, 'random_state': 991}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:19:25,650][0m Trial 21 finished with value: 8.160274246504676 and parameters: {'max_depth': 7,

[32m[I 2022-10-18 11:25:47,776][0m Trial 38 finished with value: 8.123805188034337 and parameters: {'max_depth': 8, 'learning_rate': 0.06689202654468124, 'n_estimators': 230, 'min_child_weight': 10, 'gamma': 0.6614783014876277, 'subsample': 0.5810163090903219, 'colsample_bytree': 0.22511135719191697, 'reg_alpha': 0.7291441789351244, 'reg_lambda': 0.4503795289904036, 'random_state': 58}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:25:50,393][0m Trial 39 finished with value: 8.325568732023555 and parameters: {'max_depth': 6, 'learning_rate': 0.16727335126774812, 'n_estimators': 149, 'min_child_weight': 10, 'gamma': 0.732517978730217, 'subsample': 0.7477680560726249, 'colsample_bytree': 0.18863538702881857, 'reg_alpha': 0.9392014979385213, 'reg_lambda': 0.13448749185671027, 'random_state': 140}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:25:51,887][0m Trial 40 finished with value: 8.478683214740586 and parameters: {'max_depth'

[32m[I 2022-10-18 11:27:46,400][0m Trial 57 finished with value: 7.878467202028775 and parameters: {'max_depth': 10, 'learning_rate': 0.011494741369394244, 'n_estimators': 360, 'min_child_weight': 7, 'gamma': 0.04082952417735919, 'subsample': 0.30648408812264505, 'colsample_bytree': 0.64442689719172, 'reg_alpha': 0.5808069129761744, 'reg_lambda': 0.6150262433409881, 'random_state': 420}. Best is trial 55 with value: 7.851610139175669.[0m
[32m[I 2022-10-18 11:28:06,728][0m Trial 58 finished with value: 7.920663229195188 and parameters: {'max_depth': 10, 'learning_rate': 0.012697891970720838, 'n_estimators': 373, 'min_child_weight': 7, 'gamma': 0.032998329407936546, 'subsample': 0.2937742860896217, 'colsample_bytree': 0.7952699775294012, 'reg_alpha': 0.5120481661034986, 'reg_lambda': 0.6786421775097196, 'random_state': 291}. Best is trial 55 with value: 7.851610139175669.[0m
[32m[I 2022-10-18 11:28:26,885][0m Trial 59 finished with value: 3.3015683796938244e+16 and parameters: {'

[32m[I 2022-10-18 11:34:01,297][0m Trial 76 finished with value: 10.106240961429839 and parameters: {'max_depth': 10, 'learning_rate': 0.15796192881639579, 'n_estimators': 311, 'min_child_weight': 6, 'gamma': 0.19852147397664568, 'subsample': 0.20835609004789118, 'colsample_bytree': 0.604802947018276, 'reg_alpha': 0.30991942108679854, 'reg_lambda': 0.7476997504786214, 'random_state': 279}. Best is trial 71 with value: 7.845934655914427.[0m
[32m[I 2022-10-18 11:34:14,937][0m Trial 77 finished with value: 8.685231701924367 and parameters: {'max_depth': 9, 'learning_rate': 0.1879100911371729, 'n_estimators': 262, 'min_child_weight': 5, 'gamma': 0.07043133015036848, 'subsample': 0.5988286077140954, 'colsample_bytree': 0.7004156249977397, 'reg_alpha': 0.38537685785789727, 'reg_lambda': 0.5527904369607696, 'random_state': 921}. Best is trial 71 with value: 7.845934655914427.[0m
[32m[I 2022-10-18 11:34:33,687][0m Trial 78 finished with value: 8.961023353312045 and parameters: {'max_de

[32m[I 2022-10-18 11:39:56,734][0m Trial 95 finished with value: 8.190758527584546 and parameters: {'max_depth': 10, 'learning_rate': 0.07839926844608688, 'n_estimators': 238, 'min_child_weight': 8, 'gamma': 0.11153034392838804, 'subsample': 0.49236203301178094, 'colsample_bytree': 0.5636973851256997, 'reg_alpha': 0.2060799518010527, 'reg_lambda': 0.8242924772452809, 'random_state': 407}. Best is trial 92 with value: 7.789057683704832.[0m
[32m[I 2022-10-18 11:40:17,308][0m Trial 96 finished with value: 8.268164255507319 and parameters: {'max_depth': 10, 'learning_rate': 0.10916470755066335, 'n_estimators': 325, 'min_child_weight': 7, 'gamma': 0.0408024892375404, 'subsample': 0.6425317508174647, 'colsample_bytree': 0.7994342670328294, 'reg_alpha': 0.4577718711588507, 'reg_lambda': 0.7984085369736394, 'random_state': 329}. Best is trial 92 with value: 7.789057683704832.[0m
[32m[I 2022-10-18 11:40:34,201][0m Trial 97 finished with value: 7.996607050442463 and parameters: {'max_dep

In [27]:
print('Best parameters', study_2.best_params)
print('Best value', study_2.best_value)

Best parameters {'max_depth': 10, 'learning_rate': 0.011938901196820607, 'n_estimators': 291, 'min_child_weight': 8, 'gamma': 0.1742812850916403, 'subsample': 0.5691026102586522, 'colsample_bytree': 0.8531643967447635, 'reg_alpha': 0.4724367703165903, 'reg_lambda': 0.7725122886280066, 'random_state': 348}
Best value 7.789057683704832


Cette fois-ci, c'est clairement moins bien qu'avant. 

# Notre meilleurs model.

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=21, train_size=0.7, shuffle=True)

In [76]:
std = StandardScaler()
std.fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

In [50]:
b = study_1.best_params

In [91]:
model = xgb.XGBRegressor(
    max_depth=b['max_depth'],
    learning_rate=b['learning_rate'],
    n_estimators=b['n_estimators'],
    min_child_weight=b['min_child_weight'],
    gamma=b['gamma'],
    subsample=b['subsample'],
    colsample_bytree=b['colsample_bytree'],
    reg_alpha=b['reg_alpha'],
    reg_lambda=b['reg_lambda'],
    random_state=b['random_state']
            )


In [92]:
model.fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

5.691336594607612

In [93]:
metrics.r2_score(y_test, y_pred)

0.601134701107697

# Prédiction

In [65]:
df_test = pd.read_csv("Data/test.csv")

In [66]:
df_test

Unnamed: 0,seq_id,protein_sequence,pH,data_source
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes


In [67]:
df_test['groupe'] = train1['groupe'].max() + 1

In [68]:
df_test

Unnamed: 0,seq_id,protein_sequence,pH,data_source,groupe
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,21793
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793
...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793


In [56]:
amino_count = df_test['protein_sequence'].str.split('').explode('protein_sequence').value_counts().drop('')
len(amino_count)

20

In [59]:
amino_list = list(amino_count.index)
amino_list.sort()

In [60]:
def count_amino(prot, am):
    count = len([l for l in prot if l==am])
    return count

In [69]:
for amino in amino_list:
    df_test[amino] = df_test['protein_sequence'].apply(lambda x: count_amino(x, amino))

In [70]:
df_test

Unnamed: 0,seq_id,protein_sequence,pH,data_source,groupe,A,C,D,E,F,...,M,N,P,Q,R,S,T,V,W,Y
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793,22,4,15,8,10,...,0,19,17,13,3,18,8,13,6,6
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793,22,4,15,7,10,...,0,19,17,13,3,18,8,13,6,6
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,21793,22,4,15,7,10,...,0,19,17,13,3,18,8,13,6,6
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793,22,5,15,7,10,...,0,19,17,13,3,18,8,13,6,6
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793,22,4,15,7,11,...,0,19,17,13,3,18,8,13,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793,21,4,15,7,10,...,0,19,17,13,3,18,8,13,6,6
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793,21,4,15,7,10,...,0,19,17,13,3,18,8,13,6,6
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793,21,4,15,7,10,...,0,20,17,13,3,18,8,13,6,6
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,21793,21,4,15,7,10,...,0,19,18,13,3,18,8,13,6,6


In [71]:
df_test.drop(['seq_id', 'protein_sequence', 'data_source'], axis=1, inplace=True)

In [72]:
df_test

Unnamed: 0,pH,groupe,A,C,D,E,F,G,H,I,...,M,N,P,Q,R,S,T,V,W,Y
0,8,21793,22,4,15,8,10,19,0,6,...,0,19,17,13,3,18,8,13,6,6
1,8,21793,22,4,15,7,10,19,0,6,...,0,19,17,13,3,18,8,13,6,6
2,8,21793,22,4,15,7,10,19,0,6,...,0,19,17,13,3,18,8,13,6,6
3,8,21793,22,5,15,7,10,19,0,6,...,0,19,17,13,3,18,8,13,6,6
4,8,21793,22,4,15,7,11,19,0,6,...,0,19,17,13,3,18,8,13,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,8,21793,21,4,15,7,10,19,0,7,...,0,19,17,13,3,18,8,13,6,6
2409,8,21793,21,4,15,7,10,19,0,6,...,0,19,17,13,3,18,8,13,6,6
2410,8,21793,21,4,15,7,10,19,0,6,...,0,20,17,13,3,18,8,13,6,6
2411,8,21793,21,4,15,7,10,19,0,6,...,0,19,18,13,3,18,8,13,6,6


In [73]:
T = df_test.values

In [78]:
T_std = std.transform(T)

In [94]:
y_pred = model.predict(T_std)

In [95]:
y_pred

array([54.49808 , 54.241837, 54.31468 , ..., 54.44366 , 53.82873 ,
       54.469517], dtype=float32)

In [96]:
y_pred.shape

(2413,)