# Modélisation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import xgboost as xgb
from sklearn import dummy
from sklearn.ensemble import RandomForestRegressor

import optuna
import timeit

A la fin de l'analyse exploratoire nous avions créé deux jeux de données différents. Nous allons tester des models de gradient boosting sur ces deux jeux de données afin de comparer les deux approches. Nous testerons un model de forêts aléatoires mais le temps de calcul étant beaucoup plus long que pour le gradient boosting nous ne chercherons pas à l'améliorer.

## Chargement des données

In [2]:
train1 = pd.read_csv("Data/train_version1_for_modelisation.csv")
train2 = pd.read_csv("Data/train_version2_for_modelisation.csv")

In [3]:
train1

Unnamed: 0,seq_id,protein_sequence,pH,tm,groupe,A,C,D,E,F,...,M,N,P,Q,R,S,T,V,W,Y
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,0,45,1,13,30,13,...,8,5,18,6,25,11,14,37,4,3
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,1,28,0,10,52,6,...,2,6,8,22,30,14,12,13,3,3
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,2,50,9,27,32,21,...,6,15,20,25,31,33,30,30,3,16
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,3,20,5,19,29,12,...,2,9,16,9,10,16,19,14,3,4
4,5,AACFWRRTVIPKPPFRGISTTSARSTVMPAWVIDKYGKNEVLRFTQ...,7.0,48.4,4,33,4,16,19,16,...,11,13,19,8,16,22,25,41,10,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25172,31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8,21788,33,12,38,31,18,...,13,24,25,24,42,33,18,42,13,18
25173,31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2,21789,37,5,21,29,22,...,14,19,19,16,25,37,26,34,5,14
25174,31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6,21790,13,1,7,7,7,...,7,5,6,8,3,10,6,7,4,4
25175,31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7,21791,47,5,34,36,23,...,26,25,31,12,25,51,32,48,3,18


In [4]:
train2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,pH
0,1,1,1,1,9,1,1,1,10,1,...,0,0,0,0,0,0,0,0,0,7.0
1,1,1,1,3,6,4,13,10,7,12,...,0,0,0,0,0,0,0,0,0,7.0
2,1,1,1,5,16,17,13,15,1,17,...,0,0,0,0,0,0,0,0,0,7.0
3,1,1,1,16,6,10,15,17,1,8,...,0,0,0,0,0,0,0,0,0,7.0
4,1,1,2,5,19,15,15,17,18,8,...,0,0,0,0,0,0,0,0,0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25172,20,20,11,20,16,6,6,6,16,1,...,0,0,0,0,0,0,0,0,0,7.0
25173,20,20,12,3,14,7,15,10,16,16,...,0,0,0,0,0,0,0,0,0,7.0
25174,20,20,14,15,17,10,6,1,4,10,...,0,0,0,0,0,0,0,0,0,7.0
25175,20,20,16,5,16,3,12,8,17,17,...,0,0,0,0,0,0,0,0,0,7.0


In [5]:
X1 = train1.drop(['seq_id', 'protein_sequence', 'tm'], axis=1).values
X1_ss_grp = train1.drop(['seq_id', 'protein_sequence', 'tm', 'groupe'], axis=1).values
X2 = train2.values
y = train1['tm'].values

Nous testerons l'influence de la variable 'groupe' dans la première approche.

# Train1 avec la variable 'groupe'

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=21, train_size=0.7, shuffle=True)

## Preprocessing

In [7]:
std = StandardScaler()

std.fit(X_train)

X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

## Dummy regressor

Dans cette section nous allons tester deux approches naïves qui nous servirons de référence pour évaluer les performances des prochains models.

In [8]:
model = dummy.DummyRegressor(strategy='mean')

In [9]:
model.fit(X_train_std, y_train)

In [10]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

8.855098499714646

In [11]:
model = dummy.DummyRegressor(strategy='median')

In [12]:
model.fit(X_train_std, y_train)

In [13]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

8.466706380725444

On pourrait aussi utiliser le r2_score, mais le choix de la valeur absolue de l'erreur est pertinent puisqu'on prédit une température et que l'on peut comprendre facilement ce qu'est un écart de 8 degrés.

In [14]:
metrics.r2_score(y_test, y_pred)

-0.04997271922533386

Nous allons maintenant tester des models plus développés. 

## Gradient boosting avecXGboost

Nous allons effectuer une optimisation des hyperparamètres à l'aide du module optuna.

In [15]:
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = xgb.XGBRegressor(**param)
    model.fit(X_train_std, y_train)
    y_pred = model.predict(X_test_std)
    return metrics.mean_absolute_error(y_test, y_pred)

In [16]:
study_1 = optuna.create_study(direction='minimize', study_name='regression_1')

[32m[I 2022-10-23 16:05:56,973][0m A new study created in memory with name: regression_1[0m


In [17]:
study_1.optimize(objective, n_trials=100)

[32m[I 2022-10-23 16:05:58,116][0m Trial 0 finished with value: 120825925819.33704 and parameters: {'max_depth': 6, 'learning_rate': 0.49464586225209733, 'n_estimators': 749, 'min_child_weight': 2, 'gamma': 0.3085991024142235, 'subsample': 0.02781834485529485, 'colsample_bytree': 0.471252137273836, 'reg_alpha': 0.6381855562163125, 'reg_lambda': 0.015714241741763932, 'random_state': 594}. Best is trial 0 with value: 120825925819.33704.[0m
[32m[I 2022-10-23 16:05:59,798][0m Trial 1 finished with value: 103.79404312733277 and parameters: {'max_depth': 6, 'learning_rate': 0.4245319245659182, 'n_estimators': 989, 'min_child_weight': 1, 'gamma': 0.36984176005495367, 'subsample': 0.1103300940671228, 'colsample_bytree': 0.6325261816863769, 'reg_alpha': 0.7384956692804765, 'reg_lambda': 0.4793455584065609, 'random_state': 623}. Best is trial 1 with value: 103.79404312733277.[0m
[32m[I 2022-10-23 16:06:01,135][0m Trial 2 finished with value: 6.982835087360829 and parameters: {'max_depth'

[32m[I 2022-10-23 16:06:20,610][0m Trial 19 finished with value: 6.059157478881948 and parameters: {'max_depth': 8, 'learning_rate': 0.1408002341364427, 'n_estimators': 445, 'min_child_weight': 6, 'gamma': 0.7059908257032123, 'subsample': 0.8717557154301627, 'colsample_bytree': 0.4088573837581739, 'reg_alpha': 0.537099816522898, 'reg_lambda': 0.8472492595455774, 'random_state': 800}. Best is trial 15 with value: 5.945350239293929.[0m
[32m[I 2022-10-23 16:06:22,885][0m Trial 20 finished with value: 6.5955582073576275 and parameters: {'max_depth': 7, 'learning_rate': 0.2965562294557336, 'n_estimators': 657, 'min_child_weight': 8, 'gamma': 0.3592730321518579, 'subsample': 0.610019534419022, 'colsample_bytree': 0.6323055277269723, 'reg_alpha': 0.1835595342711394, 'reg_lambda': 0.7031187691660006, 'random_state': 358}. Best is trial 15 with value: 5.945350239293929.[0m
[32m[I 2022-10-23 16:06:24,732][0m Trial 21 finished with value: 5.956944144766325 and parameters: {'max_depth': 9,

[32m[I 2022-10-23 16:06:55,195][0m Trial 38 finished with value: 6.085078773872228 and parameters: {'max_depth': 5, 'learning_rate': 0.06641080263324249, 'n_estimators': 327, 'min_child_weight': 4, 'gamma': 0.2755391604350415, 'subsample': 0.8042581260996149, 'colsample_bytree': 0.522406053818833, 'reg_alpha': 0.23784558048304258, 'reg_lambda': 0.8407807075102509, 'random_state': 729}. Best is trial 32 with value: 5.865892992532219.[0m
[32m[I 2022-10-23 16:06:58,323][0m Trial 39 finished with value: 6.087312802543771 and parameters: {'max_depth': 8, 'learning_rate': 0.15362828602339135, 'n_estimators': 774, 'min_child_weight': 8, 'gamma': 0.3527772264044766, 'subsample': 0.7417215583218149, 'colsample_bytree': 0.6089131511909394, 'reg_alpha': 0.4679183102091117, 'reg_lambda': 0.1587162809741932, 'random_state': 631}. Best is trial 32 with value: 5.865892992532219.[0m
[32m[I 2022-10-23 16:07:01,615][0m Trial 40 finished with value: 6.517507231977209 and parameters: {'max_depth':

[32m[I 2022-10-23 16:08:05,896][0m Trial 57 finished with value: 14.841477049809855 and parameters: {'max_depth': 9, 'learning_rate': 0.8595150528867395, 'n_estimators': 940, 'min_child_weight': 4, 'gamma': 0.10773704799703737, 'subsample': 0.4723162406199824, 'colsample_bytree': 0.5909082358277541, 'reg_alpha': 0.8743950202245159, 'reg_lambda': 0.5526340362833536, 'random_state': 462}. Best is trial 49 with value: 5.7932856817550995.[0m
[32m[I 2022-10-23 16:08:09,985][0m Trial 58 finished with value: 6.05150776894155 and parameters: {'max_depth': 10, 'learning_rate': 0.13803477829962232, 'n_estimators': 693, 'min_child_weight': 2, 'gamma': 0.04219117929939896, 'subsample': 0.6432783224498969, 'colsample_bytree': 0.6387767573995802, 'reg_alpha': 0.7251701588700408, 'reg_lambda': 0.4212967450690249, 'random_state': 510}. Best is trial 49 with value: 5.7932856817550995.[0m
[32m[I 2022-10-23 16:08:13,584][0m Trial 59 finished with value: 5.873482598973104 and parameters: {'max_dep

[32m[I 2022-10-23 16:09:15,087][0m Trial 76 finished with value: 6.252704213086593 and parameters: {'max_depth': 8, 'learning_rate': 0.15872997860081411, 'n_estimators': 776, 'min_child_weight': 10, 'gamma': 0.01921253853552259, 'subsample': 0.5462980175172484, 'colsample_bytree': 0.5367628289766295, 'reg_alpha': 0.7623906514009725, 'reg_lambda': 0.2614917546538472, 'random_state': 329}. Best is trial 49 with value: 5.7932856817550995.[0m
[32m[I 2022-10-23 16:09:17,733][0m Trial 77 finished with value: 6.166373234840496 and parameters: {'max_depth': 7, 'learning_rate': 0.10516329947625634, 'n_estimators': 881, 'min_child_weight': 2, 'gamma': 0.15542307766839325, 'subsample': 0.49538504376469167, 'colsample_bytree': 0.3527134586609947, 'reg_alpha': 0.633899203557443, 'reg_lambda': 0.417944911184173, 'random_state': 377}. Best is trial 49 with value: 5.7932856817550995.[0m
[32m[I 2022-10-23 16:09:20,250][0m Trial 78 finished with value: 5.9288812780241384 and parameters: {'max_de

[32m[I 2022-10-23 16:10:43,480][0m Trial 95 finished with value: 5.75798208096812 and parameters: {'max_depth': 10, 'learning_rate': 0.02532292123210496, 'n_estimators': 755, 'min_child_weight': 8, 'gamma': 0.14666929546634522, 'subsample': 0.7745406099024555, 'colsample_bytree': 0.7523175814443331, 'reg_alpha': 0.8946809174039531, 'reg_lambda': 0.9850478461990089, 'random_state': 415}. Best is trial 94 with value: 5.75034422411122.[0m
[32m[I 2022-10-23 16:10:47,585][0m Trial 96 finished with value: 5.758362557732116 and parameters: {'max_depth': 9, 'learning_rate': 0.028488194197952198, 'n_estimators': 752, 'min_child_weight': 8, 'gamma': 0.1485313040773671, 'subsample': 0.7874509779154346, 'colsample_bytree': 0.7228312899186731, 'reg_alpha': 0.914324326903792, 'reg_lambda': 0.8714187277612706, 'random_state': 411}. Best is trial 94 with value: 5.75034422411122.[0m
[32m[I 2022-10-23 16:10:51,467][0m Trial 97 finished with value: 5.772558643643048 and parameters: {'max_depth': 

In [18]:
print('Best parameters', study_1.best_params)
print('Best value', study_1.best_value)

Best parameters {'max_depth': 10, 'learning_rate': 0.03633792997692096, 'n_estimators': 760, 'min_child_weight': 8, 'gamma': 0.08093981268804826, 'subsample': 0.8802831792968163, 'colsample_bytree': 0.7329848886033581, 'reg_alpha': 0.7753568737639309, 'reg_lambda': 0.9767886841422649, 'random_state': 501}
Best value 5.75034422411122


## Random Forest

L'entraînement d'un model de forêt aléatoire avec beaucoup d'estimateurs est extrêmement long donc nous n'allons pas faire d'optimisation des hyperparamètres.

In [19]:
start_time = timeit.default_timer()

model = RandomForestRegressor(n_estimators=1000)
model.fit(X_train_std, y_train)

temps = timeit.default_timer() - start_time
temps

125.41653020000001

In [20]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

5.863059024572492

On est très proche de notre meilleur model avec XGboost.

# Train1 sans la variable 'groupe'

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X1_ss_grp, y, random_state=21, train_size=0.7, shuffle=True)

In [22]:
std = StandardScaler()

std.fit(X_train)

X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

In [23]:
study_1_ss_grp = optuna.create_study(direction='minimize', study_name='regression_1_ss_grp')

[32m[I 2022-10-23 16:13:06,333][0m A new study created in memory with name: regression_1_ss_grp[0m


In [24]:
study_1_ss_grp.optimize(objective, n_trials=100)

[32m[I 2022-10-23 16:13:07,214][0m Trial 0 finished with value: 1339.350559442574 and parameters: {'max_depth': 5, 'learning_rate': 0.7833064782560782, 'n_estimators': 594, 'min_child_weight': 6, 'gamma': 0.9696537180303367, 'subsample': 0.1602999804133418, 'colsample_bytree': 0.43868349099567644, 'reg_alpha': 0.2587340361776762, 'reg_lambda': 0.22736977141711145, 'random_state': 933}. Best is trial 0 with value: 1339.350559442574.[0m
[32m[I 2022-10-23 16:13:08,186][0m Trial 1 finished with value: 6.002567751255495 and parameters: {'max_depth': 6, 'learning_rate': 0.170130683092477, 'n_estimators': 444, 'min_child_weight': 3, 'gamma': 0.27232285148470803, 'subsample': 0.9517327229058622, 'colsample_bytree': 0.809377758612812, 'reg_alpha': 0.8736726865468698, 'reg_lambda': 0.6875517363943928, 'random_state': 303}. Best is trial 1 with value: 6.002567751255495.[0m
[32m[I 2022-10-23 16:13:08,835][0m Trial 2 finished with value: 6.897466516078134 and parameters: {'max_depth': 2, 'l

[32m[I 2022-10-23 16:13:41,036][0m Trial 19 finished with value: 6.808928348338253 and parameters: {'max_depth': 9, 'learning_rate': 0.43482135533087696, 'n_estimators': 1000, 'min_child_weight': 9, 'gamma': 0.26791474727760406, 'subsample': 0.7904822485356162, 'colsample_bytree': 0.6422562858028107, 'reg_alpha': 0.33985574806395197, 'reg_lambda': 0.7574577862560833, 'random_state': 41}. Best is trial 12 with value: 5.746154762532681.[0m
[32m[I 2022-10-23 16:13:42,914][0m Trial 20 finished with value: 6.220685990222207 and parameters: {'max_depth': 7, 'learning_rate': 0.26178777012068755, 'n_estimators': 735, 'min_child_weight': 7, 'gamma': 0.09310379022766285, 'subsample': 0.9940177311999638, 'colsample_bytree': 0.8805110654850373, 'reg_alpha': 0.6517877085829071, 'reg_lambda': 0.6611821652182548, 'random_state': 703}. Best is trial 12 with value: 5.746154762532681.[0m
[32m[I 2022-10-23 16:13:45,763][0m Trial 21 finished with value: 5.816236545410238 and parameters: {'max_dept

[32m[I 2022-10-23 16:14:29,917][0m Trial 38 finished with value: 10.26537452063486 and parameters: {'max_depth': 8, 'learning_rate': 0.9963674186395743, 'n_estimators': 342, 'min_child_weight': 7, 'gamma': 0.44580590035094086, 'subsample': 0.7211445199802013, 'colsample_bytree': 0.7268615666277938, 'reg_alpha': 0.8965954791749264, 'reg_lambda': 0.051142473425167756, 'random_state': 537}. Best is trial 12 with value: 5.746154762532681.[0m
[32m[I 2022-10-23 16:14:31,698][0m Trial 39 finished with value: 7.011772903500962 and parameters: {'max_depth': 7, 'learning_rate': 0.19726183543885883, 'n_estimators': 696, 'min_child_weight': 10, 'gamma': 0.1740105128652598, 'subsample': 0.24753567851169278, 'colsample_bytree': 0.9269521074572541, 'reg_alpha': 0.8342165951742553, 'reg_lambda': 0.8733665452704716, 'random_state': 352}. Best is trial 12 with value: 5.746154762532681.[0m
[32m[I 2022-10-23 16:14:32,635][0m Trial 40 finished with value: 7.596806055185875 and parameters: {'max_dep

[32m[I 2022-10-23 16:15:20,559][0m Trial 57 finished with value: 5.846044202213221 and parameters: {'max_depth': 7, 'learning_rate': 0.04132418379277798, 'n_estimators': 577, 'min_child_weight': 10, 'gamma': 0.5218838417253722, 'subsample': 0.8902456070081148, 'colsample_bytree': 0.9010984823294279, 'reg_alpha': 0.5417888681114791, 'reg_lambda': 0.9774631854041426, 'random_state': 609}. Best is trial 12 with value: 5.746154762532681.[0m
[32m[I 2022-10-23 16:15:23,831][0m Trial 58 finished with value: 5.9687898632329315 and parameters: {'max_depth': 9, 'learning_rate': 0.09532798886019316, 'n_estimators': 851, 'min_child_weight': 8, 'gamma': 0.1401188949844498, 'subsample': 0.7222256018003642, 'colsample_bytree': 0.8587848447335401, 'reg_alpha': 0.3870530291362316, 'reg_lambda': 0.7988350686854319, 'random_state': 763}. Best is trial 12 with value: 5.746154762532681.[0m
[32m[I 2022-10-23 16:15:26,562][0m Trial 59 finished with value: 6.650998695975484 and parameters: {'max_depth

[32m[I 2022-10-23 16:16:09,694][0m Trial 76 finished with value: 5.904795293073222 and parameters: {'max_depth': 9, 'learning_rate': 0.10286311034231119, 'n_estimators': 751, 'min_child_weight': 7, 'gamma': 0.4633844535509343, 'subsample': 0.9657358159459188, 'colsample_bytree': 0.9154467599460185, 'reg_alpha': 0.08581915694892367, 'reg_lambda': 0.8714695729437507, 'random_state': 395}. Best is trial 12 with value: 5.746154762532681.[0m
[32m[I 2022-10-23 16:16:12,476][0m Trial 77 finished with value: 6.440235949559979 and parameters: {'max_depth': 8, 'learning_rate': 0.2940439919363236, 'n_estimators': 850, 'min_child_weight': 6, 'gamma': 0.36484384979747936, 'subsample': 0.7689534816334536, 'colsample_bytree': 0.9470697297705903, 'reg_alpha': 0.3194058288847297, 'reg_lambda': 0.8322432554050064, 'random_state': 455}. Best is trial 12 with value: 5.746154762532681.[0m
[32m[I 2022-10-23 16:16:16,263][0m Trial 78 finished with value: 5.866497336931257 and parameters: {'max_depth'

[32m[I 2022-10-23 16:17:18,398][0m Trial 95 finished with value: 5.884808490501689 and parameters: {'max_depth': 10, 'learning_rate': 0.09137212184577374, 'n_estimators': 721, 'min_child_weight': 7, 'gamma': 0.24268922618666725, 'subsample': 0.7307501237256595, 'colsample_bytree': 0.9919921662925063, 'reg_alpha': 0.9793373635420249, 'reg_lambda': 0.9430410202948062, 'random_state': 676}. Best is trial 82 with value: 5.717266273650032.[0m
[32m[I 2022-10-23 16:17:22,455][0m Trial 96 finished with value: 5.731544544016964 and parameters: {'max_depth': 10, 'learning_rate': 0.012732165806259691, 'n_estimators': 822, 'min_child_weight': 6, 'gamma': 0.2691944223459449, 'subsample': 0.7714770545687157, 'colsample_bytree': 0.9997828077627009, 'reg_alpha': 0.9368858600452181, 'reg_lambda': 0.9796752476031467, 'random_state': 602}. Best is trial 82 with value: 5.717266273650032.[0m
[32m[I 2022-10-23 16:17:26,512][0m Trial 97 finished with value: 5.719294868148064 and parameters: {'max_dep

In [25]:
print('Best value', study_1_ss_grp.best_value)

Best value 5.717266273650032


C'est proche de ce que l'on obtient avec la variable 'groupe'. En fait les résultats varient d'une exécution à l'autre. Souvent le meilleur résultat est obtenu avec la variable 'groupe', mais parfois non.

# Train2

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X2, y, random_state=21, train_size=0.7, shuffle=True)

## Preprocessing

In [27]:
std = StandardScaler()

std.fit(X_train)

X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

## XGboost

In [28]:
study_2 = optuna.create_study(direction='minimize', study_name='regression_2')

[32m[I 2022-10-23 16:17:35,342][0m A new study created in memory with name: regression_2[0m


In [29]:
study_2.optimize(objective, n_trials=100)

[32m[I 2022-10-23 16:17:43,093][0m Trial 0 finished with value: 25.323062309269268 and parameters: {'max_depth': 9, 'learning_rate': 0.815857124739983, 'n_estimators': 309, 'min_child_weight': 1, 'gamma': 0.29732166815709177, 'subsample': 0.43902575440263103, 'colsample_bytree': 0.20488601280656096, 'reg_alpha': 0.08233552012430768, 'reg_lambda': 0.4030158313832771, 'random_state': 469}. Best is trial 0 with value: 25.323062309269268.[0m
[32m[I 2022-10-23 16:17:52,445][0m Trial 1 finished with value: 9.340377420794564 and parameters: {'max_depth': 2, 'learning_rate': 0.5113407784607106, 'n_estimators': 681, 'min_child_weight': 10, 'gamma': 0.7641841217955543, 'subsample': 0.5043119821890201, 'colsample_bytree': 0.3172807511253362, 'reg_alpha': 0.5218442376277765, 'reg_lambda': 0.09264065535700654, 'random_state': 207}. Best is trial 1 with value: 9.340377420794564.[0m
[32m[I 2022-10-23 16:17:58,957][0m Trial 2 finished with value: 8.780530649346273 and parameters: {'max_depth':

[32m[I 2022-10-23 16:21:36,592][0m Trial 19 finished with value: 10.016042520635686 and parameters: {'max_depth': 10, 'learning_rate': 0.6480518334049015, 'n_estimators': 724, 'min_child_weight': 7, 'gamma': 0.6374274928363918, 'subsample': 0.9095967649049728, 'colsample_bytree': 0.16340192607723958, 'reg_alpha': 0.792040073392331, 'reg_lambda': 0.9964343443712791, 'random_state': 616}. Best is trial 16 with value: 7.947752176607478.[0m
[32m[I 2022-10-23 16:21:47,558][0m Trial 20 finished with value: 9.262076590166606 and parameters: {'max_depth': 8, 'learning_rate': 0.3728417105745252, 'n_estimators': 421, 'min_child_weight': 9, 'gamma': 0.5195349829317797, 'subsample': 0.7773616303745108, 'colsample_bytree': 0.32605253758866076, 'reg_alpha': 0.9498146092730558, 'reg_lambda': 0.5711103914677929, 'random_state': 385}. Best is trial 16 with value: 7.947752176607478.[0m
[32m[I 2022-10-23 16:22:08,874][0m Trial 21 finished with value: 7.910874288032255 and parameters: {'max_depth'

[32m[I 2022-10-23 16:27:59,630][0m Trial 38 finished with value: 8.458172452541326 and parameters: {'max_depth': 8, 'learning_rate': 0.19182401566497942, 'n_estimators': 724, 'min_child_weight': 5, 'gamma': 0.4771541134375299, 'subsample': 0.7777170938423613, 'colsample_bytree': 0.7987648752391796, 'reg_alpha': 0.422827533643088, 'reg_lambda': 0.5928405040751593, 'random_state': 572}. Best is trial 23 with value: 7.883011602351739.[0m
[32m[I 2022-10-23 16:28:36,198][0m Trial 39 finished with value: 11.089059817112405 and parameters: {'max_depth': 10, 'learning_rate': 0.4762742123414038, 'n_estimators': 502, 'min_child_weight': 9, 'gamma': 0.06135757801079405, 'subsample': 0.5350259055928304, 'colsample_bytree': 0.9213084354835012, 'reg_alpha': 0.5348459461576133, 'reg_lambda': 0.48984715753267927, 'random_state': 754}. Best is trial 23 with value: 7.883011602351739.[0m
[32m[I 2022-10-23 16:29:05,078][0m Trial 40 finished with value: 8.074881997687935 and parameters: {'max_depth

[32m[I 2022-10-23 16:34:04,900][0m Trial 57 finished with value: 7.973971675178946 and parameters: {'max_depth': 8, 'learning_rate': 0.04401306644505023, 'n_estimators': 356, 'min_child_weight': 6, 'gamma': 0.13346229671605425, 'subsample': 0.6770018418665249, 'colsample_bytree': 0.6088262807074901, 'reg_alpha': 0.11978019961305468, 'reg_lambda': 0.6463176650844165, 'random_state': 308}. Best is trial 23 with value: 7.883011602351739.[0m
[32m[I 2022-10-23 16:34:19,275][0m Trial 58 finished with value: 11.978178757976723 and parameters: {'max_depth': 5, 'learning_rate': 0.9796869563333881, 'n_estimators': 655, 'min_child_weight': 5, 'gamma': 0.6471408612585472, 'subsample': 0.8717095284385461, 'colsample_bytree': 0.4750602549686529, 'reg_alpha': 0.5078651071395436, 'reg_lambda': 0.7501440141135645, 'random_state': 512}. Best is trial 23 with value: 7.883011602351739.[0m
[32m[I 2022-10-23 16:34:37,344][0m Trial 59 finished with value: 8.379154623201797 and parameters: {'max_depth

[32m[I 2022-10-23 16:41:12,172][0m Trial 76 finished with value: 9.120046757452382 and parameters: {'max_depth': 7, 'learning_rate': 0.42457868293836654, 'n_estimators': 466, 'min_child_weight': 7, 'gamma': 0.05567817968381133, 'subsample': 0.9063119976727665, 'colsample_bytree': 0.8162383807642523, 'reg_alpha': 0.16112247690192888, 'reg_lambda': 0.5537893074851772, 'random_state': 388}. Best is trial 23 with value: 7.883011602351739.[0m
[32m[I 2022-10-23 16:41:42,988][0m Trial 77 finished with value: 8.081578281110703 and parameters: {'max_depth': 8, 'learning_rate': 0.07059201751432378, 'n_estimators': 525, 'min_child_weight': 7, 'gamma': 0.2965141132121014, 'subsample': 0.5820898149673014, 'colsample_bytree': 0.8961036490982147, 'reg_alpha': 0.09689236104803953, 'reg_lambda': 0.5840631933376146, 'random_state': 425}. Best is trial 23 with value: 7.883011602351739.[0m
[32m[I 2022-10-23 16:42:00,274][0m Trial 78 finished with value: 8.40600836008732 and parameters: {'max_depth

[32m[I 2022-10-23 16:51:00,182][0m Trial 95 finished with value: 8.038034654970298 and parameters: {'max_depth': 7, 'learning_rate': 0.08789764874438713, 'n_estimators': 766, 'min_child_weight': 5, 'gamma': 0.19216090396034904, 'subsample': 0.8817255440725023, 'colsample_bytree': 0.899799973749173, 'reg_alpha': 0.09094852951091877, 'reg_lambda': 0.096845901482072, 'random_state': 597}. Best is trial 23 with value: 7.883011602351739.[0m
[32m[I 2022-10-23 16:51:36,322][0m Trial 96 finished with value: 8.035570158793051 and parameters: {'max_depth': 8, 'learning_rate': 0.06099896107348876, 'n_estimators': 716, 'min_child_weight': 3, 'gamma': 0.27585129707876876, 'subsample': 0.7618117273671409, 'colsample_bytree': 0.8276113704401658, 'reg_alpha': 0.06449184986628678, 'reg_lambda': 0.17565755355538268, 'random_state': 391}. Best is trial 23 with value: 7.883011602351739.[0m
[32m[I 2022-10-23 16:52:15,691][0m Trial 97 finished with value: 7.953642399196812 and parameters: {'max_dept

In [30]:
print('Best value', study_2.best_value)

Best parameters {'max_depth': 10, 'learning_rate': 0.012566912501914329, 'n_estimators': 441, 'min_child_weight': 8, 'gamma': 0.2963879202097287, 'subsample': 0.731398267699396, 'colsample_bytree': 0.7193328008372396, 'reg_alpha': 0.02038070745096826, 'reg_lambda': 0.7337685769701584, 'random_state': 435}
Best value 7.883011602351739


Cette fois-ci, c'est clairement moins bien qu'avant. 

# Choix du model.

On reconstruit le model de l'optimisation 'study_1'.

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=21, train_size=0.7, shuffle=True)

std = StandardScaler()

std.fit(X_train)

X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

In [32]:
b = study_1.best_params

In [33]:
model = xgb.XGBRegressor(
    max_depth=b['max_depth'],
    learning_rate=b['learning_rate'],
    n_estimators=b['n_estimators'],
    min_child_weight=b['min_child_weight'],
    gamma=b['gamma'],
    subsample=b['subsample'],
    colsample_bytree=b['colsample_bytree'],
    reg_alpha=b['reg_alpha'],
    reg_lambda=b['reg_lambda'],
    random_state=b['random_state']
            )


In [34]:
model.fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

5.75034422411122

In [35]:
metrics.r2_score(y_test, y_pred)

0.5949566597882878

# Prédiction

On charge le fichier contenant les données de test avec les features correspondant au model que nous avons choisi, ainsi que le fichier 'sample_submission.csv' ou nous alons enregistrer la prédiction.

In [36]:
df_test = pd.read_csv("Data/test_1_for_modelisation.csv")
sub = pd.read_csv("Data/sample_submission.csv")

In [37]:
df_test

Unnamed: 0,seq_id,protein_sequence,pH,groupe,A,C,D,E,F,G,...,M,N,P,Q,R,S,T,V,W,Y
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,22,4,15,8,10,19,...,0,19,17,13,3,18,8,13,6,6
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,22,4,15,7,10,19,...,0,19,17,13,3,18,8,13,6,6
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,21793,22,4,15,7,10,19,...,0,19,17,13,3,18,8,13,6,6
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,22,5,15,7,10,19,...,0,19,17,13,3,18,8,13,6,6
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,22,4,15,7,11,19,...,0,19,17,13,3,18,8,13,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,21,4,15,7,10,19,...,0,19,17,13,3,18,8,13,6,6
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,21,4,15,7,10,19,...,0,19,17,13,3,18,8,13,6,6
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,21,4,15,7,10,19,...,0,20,17,13,3,18,8,13,6,6
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,21,4,15,7,10,19,...,0,19,18,13,3,18,8,13,6,6


In [38]:
T = df_test.drop(['seq_id','protein_sequence'], axis=1).values

In [39]:
T = std.transform(T)

In [40]:
y_pred = model.predict(T)

In [41]:
y_pred

array([53.56511 , 53.831165, 53.87714 , ..., 54.53854 , 53.741753,
       54.06984 ], dtype=float32)

In [42]:
sub

Unnamed: 0,seq_id,tm
0,31390,0
1,31391,1
2,31392,2
3,31393,3
4,31394,4
...,...,...
2408,33798,2408
2409,33799,2409
2410,33800,2410
2411,33801,2411


In [43]:
sub['tm'] = y_pred.T

In [44]:
sub

Unnamed: 0,seq_id,tm
0,31390,53.565109
1,31391,53.831165
2,31392,53.877140
3,31393,52.638531
4,31394,53.646782
...,...,...
2408,33798,54.115082
2409,33799,54.023659
2410,33800,54.538540
2411,33801,53.741753


In [45]:
sub.to_csv("Submission/sample_submission.csv", index=False)