# Modélisation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import xgboost as xgb
from sklearn import dummy
from sklearn.ensemble import RandomForestRegressor

import optuna
import timeit

A la fin de l'analyse exploratoire nous avions créé deux jeux de données différents. Nous allons tester des models de gradient boosting sur ces deux jeux de données afin de comparer les deux approches. Nous testerons un model de forêts aléatoires mais le temps de calcul étant beaucoup plus long que pour le gradient boosting nous ne chercherons pas à l'améliorer.

## Chargement des données

In [2]:
train1 = pd.read_csv("Data/train_version1_for_modelisation.csv")
train2 = pd.read_csv("Data/train_version2_for_modelisation.csv")

In [3]:
train1

Unnamed: 0,seq_id,protein_sequence,pH,tm,groupe,A,C,D,E,F,...,M,N,P,Q,R,S,T,V,W,Y
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7,0,45,1,13,30,13,...,8,5,18,6,25,11,14,37,4,3
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5,1,28,0,10,52,6,...,2,6,8,22,30,14,12,13,3,3
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5,2,50,9,27,32,21,...,6,15,20,25,31,33,30,30,3,16
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2,3,20,5,19,29,12,...,2,9,16,9,10,16,19,14,3,4
4,5,AACFWRRTVIPKPPFRGISTTSARSTVMPAWVIDKYGKNEVLRFTQ...,7.0,48.4,4,33,4,16,19,16,...,11,13,19,8,16,22,25,41,10,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25172,31385,YYMYSGGGSALAAGGGGAGRKGDWNDIDSIKKKDLHHSRGDEKAQG...,7.0,51.8,21788,33,12,38,31,18,...,13,24,25,24,42,33,18,42,13,18
25173,31386,YYNDQHRLSSYSVETAMFLSWERAIVKPGAMFKKAVIGFNCNVDLI...,7.0,37.2,21789,37,5,21,29,22,...,14,19,19,16,25,37,26,34,5,14
25174,31387,YYQRTLGAELLYKISFGEMPKSAQDSAENCPSGMQFPDTAIAHANV...,7.0,64.6,21790,13,1,7,7,7,...,7,5,6,8,3,10,6,7,4,4
25175,31388,YYSFSDNITTVFLSRQAIDDDHSLSLGTISDVVESENGVVAADDAR...,7.0,50.7,21791,47,5,34,36,23,...,26,25,31,12,25,51,32,48,3,18


In [4]:
train2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,pH
0,1,1,1,1,9,1,1,1,10,1,...,0,0,0,0,0,0,0,0,0,7.0
1,1,1,1,3,6,4,13,10,7,12,...,0,0,0,0,0,0,0,0,0,7.0
2,1,1,1,5,16,17,13,15,1,17,...,0,0,0,0,0,0,0,0,0,7.0
3,1,1,1,16,6,10,15,17,1,8,...,0,0,0,0,0,0,0,0,0,7.0
4,1,1,2,5,19,15,15,17,18,8,...,0,0,0,0,0,0,0,0,0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25172,20,20,11,20,16,6,6,6,16,1,...,0,0,0,0,0,0,0,0,0,7.0
25173,20,20,12,3,14,7,15,10,16,16,...,0,0,0,0,0,0,0,0,0,7.0
25174,20,20,14,15,17,10,6,1,4,10,...,0,0,0,0,0,0,0,0,0,7.0
25175,20,20,16,5,16,3,12,8,17,17,...,0,0,0,0,0,0,0,0,0,7.0


In [5]:
X1 = train1.drop(['seq_id', 'protein_sequence', 'tm'], axis=1).values
X1_ss_grp = train1.drop(['seq_id', 'protein_sequence', 'tm', 'groupe'], axis=1).values
X2 = train2.values
y = train1['tm'].values

Nous testerons l'influence de la variable 'groupe' dans la première approche.

# Train1 avec la variable 'groupe'

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=21, train_size=0.7, shuffle=True)

## Preprocessing

In [7]:
std = StandardScaler()

std.fit(X_train)

X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

## Dummy regressor

Dans cette section nous allons tester deux approches naïves qui nous servirons de référence pour évaluer les performances des prochains models.

In [8]:
model = dummy.DummyRegressor(strategy='mean')

In [9]:
model.fit(X_train_std, y_train)

In [10]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

8.855098499714646

In [11]:
model = dummy.DummyRegressor(strategy='median')

In [12]:
model.fit(X_train_std, y_train)

In [13]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

8.466706380725444

On pourrait aussi utiliser le r2_score, mais le choix de la valeur absolue de l'erreur est pertinent puisqu'on prédit une température et que l'on peut comprendre facilement ce qu'est un écart de 8 degrés.

In [14]:
metrics.r2_score(y_test, y_pred)

-0.04997271922533386

Nous allons maintenant tester des models plus développés. 

## Gradient boosting avecXGboost

Nous allons effectuer une optimisation des hyperparamètres à l'aide du module optuna.

In [15]:
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = xgb.XGBRegressor(**param)
    model.fit(X_train_std, y_train)
    y_pred = model.predict(X_test_std)
    return metrics.mean_absolute_error(y_test, y_pred)

In [16]:
study_1 = optuna.create_study(direction='minimize', study_name='regression_1')

[32m[I 2022-10-23 09:26:55,361][0m A new study created in memory with name: regression_1[0m


In [17]:
study_1.optimize(objective, n_trials=100)

[32m[I 2022-10-23 09:26:57,300][0m Trial 0 finished with value: 7.122426505893003 and parameters: {'max_depth': 2, 'learning_rate': 0.8245591464899703, 'n_estimators': 68, 'min_child_weight': 5, 'gamma': 0.7484428220783965, 'subsample': 0.4584800319968957, 'colsample_bytree': 0.6054279037926931, 'reg_alpha': 0.5193865553572158, 'reg_lambda': 0.025125395916782554, 'random_state': 408}. Best is trial 0 with value: 7.122426505893003.[0m
[32m[I 2022-10-23 09:26:58,136][0m Trial 1 finished with value: 9.769644966663934 and parameters: {'max_depth': 7, 'learning_rate': 0.3810976741632804, 'n_estimators': 373, 'min_child_weight': 10, 'gamma': 0.6449870216488266, 'subsample': 0.19755904516936992, 'colsample_bytree': 0.5507996789445156, 'reg_alpha': 0.614349493998441, 'reg_lambda': 0.6983600278054649, 'random_state': 802}. Best is trial 0 with value: 7.122426505893003.[0m
[32m[I 2022-10-23 09:26:59,127][0m Trial 2 finished with value: 6.759346920818382 and parameters: {'max_depth': 7, '

[32m[I 2022-10-23 09:27:24,167][0m Trial 19 finished with value: 6.759063457350646 and parameters: {'max_depth': 9, 'learning_rate': 0.3555694118244557, 'n_estimators': 618, 'min_child_weight': 3, 'gamma': 0.6854336217716138, 'subsample': 0.736944207925353, 'colsample_bytree': 0.4614797678404057, 'reg_alpha': 0.16666592841630323, 'reg_lambda': 0.7548497142485082, 'random_state': 670}. Best is trial 11 with value: 6.1280920453383425.[0m
[32m[I 2022-10-23 09:27:26,806][0m Trial 20 finished with value: 6.970125552463506 and parameters: {'max_depth': 10, 'learning_rate': 0.36639071828218384, 'n_estimators': 491, 'min_child_weight': 4, 'gamma': 0.47908542695604683, 'subsample': 0.5920456913428997, 'colsample_bytree': 0.7534128510137188, 'reg_alpha': 0.908430311776941, 'reg_lambda': 0.1499709593681029, 'random_state': 781}. Best is trial 11 with value: 6.1280920453383425.[0m
[32m[I 2022-10-23 09:27:28,410][0m Trial 21 finished with value: 6.272236476905263 and parameters: {'max_depth

[32m[I 2022-10-23 09:28:00,925][0m Trial 38 finished with value: 6.309901950180704 and parameters: {'max_depth': 5, 'learning_rate': 0.011584518806212238, 'n_estimators': 808, 'min_child_weight': 10, 'gamma': 0.2827119000334587, 'subsample': 0.7740377091994879, 'colsample_bytree': 0.34865140735794, 'reg_alpha': 0.796926923443009, 'reg_lambda': 0.4881731861995804, 'random_state': 55}. Best is trial 31 with value: 5.954017800908572.[0m
[32m[I 2022-10-23 09:28:01,897][0m Trial 39 finished with value: 6.582895632827916 and parameters: {'max_depth': 2, 'learning_rate': 0.11657515174001518, 'n_estimators': 944, 'min_child_weight': 8, 'gamma': 0.5400902719707052, 'subsample': 0.2390757020279486, 'colsample_bytree': 0.424498364111248, 'reg_alpha': 0.4911417538848277, 'reg_lambda': 0.3221982651757528, 'random_state': 336}. Best is trial 31 with value: 5.954017800908572.[0m
[32m[I 2022-10-23 09:28:02,335][0m Trial 40 finished with value: 8.668470271447363 and parameters: {'max_depth': 4,

[32m[I 2022-10-23 09:28:44,914][0m Trial 57 finished with value: 5.773638032181477 and parameters: {'max_depth': 9, 'learning_rate': 0.01419744533255548, 'n_estimators': 694, 'min_child_weight': 5, 'gamma': 0.03427554032259722, 'subsample': 0.6543616341520588, 'colsample_bytree': 0.8329506414614591, 'reg_alpha': 0.9953921769203689, 'reg_lambda': 0.7265740673143424, 'random_state': 537}. Best is trial 56 with value: 5.74102572364595.[0m
[32m[I 2022-10-23 09:28:48,154][0m Trial 58 finished with value: 5.780710543480253 and parameters: {'max_depth': 9, 'learning_rate': 0.01447574683316696, 'n_estimators': 589, 'min_child_weight': 5, 'gamma': 0.13110643097648034, 'subsample': 0.6148488306298897, 'colsample_bytree': 0.9844706463027719, 'reg_alpha': 0.99393737478417, 'reg_lambda': 0.7017121439331812, 'random_state': 550}. Best is trial 56 with value: 5.74102572364595.[0m
[32m[I 2022-10-23 09:28:51,910][0m Trial 59 finished with value: 5.751815480228956 and parameters: {'max_depth': 9

[32m[I 2022-10-23 09:29:52,083][0m Trial 76 finished with value: 6.083506653553158 and parameters: {'max_depth': 10, 'learning_rate': 0.1461523237914532, 'n_estimators': 648, 'min_child_weight': 1, 'gamma': 0.18529074860172923, 'subsample': 0.7085327825627492, 'colsample_bytree': 0.8973391122212583, 'reg_alpha': 0.4269777109314788, 'reg_lambda': 0.7885537276117933, 'random_state': 866}. Best is trial 56 with value: 5.74102572364595.[0m
[32m[I 2022-10-23 09:29:56,221][0m Trial 77 finished with value: 5.782502279061945 and parameters: {'max_depth': 10, 'learning_rate': 0.04124280737614312, 'n_estimators': 612, 'min_child_weight': 1, 'gamma': 0.07946811493856397, 'subsample': 0.6779098450231645, 'colsample_bytree': 0.7849168167037879, 'reg_alpha': 0.3651446044614223, 'reg_lambda': 0.9589766913763924, 'random_state': 790}. Best is trial 56 with value: 5.74102572364595.[0m
[32m[I 2022-10-23 09:29:59,224][0m Trial 78 finished with value: 6.217749764965361 and parameters: {'max_depth'

[32m[I 2022-10-23 09:30:49,955][0m Trial 95 finished with value: 5.841582124966113 and parameters: {'max_depth': 8, 'learning_rate': 0.07362330914740875, 'n_estimators': 570, 'min_child_weight': 3, 'gamma': 0.10725015101787536, 'subsample': 0.7377748743801611, 'colsample_bytree': 0.9697074599699085, 'reg_alpha': 0.05931770876425929, 'reg_lambda': 0.8565532803062519, 'random_state': 624}. Best is trial 83 with value: 5.732239425261247.[0m
[32m[I 2022-10-23 09:30:53,647][0m Trial 96 finished with value: 6.015548252674332 and parameters: {'max_depth': 9, 'learning_rate': 0.10756010460571216, 'n_estimators': 631, 'min_child_weight': 2, 'gamma': 0.20191183712885905, 'subsample': 0.6200929417120962, 'colsample_bytree': 0.944986580801082, 'reg_alpha': 0.09563485532465496, 'reg_lambda': 0.9756807482800465, 'random_state': 659}. Best is trial 83 with value: 5.732239425261247.[0m
[32m[I 2022-10-23 09:30:58,324][0m Trial 97 finished with value: 5.797618514695495 and parameters: {'max_dept

In [18]:
print('Best parameters', study_1.best_params)
print('Best value', study_1.best_value)

Best parameters {'max_depth': 10, 'learning_rate': 0.04094176854479009, 'n_estimators': 477, 'min_child_weight': 3, 'gamma': 0.09492692668978472, 'subsample': 0.6910124590858816, 'colsample_bytree': 0.9511581296150486, 'reg_alpha': 0.04389846902525059, 'reg_lambda': 0.9729096738051212, 'random_state': 811}
Best value 5.732239425261247


## Random Forest

L'entraînement d'un model de forêt aléatoire avec beaucoup d'estimateurs est extrêmement long donc nous n'allons pas faire d'optimisation des hyperparamètres.

In [19]:
start_time = timeit.default_timer()

model = RandomForestRegressor(n_estimators=1000)
model.fit(X_train_std, y_train)

temps = timeit.default_timer() - start_time
temps

121.09727470000007

In [20]:
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

5.857045313247266

On est très proche de notre meilleur model avec XGboost.

# Train1 sans la variable 'groupe'

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X1_ss_grp, y, random_state=21, train_size=0.7, shuffle=True)

In [22]:
std = StandardScaler()
std.fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

In [23]:
study_1_ss_grp = optuna.create_study(direction='minimize', study_name='regression_1_ss_grp')

[32m[I 2022-10-23 09:33:28,897][0m A new study created in memory with name: regression_1_ss_grp[0m


In [24]:
study_1_ss_grp.optimize(objective, n_trials=100)

[32m[I 2022-10-23 09:33:31,111][0m Trial 0 finished with value: 8.935264658807982 and parameters: {'max_depth': 8, 'learning_rate': 0.9126065108188105, 'n_estimators': 879, 'min_child_weight': 5, 'gamma': 0.710416809646336, 'subsample': 0.22310207879203406, 'colsample_bytree': 0.01280318810445436, 'reg_alpha': 0.12347958663075202, 'reg_lambda': 0.5392761930840902, 'random_state': 783}. Best is trial 0 with value: 8.935264658807982.[0m
[32m[I 2022-10-23 09:33:33,226][0m Trial 1 finished with value: 7.60901873454487 and parameters: {'max_depth': 9, 'learning_rate': 0.1301334973733864, 'n_estimators': 840, 'min_child_weight': 10, 'gamma': 0.07012298109033939, 'subsample': 0.11374061841942595, 'colsample_bytree': 0.8355250477673355, 'reg_alpha': 0.2863660243131425, 'reg_lambda': 0.5708022409557322, 'random_state': 335}. Best is trial 1 with value: 7.60901873454487.[0m
[32m[I 2022-10-23 09:33:34,597][0m Trial 2 finished with value: 8.639395181117058 and parameters: {'max_depth': 9, 

[32m[I 2022-10-23 09:33:49,352][0m Trial 19 finished with value: 8.14410580960032 and parameters: {'max_depth': 8, 'learning_rate': 0.5973216387150384, 'n_estimators': 195, 'min_child_weight': 8, 'gamma': 0.6889537600544483, 'subsample': 0.5925948096243883, 'colsample_bytree': 0.34443893662884484, 'reg_alpha': 0.611994865882883, 'reg_lambda': 0.4398687084526234, 'random_state': 591}. Best is trial 12 with value: 6.0543963291672185.[0m
[32m[I 2022-10-23 09:33:49,862][0m Trial 20 finished with value: 7.597313958436432 and parameters: {'max_depth': 4, 'learning_rate': 0.25749289690047783, 'n_estimators': 446, 'min_child_weight': 2, 'gamma': 0.16739740058930275, 'subsample': 0.21379354386734906, 'colsample_bytree': 0.032812014187439786, 'reg_alpha': 0.8185005532621499, 'reg_lambda': 0.277450823348653, 'random_state': 880}. Best is trial 12 with value: 6.0543963291672185.[0m
[32m[I 2022-10-23 09:33:50,042][0m Trial 21 finished with value: 6.15280353100741 and parameters: {'max_depth

[32m[I 2022-10-23 09:34:07,516][0m Trial 38 finished with value: 8.130708239890925 and parameters: {'max_depth': 9, 'learning_rate': 0.8435282989173345, 'n_estimators': 693, 'min_child_weight': 6, 'gamma': 0.7528409781197685, 'subsample': 0.8990163481751563, 'colsample_bytree': 0.8476681203928451, 'reg_alpha': 0.7890026498062321, 'reg_lambda': 0.7647189035432582, 'random_state': 500}. Best is trial 34 with value: 6.005242842153357.[0m
[32m[I 2022-10-23 09:34:10,493][0m Trial 39 finished with value: 6.846569859820571 and parameters: {'max_depth': 10, 'learning_rate': 0.36838917822818334, 'n_estimators': 762, 'min_child_weight': 5, 'gamma': 0.652932350467327, 'subsample': 0.6292860087524789, 'colsample_bytree': 0.9054325013046767, 'reg_alpha': 0.9240499635576829, 'reg_lambda': 0.5567588190357642, 'random_state': 551}. Best is trial 34 with value: 6.005242842153357.[0m
[32m[I 2022-10-23 09:34:12,211][0m Trial 40 finished with value: 6.3840084900072025 and parameters: {'max_depth':

[32m[I 2022-10-23 09:34:21,800][0m Trial 57 finished with value: 6.991006279345193 and parameters: {'max_depth': 2, 'learning_rate': 0.0428438141853594, 'n_estimators': 333, 'min_child_weight': 8, 'gamma': 0.9512160461504482, 'subsample': 0.8772509117312011, 'colsample_bytree': 0.9561404242738226, 'reg_alpha': 0.6155424425676147, 'reg_lambda': 0.8794859949839643, 'random_state': 884}. Best is trial 53 with value: 5.751232240488519.[0m
[32m[I 2022-10-23 09:34:22,900][0m Trial 58 finished with value: 6.064835561884122 and parameters: {'max_depth': 10, 'learning_rate': 0.1764671204747451, 'n_estimators': 271, 'min_child_weight': 8, 'gamma': 0.8138546382373394, 'subsample': 0.8310270633983844, 'colsample_bytree': 0.912589211938551, 'reg_alpha': 0.5528898209945143, 'reg_lambda': 0.952629588133617, 'random_state': 685}. Best is trial 53 with value: 5.751232240488519.[0m
[32m[I 2022-10-23 09:34:23,828][0m Trial 59 finished with value: 6.857462446229433 and parameters: {'max_depth': 10

[32m[I 2022-10-23 09:34:53,433][0m Trial 76 finished with value: 5.78413778501495 and parameters: {'max_depth': 9, 'learning_rate': 0.03617264501084771, 'n_estimators': 469, 'min_child_weight': 7, 'gamma': 0.7813472395707663, 'subsample': 0.9209041039929974, 'colsample_bytree': 0.8554824411913657, 'reg_alpha': 0.040028002692901646, 'reg_lambda': 0.9995959650513419, 'random_state': 937}. Best is trial 62 with value: 5.748711232606274.[0m
[32m[I 2022-10-23 09:34:54,135][0m Trial 77 finished with value: 6.672500317457147 and parameters: {'max_depth': 9, 'learning_rate': 0.011666372817907444, 'n_estimators': 365, 'min_child_weight': 8, 'gamma': 0.8430291892546645, 'subsample': 0.021640836451210343, 'colsample_bytree': 0.8888378926547751, 'reg_alpha': 0.13663944862325253, 'reg_lambda': 0.7724413847463572, 'random_state': 861}. Best is trial 62 with value: 5.748711232606274.[0m
[32m[I 2022-10-23 09:34:56,092][0m Trial 78 finished with value: 7.841797658543061 and parameters: {'max_de

[32m[I 2022-10-23 09:35:25,129][0m Trial 95 finished with value: 6.447948510349507 and parameters: {'max_depth': 3, 'learning_rate': 0.03333803944439857, 'n_estimators': 631, 'min_child_weight': 7, 'gamma': 0.6672433651363001, 'subsample': 0.7702567791883881, 'colsample_bytree': 0.9512146771632592, 'reg_alpha': 0.06309351092801718, 'reg_lambda': 0.8791228249074617, 'random_state': 678}. Best is trial 83 with value: 5.735747420361727.[0m
[32m[I 2022-10-23 09:35:27,341][0m Trial 96 finished with value: 5.821474643633548 and parameters: {'max_depth': 10, 'learning_rate': 0.010768055720848022, 'n_estimators': 465, 'min_child_weight': 8, 'gamma': 0.8781794654482272, 'subsample': 0.930933759510261, 'colsample_bytree': 0.8385473046765659, 'reg_alpha': 0.20743271168129163, 'reg_lambda': 0.7612463622548726, 'random_state': 866}. Best is trial 83 with value: 5.735747420361727.[0m
[32m[I 2022-10-23 09:35:29,068][0m Trial 97 finished with value: 5.902201142032089 and parameters: {'max_dept

In [25]:
print('Best value', study_1_ss_grp.best_value)

Best value 5.735747420361727


C'est à peine moins bien qu'avec la variable 'groupe', mais ce n'est pas significatif.

# Train2

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X2, y, random_state=21, train_size=0.7, shuffle=True)

## Preprocessing

In [27]:
std = StandardScaler()

In [28]:
std.fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

## XGboost

In [29]:
study_2 = optuna.create_study(direction='minimize', study_name='regression_2')

[32m[I 2022-10-23 09:36:02,098][0m A new study created in memory with name: regression_2[0m


In [26]:
study_2.optimize(objective, n_trials=100)

[32m[I 2022-10-18 11:12:33,349][0m Trial 0 finished with value: 7.938099741209807 and parameters: {'max_depth': 9, 'learning_rate': 0.01636243900761896, 'n_estimators': 922, 'min_child_weight': 6, 'gamma': 0.5248812293199041, 'subsample': 0.4155742080746963, 'colsample_bytree': 0.5729519931411882, 'reg_alpha': 0.12676899878911654, 'reg_lambda': 0.23795141761757876, 'random_state': 762}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:12:34,441][0m Trial 1 finished with value: 8.507346477412597 and parameters: {'max_depth': 1, 'learning_rate': 0.48293675113069107, 'n_estimators': 69, 'min_child_weight': 10, 'gamma': 0.6098217946540436, 'subsample': 0.31063049279172306, 'colsample_bytree': 0.38559842301063124, 'reg_alpha': 0.8278418885945431, 'reg_lambda': 0.6384916280327305, 'random_state': 518}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:12:35,687][0m Trial 2 finished with value: 10.906788186620458 and parameters: {'max_depth':

[32m[I 2022-10-18 11:18:53,970][0m Trial 19 finished with value: 33.898718563082205 and parameters: {'max_depth': 7, 'learning_rate': 0.462429101543111, 'n_estimators': 998, 'min_child_weight': 5, 'gamma': 0.4119796018875464, 'subsample': 0.23203321250271192, 'colsample_bytree': 0.866467300631279, 'reg_alpha': 0.6749307744519907, 'reg_lambda': 0.11585900527881238, 'random_state': 194}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:19:05,872][0m Trial 20 finished with value: 11.609653253180658 and parameters: {'max_depth': 2, 'learning_rate': 0.7647111503383774, 'n_estimators': 814, 'min_child_weight': 3, 'gamma': 0.7631907152137491, 'subsample': 0.422036133381956, 'colsample_bytree': 0.5100537541227164, 'reg_alpha': 0.4286563445640358, 'reg_lambda': 0.5741503794132414, 'random_state': 991}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:19:25,650][0m Trial 21 finished with value: 8.160274246504676 and parameters: {'max_depth': 7,

[32m[I 2022-10-18 11:25:47,776][0m Trial 38 finished with value: 8.123805188034337 and parameters: {'max_depth': 8, 'learning_rate': 0.06689202654468124, 'n_estimators': 230, 'min_child_weight': 10, 'gamma': 0.6614783014876277, 'subsample': 0.5810163090903219, 'colsample_bytree': 0.22511135719191697, 'reg_alpha': 0.7291441789351244, 'reg_lambda': 0.4503795289904036, 'random_state': 58}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:25:50,393][0m Trial 39 finished with value: 8.325568732023555 and parameters: {'max_depth': 6, 'learning_rate': 0.16727335126774812, 'n_estimators': 149, 'min_child_weight': 10, 'gamma': 0.732517978730217, 'subsample': 0.7477680560726249, 'colsample_bytree': 0.18863538702881857, 'reg_alpha': 0.9392014979385213, 'reg_lambda': 0.13448749185671027, 'random_state': 140}. Best is trial 0 with value: 7.938099741209807.[0m
[32m[I 2022-10-18 11:25:51,887][0m Trial 40 finished with value: 8.478683214740586 and parameters: {'max_depth'

[32m[I 2022-10-18 11:27:46,400][0m Trial 57 finished with value: 7.878467202028775 and parameters: {'max_depth': 10, 'learning_rate': 0.011494741369394244, 'n_estimators': 360, 'min_child_weight': 7, 'gamma': 0.04082952417735919, 'subsample': 0.30648408812264505, 'colsample_bytree': 0.64442689719172, 'reg_alpha': 0.5808069129761744, 'reg_lambda': 0.6150262433409881, 'random_state': 420}. Best is trial 55 with value: 7.851610139175669.[0m
[32m[I 2022-10-18 11:28:06,728][0m Trial 58 finished with value: 7.920663229195188 and parameters: {'max_depth': 10, 'learning_rate': 0.012697891970720838, 'n_estimators': 373, 'min_child_weight': 7, 'gamma': 0.032998329407936546, 'subsample': 0.2937742860896217, 'colsample_bytree': 0.7952699775294012, 'reg_alpha': 0.5120481661034986, 'reg_lambda': 0.6786421775097196, 'random_state': 291}. Best is trial 55 with value: 7.851610139175669.[0m
[32m[I 2022-10-18 11:28:26,885][0m Trial 59 finished with value: 3.3015683796938244e+16 and parameters: {'

[32m[I 2022-10-18 11:34:01,297][0m Trial 76 finished with value: 10.106240961429839 and parameters: {'max_depth': 10, 'learning_rate': 0.15796192881639579, 'n_estimators': 311, 'min_child_weight': 6, 'gamma': 0.19852147397664568, 'subsample': 0.20835609004789118, 'colsample_bytree': 0.604802947018276, 'reg_alpha': 0.30991942108679854, 'reg_lambda': 0.7476997504786214, 'random_state': 279}. Best is trial 71 with value: 7.845934655914427.[0m
[32m[I 2022-10-18 11:34:14,937][0m Trial 77 finished with value: 8.685231701924367 and parameters: {'max_depth': 9, 'learning_rate': 0.1879100911371729, 'n_estimators': 262, 'min_child_weight': 5, 'gamma': 0.07043133015036848, 'subsample': 0.5988286077140954, 'colsample_bytree': 0.7004156249977397, 'reg_alpha': 0.38537685785789727, 'reg_lambda': 0.5527904369607696, 'random_state': 921}. Best is trial 71 with value: 7.845934655914427.[0m
[32m[I 2022-10-18 11:34:33,687][0m Trial 78 finished with value: 8.961023353312045 and parameters: {'max_de

[32m[I 2022-10-18 11:39:56,734][0m Trial 95 finished with value: 8.190758527584546 and parameters: {'max_depth': 10, 'learning_rate': 0.07839926844608688, 'n_estimators': 238, 'min_child_weight': 8, 'gamma': 0.11153034392838804, 'subsample': 0.49236203301178094, 'colsample_bytree': 0.5636973851256997, 'reg_alpha': 0.2060799518010527, 'reg_lambda': 0.8242924772452809, 'random_state': 407}. Best is trial 92 with value: 7.789057683704832.[0m
[32m[I 2022-10-18 11:40:17,308][0m Trial 96 finished with value: 8.268164255507319 and parameters: {'max_depth': 10, 'learning_rate': 0.10916470755066335, 'n_estimators': 325, 'min_child_weight': 7, 'gamma': 0.0408024892375404, 'subsample': 0.6425317508174647, 'colsample_bytree': 0.7994342670328294, 'reg_alpha': 0.4577718711588507, 'reg_lambda': 0.7984085369736394, 'random_state': 329}. Best is trial 92 with value: 7.789057683704832.[0m
[32m[I 2022-10-18 11:40:34,201][0m Trial 97 finished with value: 7.996607050442463 and parameters: {'max_dep

In [27]:
print('Best parameters', study_2.best_params)
print('Best value', study_2.best_value)

Best parameters {'max_depth': 10, 'learning_rate': 0.011938901196820607, 'n_estimators': 291, 'min_child_weight': 8, 'gamma': 0.1742812850916403, 'subsample': 0.5691026102586522, 'colsample_bytree': 0.8531643967447635, 'reg_alpha': 0.4724367703165903, 'reg_lambda': 0.7725122886280066, 'random_state': 348}
Best value 7.789057683704832


Cette fois-ci, c'est clairement moins bien qu'avant. 

# Notre meilleurs model.

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=21, train_size=0.7, shuffle=True)

In [31]:
std = StandardScaler()
std.fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

In [32]:
b = study_1.best_params

In [33]:
model = xgb.XGBRegressor(
    max_depth=b['max_depth'],
    learning_rate=b['learning_rate'],
    n_estimators=b['n_estimators'],
    min_child_weight=b['min_child_weight'],
    gamma=b['gamma'],
    subsample=b['subsample'],
    colsample_bytree=b['colsample_bytree'],
    reg_alpha=b['reg_alpha'],
    reg_lambda=b['reg_lambda'],
    random_state=b['random_state']
            )


In [34]:
model.fit(X_train_std, y_train)
y_pred = model.predict(X_test_std)
metrics.mean_absolute_error(y_test, y_pred)

5.732239425261247

In [35]:
metrics.r2_score(y_test, y_pred)

0.5950347852559775

# Prédiction

In [37]:
df_test = pd.read_csv("Data/test_1_for_modelisation.csv")
sub = pd.read_csv("Data/sample_submission.csv")

In [38]:
df_test

Unnamed: 0,seq_id,protein_sequence,pH,groupe,A,C,D,E,F,G,...,M,N,P,Q,R,S,T,V,W,Y
0,31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,22,4,15,8,10,19,...,0,19,17,13,3,18,8,13,6,6
1,31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,22,4,15,7,10,19,...,0,19,17,13,3,18,8,13,6,6
2,31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,21793,22,4,15,7,10,19,...,0,19,17,13,3,18,8,13,6,6
3,31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,22,5,15,7,10,19,...,0,19,17,13,3,18,8,13,6,6
4,31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,22,4,15,7,11,19,...,0,19,17,13,3,18,8,13,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408,33798,VPVNPEPDATSVENVILKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,21,4,15,7,10,19,...,0,19,17,13,3,18,8,13,6,6
2409,33799,VPVNPEPDATSVENVLLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,21,4,15,7,10,19,...,0,19,17,13,3,18,8,13,6,6
2410,33800,VPVNPEPDATSVENVNLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,21,4,15,7,10,19,...,0,20,17,13,3,18,8,13,6,6
2411,33801,VPVNPEPDATSVENVPLKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,21793,21,4,15,7,10,19,...,0,19,18,13,3,18,8,13,6,6


In [41]:
T = df_test.drop(['seq_id','protein_sequence'], axis=1).values

In [42]:
T = std.transform(T)

In [43]:
y_pred = model.predict(T)

In [44]:
y_pred

array([55.837357, 55.854015, 55.69234 , ..., 57.027264, 55.704838,
       56.453148], dtype=float32)

In [45]:
sub

Unnamed: 0,seq_id,tm
0,31390,0
1,31391,1
2,31392,2
3,31393,3
4,31394,4
...,...,...
2408,33798,2408
2409,33799,2409
2410,33800,2410
2411,33801,2411


In [46]:
sub['tm'] = y_pred.T

In [47]:
sub

Unnamed: 0,seq_id,tm
0,31390,55.837357
1,31391,55.854015
2,31392,55.692341
3,31393,55.915981
4,31394,55.663597
...,...,...
2408,33798,56.334030
2409,33799,56.501278
2410,33800,57.027264
2411,33801,55.704838


In [49]:
sub.to_csv("Submission/sample_submission.csv", index=False)