In [1]:
import pandas as pd
import numpy as np
import math
from datetime import datetime as dt

from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt 
import seaborn as sb
from itertools import combinations
import warnings
# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices coming from the libraries imported above.
warnings.filterwarnings('ignore')
# Default size (width, height in inches) applied to all matplotlib figures.
plt.rcParams["figure.figsize"] = (20,7)
import pickle

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as  mae
from sklearn.metrics import mean_squared_error as mse

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

  from pandas import MultiIndex, Int64Index


Travailler avec les données nettoyées

In [37]:
# Load the cleaned dataset from the CSV file sitting next to the notebook.
pacifica = pd.read_csv(filepath_or_buffer='PacificaClean.csv')

In [38]:
# Remove the 'Unnamed: 0' column (presumably an index saved with the CSV -
# confirm). errors='ignore' keeps this cell re-runnable: without it, a second
# execution raises KeyError because the column is already gone.
pacifica = pacifica.drop(columns=['Unnamed: 0'], errors='ignore')

In [40]:
# Keep only the first 30000 rows of the frame.
pacifica = pacifica.head(30000)

In [41]:
# Show the (rows, columns) dimensions of the working frame.
pacifica.shape

(30000, 9)

In [42]:
# Select the target by name rather than by position: the predictors are
# obtained by dropping 'PH', so the target must be that same column even if
# the column order of the CSV changes.
y = pacifica['PH']
X = pacifica.drop(columns=['PH'])

Extraction des données qui ont une grande importance

**Machine learning SVR**

In [43]:
# Restrict the predictors to the five columns listed below.
# `filter(items=...)` keeps the matching columns and silently skips any
# name that is absent from X.
selected_columns = ['TMP', 'OXYGEN', 'TCARBN', 'SILCAT', 'PHSPHT']
Features = X.filter(items=selected_columns)

**Entraînement sur toutes les variables**

In [44]:
# Hold out 20 % of the rows for testing; the fixed seed makes the split
# reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    Features,
    y,
    test_size=0.2,
    random_state=42,
)

**SVR entraîné sur toutes les variables**

In [45]:
# Hyper-parameter grid handed to GridSearchCV for every SVR fitted in this
# notebook (2 x 5 x 4 = 40 candidate settings).
Svrparam = dict(
    C=[1.1, 5.4],
    epsilon=[0.05, 8, 0.2, 2, 7],
    gamma=[0.7001, 0.001, 1, 5],
)

In [46]:
# 5-fold grid search over Svrparam using every available core (n_jobs=-1).
svr = SVR()
clf = GridSearchCV(estimator=svr, param_grid=Svrparam, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)

# Keep the best configuration found and predict the held-out rows with it.
modelsvr = clf.best_estimator_
ypred = modelsvr.predict(X_test)

In [47]:
# Hold-out metrics of the SVR trained on all selected features.
SVR_R2 = r2_score(y_test, ypred)
SVR_MAE = mae(y_test, ypred)
SVR_MSE = mse(y_test, ypred)
# RMSE is derived from the MSE instead of calling mse(..., squared=False):
# that keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
SVR_RMSE = np.sqrt(SVR_MSE)

In [48]:
# Print each hold-out metric of the all-features model, one per line.
for label, metric in (('R2=', r2_score), ('MAE=', mae), ('MSE=', mse)):
    print(label, metric(y_test, ypred))

R2= 0.8850243193184222
MAE= 0.05588187519904231
MSE= 0.006125641469497528


In [49]:
# Persist the fitted all-features SVR so it can be reloaded without retraining.
with open("pacifica_all_features_SVR.pkl", mode="wb") as model_file:
    pickle.dump(modelsvr, model_file)

Création des combinaisons

In [60]:
# Each feature name is wrapped in a one-element list: the training loops
# iterate over the names contained in every group of a combination.
temp = ['TMP']
oxyg = ['OXYGEN']
tCarb = ['TCARBN']
silicate = ['SILCAT']
phosphate = ['PHSPHT']
feature_groups = [temp, oxyg, tCarb, silicate, phosphate]

# `combinations` returns a one-shot iterator. The results are materialised as
# lists so they can be read any number of times; with bare iterators,
# re-running a cell that consumes them silently produced empty results.
# Combinations of four variables
comb4Var = list(combinations(feature_groups, 4))
# Combinations of three variables
comb3Var = list(combinations(feature_groups, 3))
# Combinations of two variables
comb2Var = list(combinations(feature_groups, 2))
# Combinations of a single variable
comb1Var = list(combinations(feature_groups, 1))

In [61]:
# Turn every combination iterable into an indexable list.
comb4v, comb3v, comb2v, comb1v = (
    list(combos) for combos in (comb4Var, comb3Var, comb2Var, comb1Var)
)

**Entrainement sur quatre variables**

In [76]:
# Grid-search one SVR per four-feature combination, score it on the held-out
# split and save the fitted model. The four dictionaries are filled in the
# same order, keyed by the '<COL>_<COL>_..._' name of the combination.
svr_r2_4v = {}
svr_mae_4v = {}
svr_mse_4v = {}
svr_Rmse_4v = {}
for combo in comb4v:
    # Flatten the groups in reverse so the column order - and therefore the
    # dictionary keys and the pickle file names - is the same as when the
    # columns were inserted one by one at position 0.
    cols = [name for group in reversed(combo) for name in reversed(group)]
    xtr = X_train[cols]
    xts = X_test[cols]

    grids = GridSearchCV(SVR(), Svrparam, cv=3, n_jobs=-1)
    grids.fit(xtr, y_train)
    # With the default refit=True, best_estimator_ is already fitted on the
    # whole training split, so it is used directly without a second fit.
    modelsvr = grids.best_estimator_
    ypred = modelsvr.predict(xts)

    scoreR2 = r2_score(y_test, ypred)
    scoreMae = mae(y_test, ypred)
    scoreMse = mse(y_test, ypred)
    # sqrt(MSE) replaces mse(..., squared=False), a keyword deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    scoreRmse = np.sqrt(scoreMse)

    var = ''.join(col + '_' for col in cols)
    svr_r2_4v[var] = scoreR2
    svr_mae_4v[var] = scoreMae
    svr_mse_4v[var] = scoreMse
    svr_Rmse_4v[var] = scoreRmse
    print("R2", scoreR2)
    print("Mae", scoreMae)
    print("Mse", scoreMse)
    print("Rmse", scoreRmse)
    print(var)
    # The context manager closes the file handle even if pickling fails.
    with open('SVR_4v_' + var, "wb") as model_file:
        pickle.dump(modelsvr, model_file)
    print("------")

R2 0.8849172015071058
Mae 0.05586738731081763
Mse 0.006131348461647793
Rmse 0.07830292754200058
SILCAT_TCARBN_OXYGEN_TMP_
------
R2 0.866685493884991
Mae 0.062109471322963815
Mse 0.007102692171967526
Rmse 0.08427747131925309
PHSPHT_TCARBN_OXYGEN_TMP_
------
R2 0.8648855064950408
Mae 0.062286286830081625
Mse 0.007198591385915108
Rmse 0.08484451299827885
PHSPHT_SILCAT_OXYGEN_TMP_
------
R2 0.8591343781390743
Mae 0.06239841484399041
Mse 0.007504998359501788
Rmse 0.08663139361398839
PHSPHT_SILCAT_TCARBN_TMP_
------
R2 0.879864926869293
Mae 0.05721154041204706
Mse 0.006400522106484808
Rmse 0.0800032630989812
PHSPHT_SILCAT_TCARBN_OXYGEN_
------


In [77]:
# Display the R2 score recorded for each four-feature combination.
svr_r2_4v

{'SILCAT_TCARBN_OXYGEN_TMP_': 0.8849172015071058,
 'PHSPHT_TCARBN_OXYGEN_TMP_': 0.866685493884991,
 'PHSPHT_SILCAT_OXYGEN_TMP_': 0.8648855064950408,
 'PHSPHT_SILCAT_TCARBN_TMP_': 0.8591343781390743,
 'PHSPHT_SILCAT_TCARBN_OXYGEN_': 0.879864926869293}

In [78]:
# Turn the raw keys ('A_B_C_D_') into readable labels ('A & B & C & D').
# The dictionary is rebuilt in its original insertion order, which the
# positional best-score lookup relies on. Keys that are already formatted
# come out unchanged, so the cell can be re-run safely. Deriving the label
# from the key also removes the hand-typed, garbled first label.
svr_r2_4v = {
    ' & '.join(part for part in key.split('_') if part): score
    for key, score in svr_r2_4v.items()
}

# NOTE(review): the Rf_* dictionaries are not used in this part of the
# notebook; presumably they collect the metrics of another model - confirm.
# The last assignment was left without a value, which is a syntax error.
Rf_r2_4v = {}
Rf_mae_4v = {}
Rf_mse_4v = {}
Rf_Rmse_4v = {}

In [79]:
# Read the metric dictionaries as parallel lists. This assumes all four share
# the same insertion order (they are filled together in the training loop).
Variables = [*svr_r2_4v]
ScoresR2 = [*svr_r2_4v.values()]
ScoreMae = [*svr_mae_4v.values()]
ScoreMse = [*svr_mse_4v.values()]
ScoreRmse = [*svr_Rmse_4v.values()]
# Position of the combination with the highest R2 (first one on ties).
tmp = max(ScoresR2)
index = ScoresR2.index(tmp)

In [80]:
# Report the best four-feature combination followed by its metrics.
print("La meilleur combinaison pour quatre variables est:", Variables[index])
for label, scores in (
    ("R2", ScoresR2),
    ("Mae", ScoreMae),
    ("Mse", ScoreMse),
    ("Rmse", ScoreRmse),
):
    print(label, scores[index])

La meilleur combinaison pour quatre variables est: SILCAT & CARBN & YGEN & P_
R2 0.8849172015071058
Mae 0.05586738731081763
Mse 0.006131348461647793
Rmse 0.07830292754200058


**ENTRAINEMENT UTILISANT 3 VARIABLES**

In [64]:
# Grid-search one SVR per three-feature combination, score it on the held-out
# split and save the fitted model. The four dictionaries are filled in the
# same order, keyed by the '<COL>_<COL>_<COL>_' name of the combination.
svr_r2_3v = {}
svr_mae_3v = {}
svr_mse_3v = {}
svr_Rmse_3v = {}
for combo in comb3v:
    # Flatten the groups in reverse so the column order - and therefore the
    # dictionary keys and the pickle file names - is the same as when the
    # columns were inserted one by one at position 0.
    cols = [name for group in reversed(combo) for name in reversed(group)]
    xtr = X_train[cols]
    xts = X_test[cols]

    grids = GridSearchCV(SVR(), Svrparam, cv=3, n_jobs=-1)
    grids.fit(xtr, y_train)
    # With the default refit=True, best_estimator_ is already fitted on the
    # whole training split, so it is used directly without a second fit.
    modelsvr = grids.best_estimator_
    ypred = modelsvr.predict(xts)

    scoreR2 = r2_score(y_test, ypred)
    scoreMae = mae(y_test, ypred)
    scoreMse = mse(y_test, ypred)
    # sqrt(MSE) replaces mse(..., squared=False), a keyword deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    scoreRmse = np.sqrt(scoreMse)

    var = ''.join(col + '_' for col in cols)
    svr_r2_3v[var] = scoreR2
    svr_mae_3v[var] = scoreMae
    svr_mse_3v[var] = scoreMse
    svr_Rmse_3v[var] = scoreRmse
    print("R2", scoreR2)
    print("Mae", scoreMae)
    print("Mse", scoreMse)
    print("Rmse", scoreRmse)
    print(var)
    # The context manager closes the file handle even if pickling fails.
    with open('SVR_3v_' + var, "wb") as model_file:
        pickle.dump(modelsvr, model_file)
    print("------")

R2 0.8634528957789219
Mae 0.0629764827301779
Mse 0.007274917610385205
Rmse 0.08529312756831704
TCARBN_OXYGEN_TMP_
------
R2 0.8637964306432098
Mae 0.06252118094655129
Mse 0.007256614858025533
Rmse 0.08518576675727896
SILCAT_OXYGEN_TMP_
------
R2 0.8406397975654665
Mae 0.06830920440362374
Mse 0.008490347339834533
Rmse 0.09214308080281738
PHSPHT_OXYGEN_TMP_
------
R2 0.852083675564397
Mae 0.06385970834056467
Mse 0.00788064367705508
Rmse 0.08877298956921006
SILCAT_TCARBN_TMP_
------
R2 0.8357652176499581
Mae 0.06760256259070017
Mse 0.008750053816019824
Rmse 0.09354172232763208
PHSPHT_TCARBN_TMP_
------
R2 0.8547410734082667
Mae 0.06315335523935046
Mse 0.007739063593885634
Rmse 0.08797194776680595
PHSPHT_SILCAT_TMP_
------
R2 0.8786777603610303
Mae 0.057558561757849715
Mse 0.006463771624566397
Rmse 0.08039758469361127
SILCAT_TCARBN_OXYGEN_
------
R2 0.8542891538439665
Mae 0.06553353355231331
Mse 0.007763140835329592
Rmse 0.08810868762687135
PHSPHT_TCARBN_OXYGEN_
------
R2 0.858911084688237

In [81]:
# Display the R2 score recorded for each three-feature combination.
svr_r2_3v

{'TCARBN_OXYGEN_TMP_': 0.8634528957789219,
 'SILCAT_OXYGEN_TMP_': 0.8637964306432098,
 'PHSPHT_OXYGEN_TMP_': 0.8406397975654665,
 'SILCAT_TCARBN_TMP_': 0.852083675564397,
 'PHSPHT_TCARBN_TMP_': 0.8357652176499581,
 'PHSPHT_SILCAT_TMP_': 0.8547410734082667,
 'SILCAT_TCARBN_OXYGEN_': 0.8786777603610303,
 'PHSPHT_TCARBN_OXYGEN_': 0.8542891538439665,
 'PHSPHT_SILCAT_OXYGEN_': 0.8589110846882371,
 'PHSPHT_SILCAT_TCARBN_': 0.8444533933813246}

In [84]:
# Turn the raw keys ('A_B_C_') into readable labels ('A & B & C').
# The hand-written version popped the already-renamed key on its first line,
# which raises KeyError on a fresh kernel, and left a stray '_' in the last
# label. Rebuilding the dictionary derives every label from its key, keeps
# the insertion order that the positional best-score lookup relies on, and
# leaves already formatted keys unchanged, so the cell can be re-run.
svr_r2_3v = {
    ' & '.join(part for part in key.split('_') if part): score
    for key, score in svr_r2_3v.items()
}

In [85]:
# Read the metric dictionaries as parallel lists. This assumes all four share
# the same insertion order (they are filled together in the training loop).
Variables = [*svr_r2_3v]
ScoresR2 = [*svr_r2_3v.values()]
ScoreMae = [*svr_mae_3v.values()]
ScoreMse = [*svr_mse_3v.values()]
ScoreRmse = [*svr_Rmse_3v.values()]
# Position of the combination with the highest R2 (first one on ties).
tmp = max(ScoresR2)
index = ScoresR2.index(tmp)

In [86]:
# Report the best three-feature combination followed by its metrics.
print("La meilleur combinaison pour trois variables est:", Variables[index])
for label, scores in (
    ("R2", ScoresR2),
    ("Mae", ScoreMae),
    ("Mse", ScoreMse),
    ("Rmse", ScoreRmse),
):
    print(label, scores[index])

La meilleur combinaison pour trois variables est: SILCAT & TCARBN & OXYGEN
R2 0.8786777603610303
Mae 0.057558561757849715
Mse 0.006463771624566397
Rmse 0.08039758469361127


**Entrainement utilisant deux variables**

In [89]:
# Grid-search one SVR per two-feature combination, score it on the held-out
# split and save the fitted model. The four dictionaries are filled in the
# same order, keyed by the '<COL>_<COL>_' name of the combination.
svr_r2_2v = {}
svr_mae_2v = {}
svr_mse_2v = {}
svr_Rmse_2v = {}
for combo in comb2v:
    # Flatten the groups in reverse so the column order - and therefore the
    # dictionary keys and the pickle file names - is the same as when the
    # columns were inserted one by one at position 0.
    cols = [name for group in reversed(combo) for name in reversed(group)]
    xtr = X_train[cols]
    xts = X_test[cols]

    grids = GridSearchCV(SVR(), Svrparam, cv=3, n_jobs=-1)
    grids.fit(xtr, y_train)
    # With the default refit=True, best_estimator_ is already fitted on the
    # whole training split, so it is used directly without a second fit.
    modelsvr = grids.best_estimator_
    ypred = modelsvr.predict(xts)

    scoreR2 = r2_score(y_test, ypred)
    scoreMae = mae(y_test, ypred)
    scoreMse = mse(y_test, ypred)
    # sqrt(MSE) replaces mse(..., squared=False), a keyword deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    scoreRmse = np.sqrt(scoreMse)

    var = ''.join(col + '_' for col in cols)
    svr_r2_2v[var] = scoreR2
    svr_mae_2v[var] = scoreMae
    svr_mse_2v[var] = scoreMse
    svr_Rmse_2v[var] = scoreRmse
    print(var)
    print("R2", scoreR2)
    print("Mae", scoreMae)
    print("Mse", scoreMse)
    print("Rmse", scoreRmse)
    # The context manager closes the file handle even if pickling fails.
    with open('SVR_2v_' + var, "wb") as model_file:
        pickle.dump(modelsvr, model_file)
    print("------")

OXYGEN_TMP_
R2 0.8183507925293009
Mae 0.07267423598885343
Mse 0.009677854582705345
Rmse 0.09837608745373717
------
TCARBN_TMP_
R2 0.8115244655157685
Mae 0.07226451018334637
Mse 0.010041545683210787
Rmse 0.1002075131076048
------
SILCAT_TMP_
R2 0.8110820692554215
Mae 0.0708141674238311
Mse 0.010065115544787314
Rmse 0.10032504943825003
------
PHSPHT_TMP_
R2 0.8409025362698915
Mae 0.06814974055455614
Mse 0.008476349222198464
Rmse 0.09206709087507035
------
TCARBN_OXYGEN_
R2 0.8390812197846701
Mae 0.06832287765475738
Mse 0.008573384801590684
Rmse 0.09259257422488416
------
SILCAT_OXYGEN_
R2 0.8415764861353806
Mae 0.06913875350101792
Mse 0.008440442713796606
Rmse 0.09187188206299361
------
PHSPHT_OXYGEN_
R2 0.8329477699335442
Mae 0.07118451197079471
Mse 0.008900160990576183
Rmse 0.09434066456505479
------
SILCAT_TCARBN_
R2 0.821776203464889
Mae 0.07190955642336323
Mse 0.00949535651743865
Rmse 0.09744411997364771
------
PHSPHT_TCARBN_
R2 0.8285378522020121
Mae 0.07076044763597178
Mse 0.00913

In [90]:
# Display the R2 score recorded for each two-feature combination.
svr_r2_2v

{'OXYGEN_TMP_': 0.8183507925293009,
 'TCARBN_TMP_': 0.8115244655157685,
 'SILCAT_TMP_': 0.8110820692554215,
 'PHSPHT_TMP_': 0.8409025362698915,
 'TCARBN_OXYGEN_': 0.8390812197846701,
 'SILCAT_OXYGEN_': 0.8415764861353806,
 'PHSPHT_OXYGEN_': 0.8329477699335442,
 'SILCAT_TCARBN_': 0.821776203464889,
 'PHSPHT_TCARBN_': 0.8285378522020121,
 'PHSPHT_SILCAT_': 0.8299217360621436}

In [93]:
# Turn the raw keys ('A_B_') into readable labels ('A & B').
# Rebuilding the dictionary derives every label from its key and keeps the
# insertion order that the positional best-score lookup relies on. Keys that
# are already formatted come out unchanged, so the cell can be re-run; the
# pop-based version raised KeyError on a second execution.
svr_r2_2v = {
    ' & '.join(part for part in key.split('_') if part): score
    for key, score in svr_r2_2v.items()
}

In [94]:
# Read the metric dictionaries as parallel lists. This assumes all four share
# the same insertion order (they are filled together in the training loop).
Variables = [*svr_r2_2v]
ScoresR2 = [*svr_r2_2v.values()]
ScoreMae = [*svr_mae_2v.values()]
ScoreMse = [*svr_mse_2v.values()]
ScoreRmse = [*svr_Rmse_2v.values()]
# Position of the combination with the highest R2 (first one on ties).
tmp = max(ScoresR2)
index = ScoresR2.index(tmp)

In [95]:
# Report the best two-feature combination followed by its metrics.
print("La meilleur combinaison pour deux variables est:", Variables[index])
for label, scores in (
    ("R2", ScoresR2),
    ("Mae", ScoreMae),
    ("Mse", ScoreMse),
    ("Rmse", ScoreRmse),
):
    print(label, scores[index])

La meilleur combinaison pour deux variables est: SILCAT & OXYGEN
R2 0.8415764861353806
Mae 0.06913875350101792
Mse 0.008440442713796606
Rmse 0.09187188206299361


**Entraînement sur une variable**

In [91]:
# Grid-search one SVR per single feature, score it on the held-out split and
# save the fitted model. The four dictionaries are filled in the same order,
# keyed by the '<COL>_' name of the feature.
svr_r2_1v = {}
svr_mae_1v = {}
svr_mse_1v = {}
svr_Rmse_1v = {}
for combo in comb1v:
    # Flatten the groups in reverse so the column order - and therefore the
    # dictionary keys and the pickle file names - is the same as when the
    # columns were inserted one by one at position 0.
    cols = [name for group in reversed(combo) for name in reversed(group)]
    xtr = X_train[cols]
    xts = X_test[cols]

    grids = GridSearchCV(SVR(), Svrparam, cv=3, n_jobs=-1)
    grids.fit(xtr, y_train)
    # With the default refit=True, best_estimator_ is already fitted on the
    # whole training split, so it is used directly without a second fit.
    modelsvr = grids.best_estimator_
    ypred = modelsvr.predict(xts)

    scoreR2 = r2_score(y_test, ypred)
    scoreMae = mae(y_test, ypred)
    scoreMse = mse(y_test, ypred)
    # sqrt(MSE) replaces mse(..., squared=False), a keyword deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    scoreRmse = np.sqrt(scoreMse)

    var = ''.join(col + '_' for col in cols)
    svr_r2_1v[var] = scoreR2
    svr_mae_1v[var] = scoreMae
    svr_mse_1v[var] = scoreMse
    svr_Rmse_1v[var] = scoreRmse
    print(var)
    print("R2", scoreR2)
    print("Mae", scoreMae)
    print("Mse", scoreMse)
    print("Rmse", scoreRmse)
    # The context manager closes the file handle even if pickling fails.
    with open('SVR_1v_' + var, "wb") as model_file:
        pickle.dump(modelsvr, model_file)
    print("------")

TMP_
R2 0.6451033746689192
Mae 0.09887503443342051
Mse 0.018908081018746453
Rmse 0.13750665808878657
------
OXYGEN_
R2 0.6283254724538143
Mae 0.10672094416850995
Mse 0.019801969299909614
Rmse 0.14071947022324102
------
TCARBN_
R2 0.7259506783997992
Mae 0.08960497795961127
Mse 0.01460072146675128
Rmse 0.12083344514972368
------
SILCAT_
R2 0.6791799933531426
Mae 0.09880747490172617
Mse 0.01709255666337919
Rmse 0.13073850489958644
------
PHSPHT_
R2 0.8160271824121614
Mae 0.07424059893736554
Mse 0.009801651218725385
Rmse 0.09900328892882997
------


In [96]:
# Display the R2 score recorded for each single feature.
svr_r2_1v

{'TMP_': 0.6451033746689192,
 'OXYGEN_': 0.6283254724538143,
 'TCARBN_': 0.7259506783997992,
 'SILCAT_': 0.6791799933531426,
 'PHSPHT_': 0.8160271824121614}

In [97]:
# Strip the trailing '_' from the raw keys ('TMP_' becomes 'TMP').
# Rebuilding the dictionary derives every label from its key and keeps the
# insertion order that the positional best-score lookup relies on. Keys that
# are already formatted come out unchanged, so the cell can be re-run; the
# pop-based version raised KeyError on a second execution.
svr_r2_1v = {
    ' & '.join(part for part in key.split('_') if part): score
    for key, score in svr_r2_1v.items()
}

In [98]:
# Read the metric dictionaries as parallel lists. This assumes all four share
# the same insertion order (they are filled together in the training loop).
Variables = [*svr_r2_1v]
ScoresR2 = [*svr_r2_1v.values()]
ScoreMae = [*svr_mae_1v.values()]
ScoreMse = [*svr_mse_1v.values()]
ScoreRmse = [*svr_Rmse_1v.values()]
# Position of the feature with the highest R2 (first one on ties).
tmp = max(ScoresR2)
index = ScoresR2.index(tmp)

In [99]:
# Report the best single feature followed by its metrics.
print("La meilleur combinaison pour une variables est:", Variables[index])
for label, scores in (
    ("R2", ScoresR2),
    ("Mae", ScoreMae),
    ("Mse", ScoreMse),
    ("Rmse", ScoreRmse),
):
    print(label, scores[index])

La meilleur combinaison pour une variables est: PHSPHT
R2 0.8160271824121614
Mae 0.07424059893736554
Mse 0.009801651218725385
Rmse 0.09900328892882997
