In [1]:
from itertools import combinations
from pickle import dump

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [2]:
# Load the CSV into a DataFrame; the path is relative, so it is resolved
# against the current working directory.
df = pd.read_csv('PacificaClean.csv')

In [3]:
# Display the loaded DataFrame as the cell output.
df

Unnamed: 0,CTDTMP [ITS-90],OXYGEN [UMOL/KG],TCARBN [UMOL/KG],PHSPHT [UMOL/KG],SILCAT [UMOL/KG],NITRAT [UMOL/KG],PH
0,4.547,119.10,2286.68,2.824,70.516,37.126,7.552423
1,4.533,100.64,2290.58,2.744,71.046,38.908,7.518387
2,4.536,130.80,2282.86,2.532,62.012,35.762,7.514944
3,4.536,130.80,2282.86,2.532,62.012,35.762,7.514944
4,4.536,130.80,2282.86,2.532,62.012,35.762,7.514944
...,...,...,...,...,...,...,...
277318,3.494,67.60,2290.04,2.880,120.580,41.750,7.565329
277319,2.898,82.60,2325.14,2.780,129.890,40.660,7.508111
277320,2.621,99.60,2342.30,2.770,133.730,40.110,7.487689
277321,2.339,115.70,2347.40,2.710,136.900,39.560,7.536210


In [4]:
# Map the verbose CSV headers (with units) to short column names.
nouveaux_noms = dict(zip(
    ['CTDTMP [ITS-90]', 'OXYGEN [UMOL/KG]', 'TCARBN [UMOL/KG]',
     'PHSPHT [UMOL/KG]', 'SILCAT [UMOL/KG]', 'NITRAT [UMOL/KG]'],
    ['temp', 'oxygen', 'tcarbn', 'phspht', 'silcat', 'nitrat'],
))
# Rename the columns of the DataFrame.
df = df.rename(columns=nouveaux_noms)

In [5]:
# Predictors: every column except the target and the saved row index.
# 'Unnamed: 0' is only the row number written out with the CSV; keeping it
# would feed the row position to the model as if it were a measurement.
# errors='ignore' keeps the cell working if the CSV is later saved without it.
X = df.drop(columns=['PH', 'Unnamed: 0'], errors='ignore')
# Target: the pH value.
Y = df['PH']

In [5]:
# Hold out 20 % of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Two-step estimator: standardise the features, then fit a support-vector regressor.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('SVR', SVR()),
])

In [8]:
# Hyper-parameter grid searched by GridSearchCV over `pipeline`.
# Keys carry the 'SVR__' prefix because parameters of a Pipeline step are
# addressed as '<step name>__<parameter>'; bare names such as 'C' are rejected
# as invalid parameters of the Pipeline itself.
# Two sub-grids are used because `gamma` has no effect on the linear kernel:
# crossing it with 'linear' would only repeat identical fits.
parameters = [
    {'SVR__kernel': ['linear'],
     'SVR__C': [1.1, 5.4],
     'SVR__epsilon': [0.01, 0.1, 1.0]},
    {'SVR__kernel': ['rbf'],
     'SVR__C': [1.1, 5.4],
     'SVR__gamma': [0.7001, 0.001, 1, 5],
     'SVR__epsilon': [0.01, 0.1, 1.0]},
]

In [None]:
# 5-fold cross-validated exhaustive search over `parameters` for the
# scaler + SVR pipeline. n_jobs=-1 runs the candidate fits on all available
# cores, as the per-subset searches further down already do.
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter setting that obtained the best mean cross-validation score.
best_params = grid_search.best_params_

In [None]:
# Report the selected hyper-parameters.
print("Meilleurs paramètres:", best_params)

In [None]:
# GridSearchCV was built with its default refit=True, so best_estimator_ has
# already been refit on the whole of X_train with the winning parameters.
# Fitting it a second time would only repeat that (expensive) SVR training.
best_model = grid_search.best_estimator_
best_model

In [None]:
# Predict the target for the held-out test rows.
y_pred = best_model.predict(X_test)

In [None]:
# best_score_ is the mean cross-validated score of the best parameter setting;
# no `scoring` was passed to GridSearchCV, so it is the estimator's default
# score (R2 for scikit-learn regressors).
print("Meilleur score de validation croisée :", grid_search.best_score_)

In [None]:
# Root-mean-squared error on the test split. Computed as sqrt(MSE) instead of
# passing squared=False, because that keyword has been removed from recent
# scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Report the test-set RMSE.
print("Rmse =",rmse)

In [None]:
# Serialise the fitted best pipeline to disk; the context manager closes the file.
with open("all_features_SVR.pkl", "wb") as model_file:
    dump(best_model, model_file)
   

In [None]:
# One-item lists holding the predictor column names. They use the short names
# introduced by the earlier rename of df, because those are the names that
# X_train / X_test carry; the bracketed CSV headers no longer exist there and
# would raise KeyError when used for column lookup.
temp=['temp']
oxyg=['oxygen']
tCarb=['tcarbn']
silicate=['silcat']
phosphate=['phspht']
nitrate=['nitrat']
# Every subset size is drawn from the same six predictors.
predictor_groups = [temp, oxyg, tCarb, silicate, phosphate, nitrate]
# The combinations are materialised as tuples rather than left as one-shot
# iterators, so the cell that converts them to lists can be re-run without
# silently producing empty lists.
# Combinations of five variables
comb5Var = tuple(combinations(predictor_groups, 5))
# Combinations of four variables
comb4Var = tuple(combinations(predictor_groups, 4))
# Combinations of three variables
comb3Var = tuple(combinations(predictor_groups, 3))
# Combinations of two variables
comb2Var = tuple(combinations(predictor_groups, 2))
# Combinations of one variable
comb1Var = tuple(combinations(predictor_groups, 1))

In [None]:
# Turn each family of combinations into a list so it can be indexed and measured.
comb5v, comb4v, comb3v, comb2v, comb1v = (
    list(family)
    for family in (comb5Var, comb4Var, comb3Var, comb2Var, comb1Var)
)

### Entraînement sur cinq variables

In [None]:
# Grid-search one SVR pipeline per five-variable subset, score it on the test
# split and save the fitted model.
R2_5v = {}    # subset label -> R2 on the test split
Rmse_5v = {}  # subset label -> RMSE on the test split
for combo in comb5v:
    # Each element of `combo` is a one-item list holding a column name.
    # Flatten, then reverse so the column order (and therefore the label and
    # file name) is the same as with the former insert(0, ...) construction.
    cols = [name for group in combo for name in group][::-1]
    xtr = X_train[cols]
    xts = X_test[cols]

    grids = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
    grids.fit(xtr, y_train)
    modelrfc = grids.best_estimator_  # already refit on the full training split
    ypred = modelrfc.predict(xts)
    scoreR2 = r2_score(y_test, ypred)
    # sqrt(MSE): the squared=False keyword has been removed from recent scikit-learn.
    scoreRmse = np.sqrt(mean_squared_error(y_test, ypred))
    var = ''.join(col + '_' for col in cols)
    R2_5v[var] = scoreR2
    Rmse_5v[var] = scoreRmse
    print("R2", scoreR2)
    print("Rmse", scoreRmse)
    # Context manager so the file handle is closed after each model is written.
    with open('SVR_5v_' + var, "wb") as model_file:
        dump(modelrfc, model_file)
    print("------")

In [None]:
# Show the R2 score recorded for each five-variable subset.
R2_5v

In [None]:
 # Show the RMSE recorded for each five-variable subset.
 Rmse_5v

In [None]:
# Unpack the five-variable results into parallel lists and locate the subset
# with the highest R2 (first one in case of ties).
Variables = [*R2_5v]
ScoresR2 = [*R2_5v.values()]
ScoreRmse = [*Rmse_5v.values()]
index = max(range(len(ScoresR2)), key=ScoresR2.__getitem__)
tmp = ScoresR2[index]

In [None]:
# Report the best five-variable subset. The message previously said
# "quatre variables" although this section covers five-variable subsets.
print("La meilleur combinaison pour cinq variables est:",Variables[index])
print("R2",ScoresR2[index])
print("Rmse",ScoreRmse[index])

In [None]:
# Bar chart of the R2 score of each feature subset, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(Variables, ScoresR2)
ax.set_ylabel('R2 Score')
ax.set_title(' R2 pour différentes combinaisons')
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
for position, value in enumerate(ScoresR2):
    ax.text(position, value, f'R2 = {value:.4f}', ha='center', va='bottom')

plt.show()

In [None]:
# Bar chart of the RMSE of each feature subset, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(Variables, ScoreRmse)
ax.set_ylabel('RMSE Score')
ax.set_title('RMSE pour différentes combinaisons')
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
for position, value in enumerate(ScoreRmse):
    ax.text(position, value, f'Rmse = {value:.4f}', ha='center', va='bottom')
plt.show()

### Entraînement sur quatre variables

In [None]:
# Grid-search one SVR pipeline per four-variable subset, score it on the test
# split and save the fitted model.
R2_4v = {}    # subset label -> R2 on the test split
Rmse_4v = {}  # subset label -> RMSE on the test split
for combo in comb4v:
    # Each element of `combo` is a one-item list holding a column name.
    # Flatten, then reverse so the column order (and therefore the label and
    # file name) is the same as with the former insert(0, ...) construction.
    cols = [name for group in combo for name in group][::-1]
    xtr = X_train[cols]
    xts = X_test[cols]

    grids = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
    grids.fit(xtr, y_train)
    modelrfc = grids.best_estimator_  # already refit on the full training split
    ypred = modelrfc.predict(xts)
    scoreR2 = r2_score(y_test, ypred)
    # sqrt(MSE): the squared=False keyword has been removed from recent scikit-learn.
    scoreRmse = np.sqrt(mean_squared_error(y_test, ypred))
    var = ''.join(col + '_' for col in cols)
    R2_4v[var] = scoreR2
    Rmse_4v[var] = scoreRmse
    print("R2", scoreR2)
    print("Rmse", scoreRmse)
    # Context manager so the file handle is closed after each model is written.
    with open('SVR_4v_' + var, "wb") as model_file:
        dump(modelrfc, model_file)
    print("------")

In [None]:
# Show the R2 score recorded for each four-variable subset.
R2_4v

In [None]:
# Show the RMSE recorded for each four-variable subset.
Rmse_4v

In [None]:
# Unpack the four-variable results into parallel lists and locate the subset
# with the highest R2. This cell previously read the five-variable
# dictionaries, so the summary and plots of this section showed the wrong data.
Variables = list(R2_4v.keys())
ScoresR2 = list(R2_4v.values())
ScoreRmse = list(Rmse_4v.values())
tmp = max(ScoresR2)
index = ScoresR2.index(tmp)

In [None]:
# Report the subset at position `index` together with its R2 and RMSE.
print("La meilleur combinaison pour quatre variables est:",Variables[index])
print("R2",ScoresR2[index])
print("Rmse",ScoreRmse[index])

In [None]:
# Bar chart of the R2 score of each feature subset, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(Variables, ScoresR2)
ax.set_ylabel('R2 Score')
ax.set_title(' R2 pour différentes combinaisons')
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
for position, value in enumerate(ScoresR2):
    ax.text(position, value, f'R2 = {value:.4f}', ha='center', va='bottom')

plt.show()

In [None]:
# Bar chart of the RMSE of each feature subset, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(Variables, ScoreRmse)
ax.set_ylabel('RMSE Score')
ax.set_title('RMSE pour différentes combinaisons')
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
for position, value in enumerate(ScoreRmse):
    ax.text(position, value, f'Rmse = {value:.4f}', ha='center', va='bottom')
plt.show()

### Entraînement sur trois variables

In [None]:
# Grid-search one SVR pipeline per three-variable subset, score it on the test
# split and save the fitted model.
R2_3v = {}    # subset label -> R2 on the test split
Rmse_3v = {}  # subset label -> RMSE on the test split
for combo in comb3v:
    # Each element of `combo` is a one-item list holding a column name.
    # Flatten, then reverse so the column order (and therefore the label and
    # file name) is the same as with the former insert(0, ...) construction.
    cols = [name for group in combo for name in group][::-1]
    xtr = X_train[cols]
    xts = X_test[cols]

    grids = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
    grids.fit(xtr, y_train)
    modelrfc = grids.best_estimator_  # already refit on the full training split
    ypred = modelrfc.predict(xts)
    scoreR2 = r2_score(y_test, ypred)
    # sqrt(MSE): the squared=False keyword has been removed from recent scikit-learn.
    scoreRmse = np.sqrt(mean_squared_error(y_test, ypred))
    var = ''.join(col + '_' for col in cols)
    R2_3v[var] = scoreR2
    Rmse_3v[var] = scoreRmse
    print("R2", scoreR2)
    print("Rmse", scoreRmse)
    # Context manager so the file handle is closed after each model is written.
    with open('SVR_3v_' + var, "wb") as model_file:
        dump(modelrfc, model_file)
    print("------")

In [None]:
# Show the R2 score recorded for each three-variable subset.
R2_3v

In [None]:
# Show the RMSE recorded for each three-variable subset.
Rmse_3v

In [None]:
# Unpack the three-variable results into parallel lists and locate the subset
# with the highest R2. This cell previously read the five-variable
# dictionaries, so the summary and plots of this section showed the wrong data.
Variables = list(R2_3v.keys())
ScoresR2 = list(R2_3v.values())
ScoreRmse = list(Rmse_3v.values())
tmp = max(ScoresR2)
index = ScoresR2.index(tmp)

In [None]:
# Report the best three-variable subset. The message previously said
# "quatre variables" although this section covers three-variable subsets.
print("La meilleur combinaison pour trois variables est:",Variables[index])
print("R2",ScoresR2[index])
print("Rmse",ScoreRmse[index])

In [None]:
# Bar chart of the R2 score of each feature subset, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(Variables, ScoresR2)
ax.set_ylabel('R2 Score')
ax.set_title(' R2 pour différentes combinaisons')
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
for position, value in enumerate(ScoresR2):
    ax.text(position, value, f'R2 = {value:.4f}', ha='center', va='bottom')

plt.show()

In [None]:
# Bar chart of the RMSE of each feature subset, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(Variables, ScoreRmse)
ax.set_ylabel('RMSE Score')
ax.set_title('RMSE pour différentes combinaisons')
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
for position, value in enumerate(ScoreRmse):
    ax.text(position, value, f'Rmse = {value:.4f}', ha='center', va='bottom')
plt.show()

### Entraînement sur deux variables

In [None]:
# Grid-search one SVR pipeline per two-variable subset, score it on the test
# split and save the fitted model.
R2_2v = {}    # subset label -> R2 on the test split
Rmse_2v = {}  # subset label -> RMSE on the test split
for combo in comb2v:
    # Each element of `combo` is a one-item list holding a column name.
    # Flatten, then reverse so the column order (and therefore the label and
    # file name) is the same as with the former insert(0, ...) construction.
    cols = [name for group in combo for name in group][::-1]
    xtr = X_train[cols]
    xts = X_test[cols]

    grids = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
    grids.fit(xtr, y_train)
    modelrfc = grids.best_estimator_  # already refit on the full training split
    ypred = modelrfc.predict(xts)
    scoreR2 = r2_score(y_test, ypred)
    # sqrt(MSE): the squared=False keyword has been removed from recent scikit-learn.
    scoreRmse = np.sqrt(mean_squared_error(y_test, ypred))
    var = ''.join(col + '_' for col in cols)
    R2_2v[var] = scoreR2
    Rmse_2v[var] = scoreRmse
    print("R2", scoreR2)
    print("Rmse", scoreRmse)
    # Context manager so the file handle is closed after each model is written.
    with open('SVR_2v_' + var, "wb") as model_file:
        dump(modelrfc, model_file)
    print("------")

In [None]:
# Show the R2 score recorded for each two-variable subset.
R2_2v

In [None]:
# Show the RMSE recorded for each two-variable subset.
Rmse_2v

In [None]:
# Unpack the two-variable results into parallel lists and locate the subset
# with the highest R2. This cell previously read the five-variable
# dictionaries, so the summary and plots of this section showed the wrong data.
Variables = list(R2_2v.keys())
ScoresR2 = list(R2_2v.values())
ScoreRmse = list(Rmse_2v.values())
tmp = max(ScoresR2)
index = ScoresR2.index(tmp)

In [None]:
# Report the best two-variable subset. The message previously said
# "quatre variables" although this section covers two-variable subsets.
print("La meilleur combinaison pour deux variables est:",Variables[index])
print("R2",ScoresR2[index])
print("Rmse",ScoreRmse[index])

In [None]:
# Bar chart of the R2 score of each feature subset, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(Variables, ScoresR2)
ax.set_ylabel('R2 Score')
ax.set_title(' R2 pour différentes combinaisons')
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
for position, value in enumerate(ScoresR2):
    ax.text(position, value, f'R2 = {value:.4f}', ha='center', va='bottom')

plt.show()

In [None]:
# Bar chart of the RMSE of each feature subset, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(Variables, ScoreRmse)
ax.set_ylabel('RMSE Score')
ax.set_title('RMSE pour différentes combinaisons')
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
for position, value in enumerate(ScoreRmse):
    ax.text(position, value, f'Rmse = {value:.4f}', ha='center', va='bottom')
plt.show()

### Entraînement sur une variable

In [None]:
# Grid-search one SVR pipeline per single variable, score it on the test
# split and save the fitted model.
R2_1v = {}    # subset label -> R2 on the test split
Rmse_1v = {}  # subset label -> RMSE on the test split
for combo in comb1v:
    # Each element of `combo` is a one-item list holding a column name.
    # Flatten, then reverse so the column order (and therefore the label and
    # file name) is the same as with the former insert(0, ...) construction.
    cols = [name for group in combo for name in group][::-1]
    xtr = X_train[cols]
    xts = X_test[cols]

    grids = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
    grids.fit(xtr, y_train)
    modelrfc = grids.best_estimator_  # already refit on the full training split
    ypred = modelrfc.predict(xts)
    scoreR2 = r2_score(y_test, ypred)
    # sqrt(MSE): the squared=False keyword has been removed from recent scikit-learn.
    scoreRmse = np.sqrt(mean_squared_error(y_test, ypred))
    var = ''.join(col + '_' for col in cols)
    R2_1v[var] = scoreR2
    Rmse_1v[var] = scoreRmse
    print("R2", scoreR2)
    print("Rmse", scoreRmse)
    # Context manager so the file handle is closed after each model is written.
    with open('SVR_1v_' + var, "wb") as model_file:
        dump(modelrfc, model_file)
    print("------")

In [None]:
# Show the R2 score recorded for each single variable.
R2_1v

In [None]:
# Show the RMSE recorded for each single variable.
Rmse_1v

In [None]:
# Unpack the single-variable results into parallel lists and locate the
# variable with the highest R2. This cell previously read the five-variable
# dictionaries, so the summary and plots of this section showed the wrong data.
Variables = list(R2_1v.keys())
ScoresR2 = list(R2_1v.values())
ScoreRmse = list(Rmse_1v.values())
tmp = max(ScoresR2)
index = ScoresR2.index(tmp)

In [None]:
# Report the best single variable. The message previously said
# "quatre variables" although this section covers single-variable models.
print("La meilleur combinaison pour une variable est:",Variables[index])
print("R2",ScoresR2[index])
print("Rmse",ScoreRmse[index])

In [None]:
# Bar chart of the R2 score of each feature subset, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(Variables, ScoresR2)
ax.set_ylabel('R2 Score')
ax.set_title(' R2 pour différentes combinaisons')
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
for position, value in enumerate(ScoresR2):
    ax.text(position, value, f'R2 = {value:.4f}', ha='center', va='bottom')

plt.show()

In [None]:
# Bar chart of the RMSE of each feature subset, each bar labelled with its value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(Variables, ScoreRmse)
ax.set_ylabel('RMSE Score')
ax.set_title('RMSE pour différentes combinaisons')
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
fig.tight_layout()
for position, value in enumerate(ScoreRmse):
    ax.text(position, value, f'Rmse = {value:.4f}', ha='center', va='bottom')
plt.show()