# Modèles non linéaires : test avec un nouveau jeu de données "clean"

In [1]:
from time import perf_counter, time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics, svm
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the CSV into a DataFrame; the path is relative to the kernel's working directory.
data_base = pd.read_csv(filepath_or_buffer="data_clean.csv")

In [3]:
# Keep the six predictor columns plus the target (last column) and drop
# every row that has a missing value in any of them.
# NOTE(review): "lenght" is spelled this way in the column selection —
# presumably it matches the CSV header; confirm before renaming.
data = data_base[["product_price", "weight", "lenght", "height", "width", "distance", "prix_de_livraison"]]
data = data.dropna()
X = data.iloc[:, :-1]   # all columns except the last: features
y = data.iloc[:, -1]    # last column: target ("prix_de_livraison")

# Split BEFORE scaling so that the held-out rows never contribute to the
# scaler's mean/variance (avoids train/test leakage). The fixed seed makes
# the split, and therefore every score below, reproducible across runs.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same
# transformation to the test rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train_raw)
X_test = sc.transform(X_test_raw)

# Full feature matrix scaled with the training statistics; kept so that any
# later cell relying on the name `X_sc` still works.
X_sc = sc.transform(X)


In [4]:
# Candidate regressors, all with default hyper-parameters.
# The tree-based and ensemble estimators draw random numbers while fitting,
# so they are seeded to make the comparison reproducible from a fresh kernel.
RANDOM_STATE = 42

regressors = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(random_state=RANDOM_STATE),
    ExtraTreesRegressor(random_state=RANDOM_STATE),
    RandomForestRegressor(random_state=RANDOM_STATE),
    DecisionTreeRegressor(random_state=RANDOM_STATE),
    LinearRegression(),
    Lasso(),
    ElasticNet(),
    Ridge(),
]

In [5]:
# Fit every candidate model, time training and prediction, and report
# explained variance, mean absolute error and R2 on the held-out split.
# `head` caps how many models are evaluated; slicing clamps to the list
# length, so a value larger than the list simply evaluates every model.
head = 10
results = []  # one record per model, so the scores can be compared afterwards

for model in regressors[:head]:
    # perf_counter() is a monotonic, high-resolution clock, better suited
    # than time() for the millisecond-scale durations measured here.
    start = perf_counter()
    model.fit(X_train, y_train)
    train_time = perf_counter() - start

    start = perf_counter()
    y_pred = model.predict(X_test)
    predict_time = perf_counter() - start

    # Compute each metric once and reuse it for printing and for the record.
    explained_variance = explained_variance_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tPrediction time: %0.3fs" % predict_time)
    print("\tExplained variance:", explained_variance)
    print("\tMean absolute error:", mae)
    print("\tR2 score:", r2)
    print()

    results.append(
        {
            "model": type(model).__name__,
            "train_time_s": train_time,
            "predict_time_s": predict_time,
            "explained_variance": explained_variance,
            "mean_absolute_error": mae,
            "r2": r2,
        }
    )

# Tabular view of the same numbers, convenient for sorting and comparison.
results_df = pd.DataFrame(results)

KNeighborsRegressor()
	Training time: 0.064s
	Prediction time: 0.247s
	Explained variance: 0.5338541685353915
	Mean absolute error: 3.1404907903816555
	R2 score: 0.5336311840464466

GradientBoostingRegressor()
	Training time: 5.242s
	Prediction time: 0.015s
	Explained variance: 0.5749174700322568
	Mean absolute error: 3.088276867288908
	R2 score: 0.5749173937818894

ExtraTreesRegressor()
	Training time: 8.612s
	Prediction time: 0.551s
	Explained variance: 0.6080608389449449
	Mean absolute error: 2.6249316847319335
	R2 score: 0.6080176775009851

RandomForestRegressor()
	Training time: 22.873s
	Prediction time: 0.473s
	Explained variance: 0.6198277241832163
	Mean absolute error: 2.7182794552549407
	R2 score: 0.619752922497607

DecisionTreeRegressor()
	Training time: 0.353s
	Prediction time: 0.005s
	Explained variance: 0.30408201737080076
	Mean absolute error: 3.3696236491295313
	R2 score: 0.30407971935445854

LinearRegression()
	Training time: 0.006s
	Prediction time: 0.001s
	Explained v

In [6]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for a grid search.
# NOTE(review): the keys (loss / learning_rate / criterion / max_features)
# match GradientBoostingRegressor's constructor arguments — presumably the
# grid is meant for that estimator; confirm before wiring it to GridSearchCV.
#
# The values use the scikit-learn >= 1.0 spellings throughout:
#   * criterion: 'mse' was renamed to 'squared_error'; 'mae' is no longer an
#     accepted gradient-boosting criterion, so it is not listed.
#   * max_features: 'auto' was removed; for gradient boosting it meant
#     "use all features", which is what None selects.
parameters = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate': (0.05, 0.25, 0.50, 1),
    'criterion': ['friedman_mse', 'squared_error'],
    'max_features': [None, 'sqrt', 'log2'],
}

## On se rend compte que les outliers n'ont pas été traités de la meilleure façon, car les scores de nos algorithmes diminuent. Nous n'utiliserons pas les données data_clean pour entraîner les algos.