In [291]:
import pandas as pd
import numpy as np
import joblib
import pickle
import shap

from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from mlxtend.evaluate import bias_variance_decomp

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDRegressor
from sklearn.feature_selection import f_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures

In [292]:
# Notebook-wide configuration: input files, evaluation settings, display options.

# Processed feature matrix and target files (file names only; the directory
# is prepended where the CSVs are read).
file, file_y = 'X_3.csv', 'y.csv'

# Number of rounds used by the bias/variance decomposition, and the seed
# handed to estimators that accept a random_state.
nRounds, r_state = 5, 42

# Never truncate rows or columns when a DataFrame is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [293]:
# Read the processed features (X) and the target frame (z), in that order,
# from the processed-data directory.
X, z = (pd.read_csv('../data/processed/' + csv_name) for csv_name in (file, file_y))

In [294]:
# Discard the first column of the feature frame (presumably the index column
# written out with the CSV -- TODO confirm against the export step).
# `columns=` already selects the axis, so the redundant `axis=1` is dropped,
# and the frame is rebound instead of mutated in place.
# NOTE: re-running this cell removes one more column each time; re-run the
# load cell first if you need to execute it again.
X = X.drop(columns=X.columns[0])

In [295]:
# Sanity check: (rows, columns) of the feature frame after the first column was dropped.
X.shape

(752, 14)

In [296]:
# Sanity check: (rows, columns) of the raw target frame; row count should match X.
z.shape

(752, 1)

In [297]:
# Select the 'yield' column of the target frame; `y` is the regression target used below.
y = z['yield']

In [298]:
# Sanity check: the target is now one-dimensional.
y.shape

(752,)

In [299]:
# Hold out 33% of the rows for evaluation. The seed comes from the shared
# `r_state` config value (42) instead of a second hard-coded literal, so the
# split and the estimators stay seeded from one place.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=r_state,
)

In [300]:
def check_bias_variance(m, X_train, y_train, X_test, y_test, r):
    """Print the bias/variance decomposition (MSE loss) of estimator ``m``.

    Parameters
    ----------
    m : estimator handed to ``bias_variance_decomp``.
    X_train, y_train, X_test, y_test : objects exposing ``.to_numpy()``
        (e.g. pandas DataFrame / Series); they are converted before the call.
    r : int
        Forwarded as ``num_rounds``.

    Returns
    -------
    None
        The three averages are printed, not returned. ``random_seed`` is
        fixed to 1 for the decomposition.
    """
    # Convert in the same order the decomposition expects its positional args.
    arrays = [part.to_numpy() for part in (X_train, y_train, X_test, y_test)]
    expected_loss, bias, variance = bias_variance_decomp(
        m, *arrays, loss='mse', num_rounds=r, random_seed=1
    )

    print('Bias Variance analisys')
    report = (
        ('Average expected loss: %.3f', expected_loss),
        ('Average bias: %.3f', bias),
        ('Average variance: %.3f', variance),
    )
    for template, value in report:
        print(template % value)

In [301]:
def evaluate_model(model, x_test, y_test):
    """Score a fitted regressor on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_test : features passed to ``model.predict``.
    y_test : ground-truth targets the predictions are compared against.

    Returns
    -------
    dict
        ``{'rmse': ..., 'mae': ..., 'mse': ..., 'r2': ...}``.
    """
    from sklearn import metrics

    y_pred = model.predict(x_test)

    # MSE is the default behaviour of mean_squared_error; no keyword needed.
    mse = metrics.mean_squared_error(y_test, y_pred)

    # The `squared=` keyword (squared=False -> RMSE, squared=True -> MSE) was
    # deprecated in scikit-learn 1.4 and removed in 1.6. Prefer the dedicated
    # root_mean_squared_error function and only fall back to the legacy
    # keyword on older releases that do not provide it.
    rmse_fn = getattr(metrics, 'root_mean_squared_error', None)
    if rmse_fn is not None:
        rmse = rmse_fn(y_test, y_pred)
    else:
        rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)

    mae = metrics.mean_absolute_error(y_test, y_pred)
    r2 = metrics.r2_score(y_test, y_pred)
    return {'rmse': rmse,
            'mae' : mae,
            'mse' : mse,
            'r2'  : r2}

In [302]:
def _build_estimator(modelName):
    """Return a fresh, unfitted estimator for ``modelName``.

    Raises ``ValueError`` when the name is not one of the supported keys.
    """
    # Factories are lazy so only the requested estimator is constructed.
    factories = {
        'LinearRegression': lambda: LinearRegression(),
        'DecisionTreeRegressor': lambda: DecisionTreeRegressor(random_state=r_state),
        'SVN': lambda: SVR(),  # historical key, kept for backward compatibility
        'SVR': lambda: SVR(),  # correctly spelled alias for the same estimator
        'Lasso': lambda: linear_model.Lasso(),
        'RandomForestRegressor': lambda: RandomForestRegressor(random_state=r_state),
        'LinearSVR': lambda: LinearSVR(random_state=r_state),
        'SGDRegressor': lambda: SGDRegressor(random_state=r_state),
        'AdaBoostRegressor': lambda: AdaBoostRegressor(random_state=r_state),
        'GradientBoostingRegressor': lambda: GradientBoostingRegressor(random_state=r_state),
        'XGBRegressor': lambda: XGBRegressor(),
    }
    try:
        factory = factories[modelName]
    except KeyError:
        raise ValueError(
            'Unknown modelName %r; expected one of: %s'
            % (modelName, ', '.join(sorted(factories)))
        ) from None
    return factory()


def generate_model(modelName, X_train, X_test, y_train, y_test):
    """Fit the estimator named ``modelName``, print its scores, and return it.

    Parameters
    ----------
    modelName : str
        One of 'LinearRegression', 'DecisionTreeRegressor', 'SVN' (or 'SVR'),
        'Lasso', 'RandomForestRegressor', 'LinearSVR', 'SGDRegressor',
        'AdaBoostRegressor', 'GradientBoostingRegressor', 'XGBRegressor'.
    X_train, y_train : training data passed to ``fit``.
    X_test, y_test : held-out data passed to ``evaluate_model``.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``modelName`` is not supported (previously this surfaced as an
        ``UnboundLocalError`` at the final ``return``).

    Notes
    -----
    After scoring, ``check_bias_variance`` is run with ``nRounds`` rounds.
    The printed layout per model is kept exactly as before: only
    'LinearRegression' reports MAE/MSE/R2, and 'SGDRegressor' omits the
    separator line.
    """
    model = _build_estimator(modelName)
    model.fit(X_train, y_train)

    rf_eval = evaluate_model(model, X_test, y_test)
    print('RMSE:', rf_eval['rmse'])
    if modelName == 'LinearRegression':
        print('MAE:', rf_eval['mae'])
        print('MSE:', rf_eval['mse'])
        print('R2:', rf_eval['r2'])
    if modelName != 'SGDRegressor':
        print('-----------------------------')

    check_bias_variance(model, X_train, y_train, X_test, y_test, nRounds)
    return model

In [303]:
# Fit and score the baseline linear regression; `m` is the fitted model that gets persisted at the end of the notebook.
m = generate_model('LinearRegression', X_train, X_test, y_train, y_test)

RMSE: 117.39188476402283
MAE: 90.28752495180726
MSE: 13780.854608449616
R2: 0.991522113769003
-----------------------------
Bias Variance analisys
Average expected loss: 14094.296
Average bias: 13793.638
Average variance: 300.658


In [304]:
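# Disabled experiment: degree-2 polynomial features fed to LinearRegression.
# NOTE: as written, `columns=X_train.columns` would not match the expanded
# feature count produced by PolynomialFeatures; use
# `poly.get_feature_names_out()` for the column names if this is re-enabled.
# poly = PolynomialFeatures(degree=2, include_bias=False)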
# x_train_trans = poly.fit_transform(X_train)
# x_test_trans = poly.transform(X_test)

# m = generate_model('LinearRegression', 
#                    pd.DataFrame(x_train_trans, columns=X_train.columns), 
#                    pd.DataFrame(x_test_trans, columns=X_train.columns) , 
#                    y_train, 
#                    y_test)

# x_train_trans.shape

In [305]:
# generate_model('DecisionTreeRegressor', X_train, X_test, y_train, y_test)
# RMSE: 0.15216838613476735
# -----------------------------
# Bias Variance analisys
# Average expected loss: 0.029
# Average bias: 0.016
# Average variance: 0.013

In [306]:
# generate_model('SVN', X_train, X_test, y_train, y_test)
# RMSE: 0.14253784805325495
# -----------------------------
# Bias Variance analisys
# Average expected loss: 0.025
# Average bias: 0.023
# Average variance: 0.002

In [307]:
#  generate_model('Lasso', X_train, X_test, y_train, y_test)
# RMSE: 1.0264283728130068
# -----------------------------
# Bias Variance analisys
# Average expected loss: 1.011
# Average bias: 1.010
# Average variance: 0.001

In [308]:
# generate_model('RandomForestRegressor', X_train, X_test, y_train, y_test)
# RMSE: 0.11109118075435369
# -----------------------------
# Bias Variance analisys
# Average expected loss: 0.016
# Average bias: 0.013
# Average variance: 0.002

In [309]:
# generate_model('LinearSVR', X_train, X_test, y_train, y_test)
# RMSE: 0.0900785990812396
# -----------------------------
# Bias Variance analisys
# Average expected loss: 0.009
# Average bias: 0.008
# Average variance: 0.000

In [310]:
# generate_model('SGDRegressor', X_train, X_test, y_train, y_test)
# RMSE: 0.10869991825145987
# Bias Variance analisys
# Average expected loss: 0.012
# Average bias: 0.012
# Average variance: 0.000

In [311]:
# generate_model('AdaBoostRegressor', X_train, X_test, y_train, y_test)
# RMSE: 0.15853287428085086
# -----------------------------
# Bias Variance analisys
# Average expected loss: 0.027
# Average bias: 0.024
# Average variance: 0.003

In [312]:
# generate_model('GradientBoostingRegressor', X_train, X_test, y_train, y_test)
# RMSE: 0.0929275334390468
# -----------------------------
# Bias Variance analisys
# Average expected loss: 0.011
# Average bias: 0.009
# Average variance: 0.002

In [313]:
# generate_model('XGBRegressor', X_train, X_test, y_train, y_test)
# RMSE: 0.11026424531177682
# -----------------------------
# Bias Variance analisys
# Average expected loss: 0.014
# Average bias: 0.010
# Average variance: 0.003

In [314]:
# Persist the fitted model with joblib (compression level 3); the returned list of written paths is the cell output.
joblib.dump(m, '../models/WildBlueberryYieldPrediction.joblib', compress=3)

['../models/WildBlueberryYieldPrediction.joblib']

In [315]:
# Also save the model as a plain pickle. The context manager closes the file
# handle deterministically (the bare `open(...)` argument never closed it).
with open('../models/WildBlueberryYieldPrediction.pkl', 'wb') as model_file:
    pickle.dump(m, model_file)