In [1]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from mlxtend.evaluate import bias_variance_decomp
from xgboost import XGBRegressor

In [2]:
# Read the preprocessed training features for a first look at the data.
X_train = pd.read_csv(filepath_or_buffer='../../data/preprocesada/X_train.csv')

In [3]:
# Preview the first five rows (5 is also the default for head()).
X_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,Year_Factor,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,january_max_temp,february_min_temp,...,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,building_class_Commercial,building_class_Residential,label_State_Factor,target_facility_type
0,0,1,61242.0,1942.0,11.0,2.4,36,50.5,68,35,...,0,0,1.0,1.0,1.0,109.94301,1,0,0,241.135162
1,1,1,274000.0,1955.0,45.0,1.8,36,50.5,68,35,...,0,0,1.0,70.750627,1.0,12.0,1,0,0,39.559542
2,2,1,280025.0,1951.0,97.0,1.8,36,50.5,68,35,...,0,0,1.0,70.750627,1.0,12.0,1,0,0,100.965103
3,3,1,55325.0,1980.0,46.0,1.8,36,50.5,68,35,...,0,0,1.0,70.750627,1.0,12.0,1,0,0,69.441531
4,4,1,66000.0,1985.0,100.0,2.4,36,50.5,68,35,...,0,0,1.0,1.0,1.0,109.94301,1,0,0,38.209399


In [None]:
# Lift both display limits in one call so DataFrames are rendered without
# column or row truncation.
pd.set_option('display.max_columns', None, 'display.max_rows', None)

# Passed as `num_rounds` to bias_variance_decomp (via check_bias_variance)
# every time generate_model runs.
nRounds = 5

In [None]:
# Training split: the feature matrix and the frame that carries the target
# column ('site_eui' is extracted from z_train in a later cell).
X_train, z_train = (
    pd.read_csv('../../data/preprocesada/X_train.csv'),
    pd.read_csv('../../data/preprocesada/y_train.csv'),
)

# Test split, loaded the same way.
X_test, z_test = (
    pd.read_csv('../../data/preprocesada/X_test.csv'),
    pd.read_csv('../../data/preprocesada/y_test.csv'),
)

In [None]:
def _drop_saved_index_columns(frame):
    """Return a copy of ``frame`` without any column named 'Unnamed: ...'.

    The preview of X_train shows two such columns ('Unnamed: 0' and
    'Unnamed: 0.1') that simply count rows; dropping only 'Unnamed: 0' would
    leave the other one in the feature matrix. Selecting by prefix removes
    every one of them, and because nothing is looked up by a fixed name the
    cell can be re-run without raising KeyError.
    """
    leftover = [name for name in frame.columns if str(name).startswith('Unnamed:')]
    return frame.drop(columns=leftover)


# Rebind instead of mutating in place so that re-running the cell is harmless.
X_train = _drop_saved_index_columns(X_train)
z_train = _drop_saved_index_columns(z_train)
X_test = _drop_saved_index_columns(X_test)
z_test = _drop_saved_index_columns(z_test)

# The regression target is the 'site_eui' column of the z_* frames.
y_train = z_train['site_eui']
y_test = z_test['site_eui']

In [None]:
def check_bias_variance(m, X_train, y_train, X_test, y_test, r):
    """Print the bias-variance decomposition of estimator ``m`` under MSE loss.

    Parameters
    ----------
    m : estimator handed to ``bias_variance_decomp`` as-is.
    X_train, y_train, X_test, y_test : objects exposing ``to_numpy()``
        (pandas DataFrame/Series in this notebook); they are converted to
        arrays before the call.
    r : value forwarded as ``num_rounds``.

    Returns
    -------
    None. The three averages are printed with three decimals.
    """
    # Convert in the same order the arguments are passed below.
    train_features, train_target, test_features, test_target = (
        part.to_numpy() for part in (X_train, y_train, X_test, y_test)
    )

    # random_seed is fixed at 1, so the decomposition uses the same seed on
    # every call.
    expected_loss, bias, variance = bias_variance_decomp(
        m,
        train_features,
        train_target,
        test_features,
        test_target,
        loss='mse',
        num_rounds=r,
        random_seed=1,
    )

    print('Bias Variance analisys')
    print('Average expected loss: %.3f' % expected_loss)
    print('Average bias: %.3f' % bias)
    print('Average variance: %.3f' % variance)

In [None]:
def evaluate_model(model, x_test, y_test):
    """Score a fitted regressor on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_test : features passed to ``model.predict``.
    y_test : true target values compared against the predictions.

    Returns
    -------
    dict with the keys ``'rmse'``, ``'mae'``, ``'mse'`` and ``'r2'``.

    Notes
    -----
    RMSE is derived as the square root of the MSE instead of calling
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and later removed, so passing it fails on
    current releases. For a single target column (as used in this notebook)
    the value is the same.
    """
    from sklearn import metrics

    y_pred = model.predict(x_test)

    # Compute the MSE once and reuse it for the RMSE.
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = metrics.mean_absolute_error(y_test, y_pred)
    r2 = metrics.r2_score(y_test, y_pred)

    return {'rmse': rmse,
            'mae' : mae,
            'mse' : mse,
            'r2'  : r2}

In [None]:
def generate_model(modelName, X_train, X_test, y_train, y_test):
    """Fit the regressor selected by ``modelName`` and print its evaluation.

    The estimator is fitted on the training data, scored on the test data
    with ``evaluate_model`` (RMSE, MAE, MSE and R2 are printed), and then
    passed to ``check_bias_variance`` with the module-level ``nRounds``.

    Parameters
    ----------
    modelName : str
        One of 'LinearRegression', 'DecisionTreeRegressor', 'SVR', 'Lasso',
        'RandomForestRegressor' or 'XGBoost'.
    X_train, X_test : training / test features.
    y_train, y_test : training / test target values.

    Raises
    ------
    ValueError
        If ``modelName`` is not one of the supported names (previously such
        a call silently did nothing).
    """
    # Zero-argument factories: only the requested estimator is constructed.
    # Hyper-parameters are the ones this notebook has always used.
    factories = {
        'LinearRegression': lambda: LinearRegression(),
        'DecisionTreeRegressor': lambda: DecisionTreeRegressor(max_depth=3),
        'SVR': lambda: SVR(),
        'Lasso': lambda: linear_model.Lasso(alpha=0.1, tol=1e-2),
        'RandomForestRegressor': lambda: RandomForestRegressor(max_depth=3),
        'XGBoost': lambda: XGBRegressor(n_estimators=500, reg_alpha=0.01, n_jobs=-1),
    }

    if modelName not in factories:
        raise ValueError(
            'Unknown modelName %r; expected one of: %s'
            % (modelName, ', '.join(sorted(factories)))
        )

    model = factories[modelName]()

    # Every model, SVR included, is fitted on y_train as given. The SVR branch
    # used to index y_train['site_eui'], which fails because the notebook
    # already passes the 'site_eui' column itself as y_train.
    model.fit(X_train, y_train)

    scores = evaluate_model(model, X_test, y_test)
    print('RMSE:', scores['rmse'])
    print('MAE:', scores['mae'])
    print('MSE:', scores['mse'])
    print('R2:', scores['r2'])
    print('================================')

    check_bias_variance(model, X_train, y_train, X_test, y_test, nRounds)


In [None]:
# Fit, score and decompose the plain linear regression.
generate_model(
    modelName='LinearRegression',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Fit, score and decompose the depth-limited decision tree.
generate_model(
    modelName='DecisionTreeRegressor',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Fit, score and decompose the Lasso regression.
generate_model(
    modelName='Lasso',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Fit, score and decompose the depth-limited random forest.
generate_model(
    modelName='RandomForestRegressor',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Fit, score and decompose the XGBoost regressor.
generate_model(
    modelName='XGBoost',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)