In [6]:
from bokeh.io import output_notebook
from bokeh.plotting import figure
from bokeh.io import output_notebook, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, Panel
from bokeh.models.widgets import Tabs
# Direct Bokeh output to the notebook so that show() renders plots inline.
output_notebook()

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# Turn off pandas' chained-assignment check (no SettingWithCopy warning is emitted).
pd.set_option("mode.chained_assignment", None)

In [8]:
# Load the raw data set.
# The relative path uses forward slashes so it resolves on Windows as well as
# on Linux/macOS (a backslash-separated path is only valid on Windows).
df = pd.read_csv("csv-files/fedex/test_data.csv", header=0, low_memory=False)
print(df.shape)

(18000, 26)


In [9]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   y       18000 non-null  float64
 1   x0      18000 non-null  float64
 2   x1      18000 non-null  float64
 3   x2      18000 non-null  float64
 4   x3      18000 non-null  float64
 5   x4      18000 non-null  float64
 6   x5      18000 non-null  float64
 7   x6      18000 non-null  float64
 8   x7      18000 non-null  float64
 9   x8      18000 non-null  float64
 10  x9      18000 non-null  float64
 11  x10     18000 non-null  float64
 12  x11     18000 non-null  float64
 13  x12     18000 non-null  float64
 14  x13     18000 non-null  float64
 15  x14     18000 non-null  float64
 16  x15     18000 non-null  float64
 17  x16     9062 non-null   float64
 18  x17     15186 non-null  float64
 19  x18     18000 non-null  float64
 20  x19     18000 non-null  float64
 21  x20     18000 non-null  float64
 22

In [10]:
# Drop every row that contains at least one null value.
# NOTE(review): per the info() summary, x16 has only 9062 non-null entries out
# of 18000, so this discards at least 8938 rows. df is overwritten here, so the
# raw frame is no longer available afterwards -- confirm both are intended.
df = df.dropna()

def _interval_decimals(edges):
    """Return how many decimals are needed to tell neighbouring bin edges apart."""
    widths = np.diff(edges)
    widths = widths[widths > 0]
    if widths.size == 0:
        return 0
    # One digit more than the order of magnitude of the narrowest bin.
    return max(0, int(np.ceil(-np.log10(widths.min()))) + 1)


def hist_interact(dataframe, column, colors=("SteelBlue", "Tan"), bins=30, log_scale=False, show_plot=True):
    """Draw an interactive Bokeh histogram of one column with a hover tool.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the data to be binned.
    column : str
        Name of the column to histogram; also used (capitalised) for the
        title and the x-axis label.
    colors : sequence of two colour names
        ``colors[0]`` fills the bars, ``colors[1]`` fills the hovered bar.
    bins : int or sequence
        Passed straight to ``numpy.histogram``.
    log_scale : bool
        When truthy, bar heights are the natural log of the bin counts.
    show_plot : bool
        When truthy the plot is shown and ``None`` is returned; otherwise the
        figure is returned so that the caller can embed it elsewhere.

    Returns
    -------
    bokeh figure or None
    """
    # build histogram data with Numpy
    counts, edges = np.histogram(dataframe[column], bins=bins)
    hist_df = pd.DataFrame({column: counts,
                            "left": edges[:-1],
                            "right": edges[1:]})

    # Label each bin with enough decimals for its width; formatting the edges
    # as integers would collapse every label to e.g. "0 to 0" for data that
    # spans less than one unit.
    decimals = _interval_decimals(edges)
    hist_df["interval"] = ["%.*f to %.*f" % (decimals, left, decimals, right)
                           for left, right in zip(hist_df["left"], hist_df["right"])]

    if log_scale:
        # Empty bins keep a height of 0 instead of log(0) = -inf.
        float_counts = counts.astype(float)
        hist_df["log"] = np.log(float_counts,
                                out=np.zeros_like(float_counts),
                                where=float_counts > 0)
        top_field, y_label = "log", "Log Count"
    else:
        top_field, y_label = column, "Count"

    # bokeh histogram with hover tool
    # NOTE(review): plot_height / plot_width are the Bokeh 2.x keyword names,
    # kept to match the Bokeh version this notebook appears to target.
    src = ColumnDataSource(hist_df)
    plot = figure(plot_height=600, plot_width=600,
                  title="Histogram of {}".format(column.capitalize()),
                  x_axis_label=column.capitalize(),
                  y_axis_label=y_label)
    plot.quad(bottom=0, top=top_field, left="left",
              right="right", source=src, fill_color=colors[0],
              line_color="black", fill_alpha=0.7,
              hover_fill_alpha=1.0, hover_fill_color=colors[1])

    # hover tool; the braces let the field reference work for column names
    # that contain spaces or other special characters
    hover = HoverTool(tooltips=[('Interval', '@interval'),
                                ('Count', "@{" + column + "}")])
    plot.add_tools(hover)

    # output
    if show_plot:
        show(plot)
        return None
    return plot


In [11]:
# Interactive histogram of the dependent variable y, using 25 bins.
hist_interact(df, "y", bins=25)

In [15]:
# Separate the feature matrix (everything except "y") from the target series
X = df.drop(columns="y")
y = df["y"]

# Test-set MSE per scaling strategy; each entry stays None until it is computed
results = {"std_scaler": None, "minmax_scaler": None}

In [16]:
# Standard scaling outcome
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split, so test-set statistics leak into the scaling -- confirm
# whether that is acceptable for this comparison.
ss = StandardScaler()
Xss = ss.fit_transform(X)

Xss_train, Xss_test, y_train, y_test = train_test_split(Xss, y, test_size=0.25, random_state=13)

# The loss is left at the estimator default (least squares). The explicit name
# 'ls' was deprecated in scikit-learn 1.0 and later removed, while the default
# means the same thing on every version.
# random_state makes the fitted model reproducible between runs.
params = {'n_estimators': 500,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'random_state': 13}

gbr_regr = GradientBoostingRegressor(**params)
gbr_regr.fit(Xss_train, y_train)

# Mean squared error on the held-out 25 % of the rows
mse_res = mean_squared_error(y_test, gbr_regr.predict(Xss_test))
results["std_scaler"] = mse_res

In [17]:
# MinMax scaling outcome
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split, so test-set statistics leak into the scaling -- confirm
# whether that is acceptable for this comparison.
mms = MinMaxScaler()
Xmms = mms.fit_transform(X)

# Split the min-max scaled matrix. Splitting the standard-scaled matrix here
# would make this cell silently repeat the standard-scaler experiment.
Xmms_train, Xmms_test, y_train, y_test = train_test_split(Xmms, y, test_size=0.25, random_state=13)

# The loss is left at the estimator default (least squares). The explicit name
# 'ls' was deprecated in scikit-learn 1.0 and later removed, while the default
# means the same thing on every version.
# random_state makes the fitted model reproducible between runs.
params = {'n_estimators': 500,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'random_state': 13}

gbr_regr = GradientBoostingRegressor(**params)
gbr_regr.fit(Xmms_train, y_train)

# Mean squared error on the held-out 25 % of the rows.
# NOTE(review): tree ensembles are presumably close to insensitive to
# monotonic feature rescaling, so a near-identical MSE to the standard-scaler
# run would not be surprising -- verify after re-running.
mse_res = mean_squared_error(y_test, gbr_regr.predict(Xmms_test))
results["minmax_scaler"] = mse_res

In [18]:
# One report line per scaling strategy, joined into a single printable message
msg = "\n".join(
    f"The mean squared error (MSE) for the {scaler_name} was: {scaler_mse:.4f}"
    for scaler_name, scaler_mse in results.items()
)
print(msg)

The mean squared error (MSE) for the std_scaler was: 0.0015
The mean squared error (MSE) for the minmax_scaler was: 0.0015
