In [1]:
import pyodbc
from meyerDB import cable_connection
import PyQt5
import matplotlib.pyplot as plt
%matplotlib qt
import numpy as np
from IPython.display import HTML, display, clear_output

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeavePOut
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
import sklearn.preprocessing as pp
from scipy.optimize import curve_fit
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, RBF2, WhiteKernel, ConstantKernel, RationalQuadratic
from scipy.stats import norm

# init db connection
conn = pyodbc.connect(cable_connection)
cursor = conn.cursor()
print('Database connection ok')

def display_table(data):
    html = "<table>"
    for row in data:
        html += "<tr>"
        for field in row:
            try:
                value = str(round(100*field, 1)).replace('.', ',') + '%'
            except:
                value = field
            html += "<td><h4>%s</h4><td>"%(value)
        html += "</tr>"
    html += "</table>"
    display(HTML(html))

def logifunc(x,x0,k,l, A):
    #l = 2300
    return l / (1 + A*np.exp(-k*(x - x0)))

Database connection ok


In [2]:
# get ensemble quantity

def get_ensemble_quantity(linear_pred, wk, ship):


    pred_ends = np.load('pred_ends.npy', allow_pickle=True)
    progresses = np.load('progress.npy', allow_pickle=True)


    data = pred_ends.item().get(ship)
    pred_end = data[np.argwhere(data[:, 0]==wk)][0][0][1]


    data = progresses.item().get(ship)
    cables = data[np.argwhere(data[:, 0]==wk)][0][0][1]

    estimated_completeness = cables/linear_pred
    if estimated_completeness < 0.002: return linear_pred

    pred = cables + pred_end

    estimated_completeness = np.square(cables/linear_pred)
    if estimated_completeness > 1.0: estimated_completeness = 1.0
    w1, w2 = 1-estimated_completeness, estimated_completeness
    avpred = (w1*linear_pred + w2*pred)/(w1+w2)
    return avpred



In [3]:
# Design progress
# Predictors:
# Time series, GT
# Algorithms:
# Gaussian process regression

def moving_average(a, n=3) :
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

def calc_error():

    res = y_mean[xidxs_inv] - y[xidxs_inv]
    err = abs(res).sum()/(y[xidxs_inv]).sum()
    return err
    
def pplot():
    plt.plot(x[xidxs]*gt, y[xidxs]*gt, 'g')
    plt.plot(x[xidxs_inv]*gt, y[xidxs_inv]*gt, 'r')
    plt.plot(x[xidxs_inv]*gt, y_mean[xidxs_inv]*gt, 'b--')
    plt.plot(x[xidxs_inv]*gt, ycov[xidxs_inv]*gt, 'r--')
    plt.ylim([0, 4000000])
    plt.xlim([-100, 0])
    plt.pause(0.17)
    plt.cla()

# Get the data
cursor.execute("SELECT r.project_id, gross_tonnage, sum(amount) FROM routed as r"
    " LEFT JOIN projects as p ON p.project_id=r.project_id"
    " GROUP BY r.project_id, gross_tonnage")
data = np.array(cursor.fetchall())
ships = data[:, 0].astype('int32')
gts = data[:, 1]
cables = data[:, -1]
# init lpo split
p = 2 #ships
lpo = LeavePOut(p)


wk = 'pw'
cursor.execute("UPDATE progress SET cables=0")
cursor.execute(

    "UPDATE progress SET progress.cables=t1.cables FROM"
    " (SELECT project_id, {}, sum(amount) as cables FROM ship_readiness GROUP BY project_id, {}) t1"
    " WHERE t1.project_id=progress.project_id AND t1.{}=progress.wk".format(wk, wk, wk)
)
cursor.commit()

# get progress data
ship_data = {}
for i in range(ships.shape[0]):
    ship = ships[i]
    gt = gts[i]
    cursor.execute("SELECT wk, cables FROM progress WHERE project_id={} AND cables > 100 ORDER BY wk".format(ship))
    data = cursor.fetchall()
    ship_data[ship] = np.array(data)/gt

# init gaussian process regressor
kernel = ConstantKernel(constant_value=0.1, constant_value_bounds='fixed') * RBF(
    length_scale=0.0001, length_scale_bounds='fixed')
gp = GaussianProcessRegressor(kernel=kernel,
                            alpha=0.5)


cv_errors = []
for train_i, test_i in lpo.split(ships):
    completeness = []
    prior_x = []
    prior_y = []
    for i in range(11):
        if i in test_i: continue
        ship = ships[i]
        prior_x.append(ship_data[ship][:, 0])
        prior_y.append(ship_data[ship][:, 1])
    prior_y = np.concatenate(prior_y, axis=0)
    prior_x = np.concatenate(prior_x, axis=0)

    # search hyperparamters
    lengths = np.linspace(0.00001, 0.0005, 30)
    constants = np.linspace(0.01, 2.0, 30)
    noises = np.linspace(0.01, 1.0, 30)
    repeats = np.arange(1, 13, 3)
    best_error = None
    best_params = None
    errors = []
    for iii in range(0):
            # init gaussian process regressor
        l = np.random.choice(lengths)
        c = np.random.choice(constants)
        noise = np.random.choice(noises)
        r = np.random.choice(repeats)
        kernel = ConstantKernel(constant_value=c, constant_value_bounds='fixed') * RBF(
            length_scale=l, length_scale_bounds='fixed')
        gp = GaussianProcessRegressor(kernel=kernel,
                                    alpha=noise)
        for i in np.random.choice(train_i, 5):
            ship = ships[i]
            gt = gts[i]
            x = ship_data[ship][:, 0]
            for wk in np.arange(-100, 1):
                wks = wk/gts[i]
                y = ship_data[ship][:, 1]
                try:
                    xidxs = np.argwhere(x <= wks)[:, 0]
                    if xidxs.shape[0] == 0: continue
                    xidxs_inv = np.argwhere(x > wks)[:, 0]
                    if xidxs_inv.shape[0] == 0: continue
                    prior_idxs = np.argwhere(prior_x > wks)[:, 0]
                except IndexError:
                    continue
                estimated_completeness = y[xidxs].sum()
                if estimated_completeness < 0.001: continue
                real_completeness = y[xidxs].sum()/y.sum()
                if real_completeness > 0.999: continue
                X = np.hstack((np.repeat(x[xidxs], r), prior_x[prior_idxs]))
                Y = np.hstack((np.repeat(y[xidxs], r), prior_y[prior_idxs]))
                gp.fit(X.reshape(-1, 1), Y.reshape(-1, 1))
                y_mean, y_cov = gp.predict(x.reshape(-1, 1), return_cov=True)
                y = y.cumsum()
                y_mean = np.hstack((y[xidxs], y_mean[xidxs_inv].cumsum() + y[xidxs][-1]*np.ones(xidxs_inv.shape[0])))
                clear_output(wait=True)
                err = calc_error()
                errors.append(err)
        errors = np.array(errors)
        err = errors.mean()
        if best_error is None or best_error > err:
            best_error = err
            best_params = (l, c, noise, r)
        errors = []
    (l, c, noise, r) = (0.0002103448275862069, 1.4510344827586208, 0.14655172413793105, 20)#best_params 
    kernel = ConstantKernel(constant_value=c, constant_value_bounds='fixed') * RBF(
        length_scale=l, length_scale_bounds='fixed')
    gp = GaussianProcessRegressor(kernel=kernel,
                                alpha=noise)
    for i in test_i:
        ship = ships[i]
        if ship != 1394: continue
        gt = gts[i]
        x = ship_data[ship][:, 0]
        for wk in np.arange(-100, 1):
            wks = wk/gts[i]
            y = ship_data[ship][:, 1]
            try:
                xidxs = np.argwhere(x <= wks)[:, 0]
                if xidxs.shape[0] == 0: continue
                xidxs_inv = np.argwhere(x > wks)[:, 0]
                if xidxs_inv.shape[0] == 0: continue
                prior_idxs = np.argwhere(prior_x > wks)[:, 0]
            except IndexError:
                continue
            estimated_completeness = y[xidxs].sum()
            if estimated_completeness < 0.001: continue
            real_completeness = y[xidxs].sum()/y.sum()
            if real_completeness > 0.999: continue
            X = np.hstack((np.repeat(x[xidxs], r), prior_x[prior_idxs]))
            Y = np.hstack((np.repeat(y[xidxs], r), prior_y[prior_idxs]))
            gp.fit(X.reshape(-1, 1), Y.reshape(-1, 1))
            y_mean, y_cov = gp.predict(x.reshape(-1, 1), return_cov=True)
            
            y = y.cumsum()
            y_mean = np.hstack((y[xidxs], y_mean[xidxs_inv].cumsum() + y[xidxs][-1]*np.ones(xidxs_inv.shape[0])))
            ycov = 300*np.diag(y_cov) + y_mean

            clear_output(wait=True)
            err = calc_error()
            errors.append(err)
            #completeness.append(real_completeness)
            pplot()


    errors = np.array(errors)
    cv_errors.append(errors.mean())
cv_errors = np.array(cv_errors)            
print(cv_errors.mean(), cv_errors.std(), cv_errors.max())


#(0.000263448275862069, 1.5196551724137932, 0.41965517241379313, 7)
#(0.00014517241379310343, 0.21586206896551724, 0.11241379310344828, 10)
#(1e-05, 0.21586206896551724, 0.6586206896551725, 4)
#(0.0003479310344827586, 1.7941379310344827, 0.4537931034482759, 1)
#(6.0689655172413786e-05, 1.1079310344827586, 0.4537931034482759, 7)
#(0.0003817241379310345, 1.2451724137931035, 0.14655172413793105, 10)
#(0.000246551724137931, 0.9020689655172414, 0.2831034482758621, 7)
#(0.00044931034482758616, 0.9706896551724138, 0.38551724137931037, 1)
#(2.6896551724137928e-05, 0.14724137931034484, 0.8975862068965518, 4)
#(0.0002803448275862069, 1.4510344827586208, 0.14655172413793105, 4)
#(0.00043241379310344824, 0.6962068965517242, 0.6927586206896552, 1)
#(4.379310344827586e-05, 0.4903448275862069, 0.2831034482758621, 1)
#(9.448275862068964e-05, 1.176551724137931, 0.5562068965517242, 4)
#(0.00014517241379310343, 1.176551724137931, 0.8975862068965518, 7)
#(0.00012827586206896552, 0.42172413793103447, 0.863448275862069, 7)
#(0.00044931034482758616, 1.9313793103448276, 0.11241379310344828, 1)
#(0.00016206896551724135, 0.21586206896551724, 0.9317241379310346, 4)
#(9.448275862068964e-05, 0.9020689655172414, 0.01, 1)


  cv_errors.append(errors.mean())
  ret = ret.dtype.type(ret / rcount)
