In [1]:
import pyodbc
from meyerDB import cable_connection
import PyQt5
import matplotlib.pyplot as plt
%matplotlib qt
import numpy as np
from IPython.display import HTML, display, clear_output

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeavePOut
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
import sklearn.preprocessing as pp
from scipy.optimize import curve_fit
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, RBF2, WhiteKernel, ConstantKernel, RationalQuadratic
from scipy.stats import norm

# init db connection
# `cable_connection` is imported from the project-local meyerDB module;
# presumably an ODBC connection string — TODO confirm.
conn = pyodbc.connect(cable_connection)
# Single shared cursor: the later cells issue all their queries and updates
# through this object.
cursor = conn.cursor()
print('Database connection ok')

def display_table(data):
    """Render a 2-D iterable as an HTML table in the notebook output.

    Every field for which ``round(100 * field, 1)`` works is shown as a
    percentage with one decimal and a decimal comma (``0.123`` -> ``12,3%``).
    Any other field (for example a string header) is inserted unchanged.

    Parameters
    ----------
    data : iterable of iterables
        Table rows; each row is an iterable of fields.
    """
    parts = ["<table>"]
    for row in data:
        parts.append("<tr>")
        for field in row:
            try:
                value = str(round(100*field, 1)).replace('.', ',') + '%'
            except Exception:
                # Not a number: show the field as it is.  `Exception` rather
                # than a bare `except` so Ctrl-C / SystemExit still propagate.
                value = field
            # Close the cell with </td> (the original emitted a second <td>),
            # and wrap the value in a 1-tuple so tuple fields format safely.
            parts.append("<td><h4>%s</h4></td>" % (value,))
        parts.append("</tr>")
    parts.append("</table>")
    display(HTML("".join(parts)))

def logifunc(x, x0, k, l, A):
    """Logistic S-curve ``l / (1 + A * exp(-k * (x - x0)))``.

    Parameters
    ----------
    x : scalar or array
        Point(s) at which to evaluate the curve.
    x0 : float
        Horizontal shift.
    k : float
        Growth rate.
    l : float
        Upper asymptote.
    A : float
        Multiplier of the exponential term.
    """
    decay = np.exp(-k*(x - x0))
    denominator = 1 + A*decay
    return l / denominator

# Notebook-wide matplotlib defaults: bold 14 pt text, legend in the upper right.
plt.rcParams.update({
    'font.size': 14,
    'font.weight': 'bold',
    'legend.loc': 'upper right',
})

Database connection ok


In [2]:
# get ensemble quantity

def get_ensemble_quantity(linear_pred, wk, ship, pred_ends=None, progresses=None):
    """Blend the linear total-quantity prediction with a progress-based one.

    Two per-ship tables are consulted, each a 2-D array whose first column is
    the week and whose second column is the value for that week:

    * ``pred_ends[ship]``  - value added to the current progress to form the
      progress-based prediction (presumably the predicted remaining quantity
      - TODO confirm against the code that writes ``pred_ends.npy``),
    * ``progresses[ship]`` - progress value (``cables``) at that week.

    The result is ``w1 * linear_pred + w2 * (cables + pred_end)`` with
    ``w2 = min((cables / linear_pred) ** 2, 1)`` and ``w1 = 1 - w2``.  When
    ``cables / linear_pred`` is below 0.002 the linear prediction is returned
    unchanged.

    Parameters
    ----------
    linear_pred : float
        Total quantity predicted by the linear model.
    wk : number
        Week to look up (matched against the first column of the tables).
    ship : hashable
        Key of the ship in both tables.
    pred_ends, progresses : mapping, optional
        Already-loaded tables keyed by ship.  When omitted they are read from
        ``pred_ends.npy`` / ``progress.npy`` exactly as before.  Passing them
        lets a caller that invokes this function in a loop load the files
        once instead of on every call.

    Returns
    -------
    float
        The blended prediction.

    Raises
    ------
    TypeError
        If ``ship`` is not a key of a table (``.get`` returns ``None``).
    IndexError
        If ``wk`` does not occur in the first column of a table.
    """
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # files that were produced by this project.
    if pred_ends is None:
        pred_ends = np.load('pred_ends.npy', allow_pickle=True).item()
    if progresses is None:
        progresses = np.load('progress.npy', allow_pickle=True).item()

    def _value_at_week(table):
        # Second column of the first row whose first column equals `wk`.
        return table[np.argwhere(table[:, 0] == wk)][0][0][1]

    pred_end = _value_at_week(pred_ends.get(ship))
    cables = _value_at_week(progresses.get(ship))

    # Too little progress to say anything: trust the linear model alone.
    if cables/linear_pred < 0.002:
        return linear_pred

    pred = cables + pred_end

    # The weight of the progress-based prediction grows quadratically with the
    # estimated completeness and is capped at 1.
    estimated_completeness = np.square(cables/linear_pred)
    if estimated_completeness > 1.0:
        estimated_completeness = 1.0
    w1, w2 = 1-estimated_completeness, estimated_completeness
    return (w1*linear_pred + w2*pred)/(w1+w2)



In [3]:
# Design progress
# Predictors:
# Time series
# Algorithms:
# S-curve fitting

def pplot():
    """Draw one animation frame of the current S-curve fit, then clear the axes.

    Reads the module-level variables set by the fitting loop: ``x``/``y`` (full
    series), ``xx``/``yy`` (part used for the fit), ``cut`` (number of points
    used for the fit) and ``params`` (logistic parameters).
    """
    y_top = 1.3*y[-1]
    remaining = slice(cut-1, None)

    # Green cross: first sample whose actual value exceeds 99 % of the final value.
    actual_done = np.argwhere(y > 0.99*y[-1])[0]
    plt.plot([x[actual_done]], [y[actual_done]], 'gx')

    # Black cross and grey guide lines: first sample where the fitted curve
    # exceeds 99 % of its asymptote (params[-2]); last sample if it never does.
    fitted = logifunc(x, *params)
    try:
        fitted_done = np.argwhere(fitted > 0.99*params[-2])[0]
    except IndexError:
        fitted_done = -1
    level = fitted[fitted_done]
    when = x[fitted_done]
    plt.plot([-100, 0], [level, level], 'grey')
    plt.plot([when, when], [0, y_top], 'grey')
    plt.plot([when], [level], 'kx')

    # Red: actual values after the cut; green: data used for the fit;
    # dashed blue: fitted curve over the part after the cut.
    plt.plot(x[remaining], y[remaining], 'r')
    plt.plot(xx, yy, 'g')
    plt.plot(x[remaining], logifunc(x[remaining], *params), 'b--')

    plt.ylim([0, y_top])
    plt.xlim([-100, 0])
    plt.pause(0.17)
    plt.cla()

def calc_error():
    """RMS error of the fitted curve on the samples from ``cut`` onwards.

    Uses the module-level ``x``, ``y``, ``cut`` and ``params``; the RMS
    residual is divided by the final value ``y[-1]``.
    """
    residuals = logifunc(x[cut:], *params) - y[cut:]
    rms = np.sqrt(np.square(residuals).mean())
    return rms/y[-1]

def moving_average(a, n=3):
    """Return the running mean of ``a`` over windows of ``n`` consecutive items.

    The result has ``len(a) - n + 1`` entries (empty when ``n > len(a)``).
    """
    # Prefix sums with a leading zero, so that the sum of every window is the
    # difference of two entries that lie n positions apart.
    prefix = np.concatenate(([0.0], np.cumsum(a, dtype=float)))
    window_sums = prefix[n:] - prefix[:-n]
    return window_sums / n

plt.rcParams['font.size'] = 14
plt.rcParams['font.weight'] = 'bold'

# Get the data: one row per project in `routed` with
# (project_id, gross_tonnage, prototype, ship_squares, sum(amount)).
cursor.execute("SELECT r.project_id, gross_tonnage, prototype, ship_squares, sum(amount) FROM routed as r"
    " LEFT JOIN projects as p ON p.project_id=r.project_id"
    " LEFT JOIN (SELECT project_id, sum(squares) as ship_squares FROM areas GROUP BY project_id) as a ON r.project_id=a.project_id"
    " GROUP BY r.project_id, gross_tonnage, prototype, ship_squares")
# NOTE(review): every column is cast to int32; a NULL produced by one of the
# LEFT JOINs would presumably make this cast fail — confirm the data has none.
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]    # project ids
gts = data[:, 1]      # gross tonnage per project
cables = data[:, -1]  # sum(amount) per project

# init lpo split: every combination of p ships is held out once as test set
p = 2 #ships
lpo = LeavePOut(p)

# Linear regression gross tonnage -> total cable amount.  This fit on all
# ships is replaced by a refit on the training ships at the start of every
# cross-validation split further down in this cell.
model = linear_model.LinearRegression()
model.fit(gts.reshape(-1, 1), cables)

# Rebuild progress.cables in the database: reset every row to 0, then copy in
# sum(amount) per (project_id, <wk column>) from ship_readiness, matching the
# chosen week column against progress.wk.
# NOTE: this writes to the database each time the cell is run.
# `wk` holds a column name here; the cross-validation loop further down reuses
# the same name as its week counter.
wk = 'pw'
cursor.execute("UPDATE progress SET cables=0")
cursor.execute(
    "UPDATE progress SET progress.cables=t1.cables FROM"
    " (SELECT project_id, {}, sum(amount) as cables FROM ship_readiness GROUP BY project_id, {}) t1"
    " WHERE t1.project_id=progress.project_id AND t1.{}=progress.wk".format(wk, wk, wk)
)
cursor.commit()

# Per-ship progress series: ship_data[ship] is an array of (wk, cables) rows
# ordered by week; only weeks with cables > 100 are kept.
# The loop rebinds the module-level `data` to the last fetched row list.
ship_data = {}
for ship in ships:
    cursor.execute("SELECT wk, cables FROM progress WHERE project_id={} and cables > 100 ORDER BY wk".format(ship))
    data = cursor.fetchall()
    ship_data[ship] = np.array(data)


# Leave-p-out cross-validation of the S-curve fit.
#   cv_errors    - mean error of each split
#   all_errs     - every single error (all splits, ships and weeks)
#   completeness - real completeness belonging to each entry of all_errs
cv_errors = []
all_errs = []
completeness = []
for train_i, test_i in lpo.split(ships):
    # Refit the tonnage -> cable amount regression on the training ships only.
    model.fit(gts[train_i].reshape(-1, 1), cables[train_i])
    errors = []
    
    for ship in ships[test_i]:
        gt = gts[np.argwhere(ships==ship)][0]
        linear_pred = model.predict(gt.reshape(1, -1))[0]

        # Column 0 is wk, column 1 is cables (see the query that fills ship_data).
        x = ship_data[ship][:, 0]
        ywk = ship_data[ship][:, 1]
        y = ywk.cumsum()
        
        goal = y[-1]
        # Replay the project week by week: fit on the data known up to `wk`,
        # score on the rest.  This rebinds the module-level `wk` ('pw') that
        # was set earlier in the cell.
        for wk in np.arange(-100, 1):
            pred = get_ensemble_quantity(linear_pred, wk, ship) #1e6# 
            try:
                # Number of samples with week <= wk (x is ordered by week).
                cut = np.argwhere(x <= wk)[-1][0] + 1
            except IndexError:
                # No data yet at this week.
                continue

            xx = x[:cut]
            yy = y[:cut]
            real_completeness = yy[-1]/goal
            # Skip the very beginning and the (almost) finished state.
            if real_completeness < 0.001: continue
            if real_completeness > 0.999: continue
            try:
                #params, _ = curve_fit(logifunc, xx, yy, bounds=([-100, 1e-3, 1e3-1, 1e-4], [-10, 2, 5e6+1, 100]))
                # The bounds on l (pred-1 .. pred+1) pin the asymptote of the
                # curve to the ensemble prediction; x0, k and A are fitted.
                # NOTE(review): per the SciPy docs curve_fit raises RuntimeError
                # when the least-squares fit fails; that case is not handled
                # here and would abort the whole run.
                params, _ = curve_fit(logifunc, xx, yy, bounds=([-100, 1e-3, pred-1, 1e-4], [-10, 2, pred+1, 100]))
            except ValueError:
                # Fall back to a fixed curve with the predicted asymptote.
                params = (-50, 1, pred, 1)
            except TypeError:
                continue
            clear_output(wait=True)

            # calc_error() and pplot() read x, y, xx, yy, cut and params as
            # module-level variables, so they must stay bound under these names.
            err = calc_error()

            errors.append(err)
            all_errs.append(err)
            completeness.append(real_completeness)

            pplot()

    # NOTE(review): if every week of both test ships was skipped, `errors` is
    # empty and the mean below is nan.
    errors = np.array(errors)
    cv_errors.append(errors.mean())
cv_errors = np.array(cv_errors)            
print(cv_errors.mean(), cv_errors.std(), cv_errors.max())


# Error as a function of the real completeness: sort all samples by
# completeness and smooth the errors with a moving average of n samples.
all_errs = np.array(all_errs)
completeness = np.array(completeness)
# Named `order` instead of `p`, so the LeavePOut size `p` is not overwritten
# by an index array.
order = np.argsort(completeness)
completeness = completeness[order]
all_errs = all_errs[order]
print(all_errs.shape)
n = 170
all_errs = moving_average(all_errs, n)


# moving_average drops the first n-1 positions, hence completeness[n-1:].
plt.plot(100*completeness[n-1:], 100*all_errs, 'r')
# The label is derived from n; it used to say 100 while the window was 170.
plt.legend(['Error - Moving average ({})'.format(n)])
#plt.title("S-Curve fitting with ensemble quantity prediction")
plt.xlabel("Completeness of cabling process (%)")
plt.ylabel('Error (%)')
plt.ylim(-10, 110)
plt.grid(which='major')
plt.savefig('cablingscurveensemble.png')




In [None]:
# Design progress
# Predictors:
# Time series
# Algorithms:
# Sequence summary

# NOTE(review): this always raises ZeroDivisionError, so nothing below it runs
# when the cell is executed (see the recorded cell output).  Presumably a
# deliberate guard that disables this experimental cell, whose code would
# otherwise rewrite progress.cables in the database — confirm before removing.
1/0
def calc_error(est, y):
    """Summed absolute deviation of ``est`` from ``y``, divided by ``y.sum()``."""
    deviation = est - y
    total_abs_deviation = abs(deviation).sum()
    return total_abs_deviation/y.sum()
def pplot():
    """Draw one animation frame of an S-curve fit, then clear the axes.

    Reads the module-level ``x``, ``y``, ``xx``, ``yy``, ``cut`` and ``params``.
    NOTE(review): this is a copy of the ``pplot`` defined in the S-curve
    fitting cell and is not called anywhere in this cell.
    """
    
    # Green cross: first sample whose value exceeds 99 % of the final value.
    idx = np.argwhere(y > 0.99*y[-1])[0]
    plt.plot([x[idx]], [y[idx]], 'gx')
    est = logifunc(x, *params)
    try:
        # First sample where the fitted curve exceeds 99 % of params[-2].
        idx = np.argwhere(est > 0.99*params[-2])[0]
    except IndexError:
        # The curve never gets there: use the last sample.
        idx = -1
    plt.plot([-100, 0], [est[idx], est[idx]], 'grey')
    plt.plot([x[idx], x[idx]], [0, 1.3*y[-1]], 'grey')
    plt.plot([x[idx]], [est[idx]], 'kx')
    # Red: values after the cut; green: xx/yy; dashed blue: fitted curve.
    plt.plot(x[cut-1:], y[cut-1:], 'r')
    plt.plot(xx, yy, 'g')
    plt.plot(x[cut-1:], logifunc(x[cut-1:], *params), 'b--')
    plt.ylim([0, 1.3*y[-1]])
    plt.xlim([-100, 0])
    # Show the frame briefly, then wipe the axes for the next one.
    plt.pause(0.17)
    plt.cla()

# NOTE(review): everything from here to the ship_data loop repeats the setup
# of the S-curve fitting cell; the only visible difference is that the
# ship_data query below has no "cables > 100" filter.
plt.rcParams['font.size'] = 14
plt.rcParams['font.weight'] = 'bold'

# Get the data: one row per project in `routed` with
# (project_id, gross_tonnage, prototype, ship_squares, sum(amount)).
cursor.execute("SELECT r.project_id, gross_tonnage, prototype, ship_squares, sum(amount) FROM routed as r"
    " LEFT JOIN projects as p ON p.project_id=r.project_id"
    " LEFT JOIN (SELECT project_id, sum(squares) as ship_squares FROM areas GROUP BY project_id) as a ON r.project_id=a.project_id"
    " GROUP BY r.project_id, gross_tonnage, prototype, ship_squares")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]    # project ids
gts = data[:, 1]      # gross tonnage per project
cables = data[:, -1]  # sum(amount) per project

# init lpo split: every combination of p ships is held out once as test set
p = 2 #ships
lpo = LeavePOut(p)

# NOTE(review): `model` is fitted here but not used again in this cell.
model = linear_model.LinearRegression()
model.fit(gts.reshape(-1, 1), cables)

# Rebuild progress.cables in the database from ship_readiness (reset to 0,
# then copy sum(amount) per project and week column), and commit.
wk = 'pw'
cursor.execute("UPDATE progress SET cables=0")
cursor.execute(
    "UPDATE progress SET progress.cables=t1.cables FROM"
    " (SELECT project_id, {}, sum(amount) as cables FROM ship_readiness GROUP BY project_id, {}) t1"
    " WHERE t1.project_id=progress.project_id AND t1.{}=progress.wk".format(wk, wk, wk)
)
cursor.commit()

# ship_data[ship]: array of (wk, cables) rows ordered by week, all weeks kept.
ship_data = {}
for ship in ships:
    cursor.execute("SELECT wk, cables FROM progress WHERE project_id={} ORDER BY wk".format(ship))
    data = cursor.fetchall()
    ship_data[ship] = np.array(data)

# Leave-p-out cross-validation of the "sequence summary" forecast: average the
# tonnage-normalised progress curves of the training ships and use that
# average curve to forecast each test ship.
cv_errors = []
for train_i, test_i in lpo.split(ships):
    errors = []
    # NOTE(review): `completeness` is reset here but not used in this cell.
    completeness = []
    # [sum of x, sum of y] over the training ships.  Adding a ship's series to
    # a length-101 accumulator needs series that broadcast against 101 entries
    # - presumably one row per week from -100 to 0; TODO confirm.
    forecast_function = [np.zeros(101), np.zeros(101)]
    for i in train_i:
        ship = ships[i]
        # NOTE(review): the week axis is divided by gross tonnage as well as
        # the cumulative amount - confirm that scaling the weeks is intended.
        x = ship_data[ship][:, 0]/gts[i]
        ywk = ship_data[ship][:, 1]
        y = ywk.cumsum()/gts[i]
        forecast_function = [forecast_function[0] + x, forecast_function[1] + y]
    # Sums -> means: ships.shape[0]-p is the number of training ships.
    forecast_function[0] = forecast_function[0]/(ships.shape[0]-p)
    forecast_function[1] = forecast_function[1]/(ships.shape[0]-p)
    for i in test_i:
        ship = ships[i]
        x = ship_data[ship][:, 0]/gts[i]

        ywk = ship_data[ship][:, 1]
        y = ywk.cumsum()
        forecast = []
        # For every (scaled) week of the test ship take the mean-curve value at
        # the nearest mean x and scale it back by the ship's gross tonnage.
        for xx in x:
            idx = np.abs(forecast_function[0]-xx).argmin()
            forecast.append(forecast_function[1][idx]*gts[i])

        errors.append(calc_error(np.array(forecast), y))
        

    errors = np.array(errors)
    cv_errors.append(errors.mean())
cv_errors = np.array(cv_errors)            
print(cv_errors.mean(), cv_errors.std(), cv_errors.max())


ZeroDivisionError: division by zero