In [1]:
import pyodbc
from meyerDB import cable_connection
import PyQt5
import matplotlib.pyplot as plt
%matplotlib qt
import numpy as np
from IPython.display import HTML, display, clear_output

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeavePOut
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
import sklearn.preprocessing as pp
from scipy.optimize import curve_fit
 

# Open the database connection for the notebook.
# `cable_connection` is imported from meyerDB and handed to pyodbc.connect as-is.
conn = pyodbc.connect(cable_connection)
# Module-level cursor; the later cells run all of their SQL through it.
cursor = conn.cursor()
print('Database connection ok')

def display_table(data):
    """Render ``data`` as an HTML table in the notebook output.

    A field for which ``round(100*field, 1)`` succeeds is shown as a
    percentage with a decimal comma (``0.123`` -> ``12,3%``). Any other
    field (text, ``None``, ...) is shown unchanged.

    Parameters
    ----------
    data : iterable of iterables
        Table rows; each inner iterable holds the fields of one row.

    Notes
    -----
    Field values are inserted into the markup without HTML escaping, so
    only pass trusted content.
    """
    parts = ["<table>"]
    for row in data:
        parts.append("<tr>")
        for field in row:
            try:
                value = str(round(100*field, 1)).replace('.', ',') + '%'
            except Exception:
                # Not usable as a number: show the raw field. `Exception`
                # (not a bare except) lets KeyboardInterrupt/SystemExit through.
                value = field
            # The cell must be closed with </td>; a second <td> here opens an
            # extra empty cell after every value. The 1-tuple keeps
            # tuple-valued fields from being unpacked by the % operator.
            parts.append("<td><h4>%s</h4></td>" % (value,))
        parts.append("</tr>")
    parts.append("</table>")
    display(HTML("".join(parts)))

def logifunc(x, x0, k, l, A):
    """Logistic (S-shaped) curve ``l / (1 + A*exp(-k*(x - x0)))``.

    Parameters
    ----------
    x : scalar or array
        Point(s) at which the curve is evaluated.
    x0 : scalar
        Horizontal shift of the curve.
    k : scalar
        Steepness of the curve.
    l : scalar
        Numerator of the expression (the value approached as the
        exponential term vanishes).
    A : scalar
        Multiplier of the exponential term.

    Returns
    -------
    Same shape as ``x``: the curve value(s).
    """
    decay = np.exp(-k*(x - x0))
    denominator = 1 + A*decay
    return l / denominator

Database connection ok


In [2]:
# get ensemble quantity

def get_ensemble_quantity(linear_pred, wk, ship):
    """Blend ``linear_pred`` with a progress-based estimate for one ship and week.

    Reads ``pred_ends.npy`` and ``progress.npy`` from the working directory.
    Each file holds a pickled mapping ``ship -> 2-D array`` whose first
    column is matched against ``wk`` and whose second column is the value
    used (presumably remaining and already-registered cable amounts --
    confirm against the code that writes the files).

    Parameters
    ----------
    linear_pred : float
        Quantity predicted by the linear model.
    wk : number
        Week to look up in both tables.
    ship : hashable
        Key of the ship in both mappings.

    Returns
    -------
    float
        ``linear_pred`` unchanged when ``cables/linear_pred < 0.002``.
        Otherwise a weighted mean of ``linear_pred`` and
        ``cables + pred_end``, where the weight of the latter is
        ``(cables/linear_pred)**2`` capped at 1.

    Raises
    ------
    IndexError
        If ``wk`` is not present in a table.
    TypeError
        If ``ship`` is not present in a mapping (``.get`` returns ``None``).
    """
    def _value_for_week(table):
        # Second column of the first row whose first column equals ``wk``.
        row = np.argwhere(table[:, 0] == wk)[0][0]
        return table[row][1]

    # NOTE(review): allow_pickle=True unpickles arbitrary objects -- only
    # load files produced by this project.
    pred_ends = np.load('pred_ends.npy', allow_pickle=True)
    progresses = np.load('progress.npy', allow_pickle=True)

    pred_end = _value_for_week(pred_ends.item().get(ship))
    cables = _value_for_week(progresses.item().get(ship))

    ratio = cables/linear_pred
    if ratio < 0.002:
        # Too little progress to say anything: keep the linear prediction.
        return linear_pred

    progress_weight = np.square(ratio)
    if progress_weight > 1.0:
        progress_weight = 1.0
    linear_weight = 1 - progress_weight

    progress_pred = cables + pred_end
    blended = linear_weight*linear_pred + progress_weight*progress_pred
    return blended/(linear_weight + progress_weight)





In [3]:
# Design progress
# Predictors:
# Time series
# Algorithms:
# S-curve fitting

def pplot():
    """Draw one frame of the S-curve fit on the current axes, then clear them.

    Takes no arguments: it reads the module-level names ``x``, ``y``,
    ``xx``, ``yy``, ``cut`` and ``params``, which must be set before the
    call, and evaluates the fit with ``logifunc``.
    """
    # Green cross: first observation above 99 % of the last observed value.
    reached = np.argwhere(y > 0.99*y[-1])[0]
    plt.plot([x[reached]], [y[reached]], 'gx')

    # Fitted curve over all x. params[-2] is the ``l`` argument of logifunc.
    fitted = logifunc(x, *params)
    try:
        level = np.argwhere(fitted > 0.99*params[-2])[0]
    except IndexError:
        # The fit never exceeds the threshold: use the last sample instead.
        level = -1

    y_top = 1.3*y[-1]
    # Grey guide lines and black cross at the selected point of the fit.
    plt.plot([-100, 0], [fitted[level], fitted[level]], 'grey')
    plt.plot([x[level], x[level]], [0, y_top], 'grey')
    plt.plot([x[level]], [fitted[level]], 'kx')

    # Red: observations from index cut-1 on; green: the xx/yy series;
    # dashed blue: the fit over the same range as the red line.
    tail = slice(cut-1, None)
    plt.plot(x[tail], y[tail], 'r')
    plt.plot(xx, yy, 'g')
    plt.plot(x[tail], logifunc(x[tail], *params), 'b--')

    plt.ylim([0, y_top])
    plt.xlim([-100, 0])
    # Show the frame briefly, then wipe the axes for the next call.
    plt.pause(0.17)
    plt.cla()

def revcum(x):
    """Undo a cumulative sum: return the step-by-step increments of ``x``.

    Each element has its predecessor subtracted; the first element is kept
    as it is (its predecessor is taken to be 0).

    Parameters
    ----------
    x : array-like
        Cumulative values.

    Returns
    -------
    Increments, same length as ``x``. An empty input gives an empty array
    (previously this raised ``IndexError`` from ``np.delete``).
    """
    if np.size(x) == 0:
        return np.array(x)
    # x shifted one position to the right, with 0 entering on the left.
    previous = np.insert(np.delete(x, -1), 0, 0)
    return x - previous


# Larger, bold text for every figure drawn after this point.
plt.rcParams.update({'font.size': 14, 'font.weight': 'bold'})

# Get the data: one row per project with gross tonnage, prototype flag,
# summed area squares and the summed cable amount from `routed`.
cursor.execute(
    "SELECT r.project_id, gross_tonnage, prototype, ship_squares, sum(amount) FROM routed as r"
    " LEFT JOIN projects as p ON p.project_id=r.project_id"
    " LEFT JOIN (SELECT project_id, sum(squares) as ship_squares FROM areas GROUP BY project_id) as a ON r.project_id=a.project_id"
    " GROUP BY r.project_id, gross_tonnage, prototype, ship_squares"
)
data = np.array(cursor.fetchall()).astype('int32')
# First column: project id, second: gross tonnage, last: summed amount.
ships, gts, cables = data[:, 0], data[:, 1], data[:, -1]

# Leave-p-out splitter: each split holds out `p` ships.
p = 2 #ships
lpo = LeavePOut(p=p)

# Linear model: gross tonnage (single feature column) -> total cable amount.
# Fitted on all ships here.
model = linear_model.LinearRegression()
model.fit(gts[:, np.newaxis], cables)

# Name of the week column of ship_readiness that is aggregated below.
# NOTE: `wk` is rebound to integer week numbers by the cross-validation loop
# further down, so it no longer holds this column name after that loop ran.
wk = 'pw'
# Rebuild progress.cables from scratch: zero every row first ...
cursor.execute("UPDATE progress SET cables=0")
# ... then copy in sum(amount) per (project_id, week) from ship_readiness,
# matching the chosen week column against progress.wk.
cursor.execute(
    "UPDATE progress SET progress.cables=t1.cables FROM"
    " (SELECT project_id, {}, sum(amount) as cables FROM ship_readiness GROUP BY project_id, {}) t1"
    " WHERE t1.project_id=progress.project_id AND t1.{}=progress.wk".format(wk, wk, wk)
)
# Persist both updates.
cursor.commit()

# Weekly progress per ship: ship id -> array of (wk, cables) rows ordered by
# week, restricted to weeks with more than 100 cables.
ship_data = {}
for ship in ships:
    # Bind the id as a query parameter instead of formatting it into the SQL
    # text. int() turns the numpy int32 into a plain Python int for binding.
    cursor.execute(
        "SELECT wk, cables FROM progress WHERE project_id=? and cables > 100 ORDER BY wk",
        int(ship),
    )
    data = cursor.fetchall()
    ship_data[ship] = np.array(data)

# Leave-p-out cross-validation of the S-curve forecast.
# For each split the linear model is refitted on the training ships. For each
# held-out ship and each week -100..0 a logistic curve is fitted to the
# progress seen so far and scored on the remaining weeks.
cv_errors = []
for train_i, test_i in lpo.split(ships):
    model.fit(gts[train_i].reshape(-1, 1), cables[train_i])
    errors = []
    completeness = []
    for ship in ships[test_i]:
        gt = gts[np.argwhere(ships==ship)][0]
        linear_pred = model.predict(gt.reshape(1, -1))[0]

        # x: week numbers, ywk: cables per week, y: cumulative cables.
        x = ship_data[ship][:, 0]
        ywk = ship_data[ship][:, 1]
        y = ywk.cumsum()

        goal = y[-1]
        for wk in np.arange(-100, 1):
            # cut = number of observations at or before week `wk`.
            try:
                cut = np.argwhere(x <= wk)[-1][0] + 1
            except IndexError:
                # No observation yet at this week.
                continue

            xx = x[:cut]
            yy = y[:cut]
            estimated_completeness = yy[-1]/linear_pred
            if estimated_completeness < 0.001: continue
            if yy[-1]/goal > 0.999: continue
            real_completeness = yy[-1]/goal

            # Looked up only for weeks that are actually fitted: each call
            # reads two files, and `pred` is not needed for skipped weeks.
            pred = get_ensemble_quantity(linear_pred, wk, ship)
            try:
                # Bounds are in logifunc order (x0, k, l, A); `l` is pinned to
                # pred +/- 1, i.e. effectively fixed to the ensemble prediction.
                # `_pcov` (not `_`) keeps IPython's last-output variable intact.
                params, _pcov = curve_fit(logifunc, xx, yy, bounds=([-100, 1e-3, pred-1, 1e-4], [-10, 2, pred+1, 100]))
            except (ValueError, RuntimeError):
                # Fit rejected or did not converge: fall back to a fixed curve.
                params = (-50, 1, pred, 1)
            clear_output(wait=True)
            # Absolute difference between the summed fitted and the summed
            # observed cumulative values over the weeks after the cut.
            err = abs(logifunc(x[cut:], *params).sum() - y[cut:].sum())
            errors.append(err)
            completeness.append(real_completeness)

            # pplot reads x, y, xx, yy, cut and params from module scope.
            pplot()

    errors = np.array(errors)
    if errors.size == 0:
        # Every week of the held-out ships was skipped; 0/0 would put a nan
        # into the summary below, so leave this split out.
        continue
    # Mean absolute error over all scored weeks, normalised by sum(y).
    # NOTE(review): despite the name this is not an RMSE, and `y` is the
    # cumulative series of the LAST held-out ship only, although `errors`
    # covers every held-out ship -- confirm this is intended.
    rmse = (errors.sum()/errors.shape[0])/y.sum()
    cv_errors.append(rmse)
cv_errors = np.array(cv_errors)
print(cv_errors.mean(), cv_errors.std(), cv_errors.max())


