In [59]:
import pyodbc
from meyerDB import cable_connection
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML, display
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeavePOut
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_absolute_error as mae
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
import sklearn.preprocessing as pp

# init db connection
conn = pyodbc.connect(cable_connection)
cursor = conn.cursor()
print('Database connection ok')

def display_table(data):
    html = "<table>"
    for row in data:
        html += "<tr>"
        for field in row:
            try:
                value = str(round(100*field, 1)).replace('.', ',') + '%'
            except:
                value = field
            html += "<td><h4>%s</h4><td>"%(value)
        html += "</tr>"
    html += "</table>"
    display(HTML(html))

Database connection ok


In [20]:
# Whole ship cabling progress (weekly)
# Predictors:
# GT
# Algorithms:
# Mean

# Get the data

#get ship data
cursor.execute("SELECT project_id, sum(amount) FROM routed GROUP BY project_id")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
scores = []
for train_i, test_i in lpo.split(ships):
    s1 = ships[test_i[0]]
    s2 = ships[test_i[1]]

    cursor.execute(
        "SELECT predictions.rw, predictions.cables_per_week as preds, test_projects.cables_per_week as reals FROM"
        " (SELECT rw, avg(cable_count) as cables_per_week FROM("
        " SELECT rw, ship_readiness.project_id, sum(amount)/cast(totals.total as float) as cable_count FROM ship_readiness"
        " LEFT JOIN (SELECT project_id, sum(amount) as total FROM routed WHERE running LIKE 'Y%' GROUP BY project_id) totals"
        " ON totals.project_id=ship_readiness.project_id"
        " WHERE rw IS NOT NULL AND running LIKE 'Y%' GROUP BY ship_readiness.project_id, totals.total, rw) cable_counts"
        " WHERE cable_counts.project_id <> {} AND cable_counts.project_id <> {}"
        " GROUP BY rw) predictions"
        " LEFT JOIN ("
        " SELECT rw, avg(cable_count) as cables_per_week FROM("
        " SELECT rw, ship_readiness.project_id, sum(amount)/cast(totals.total as float) as cable_count FROM ship_readiness"
        " LEFT JOIN (SELECT project_id, sum(amount) as total FROM routed WHERE running LIKE 'Y%' GROUP BY project_id) totals"
        " ON totals.project_id=ship_readiness.project_id"
        " WHERE rw IS NOT NULL AND running LIKE 'Y%' GROUP BY ship_readiness.project_id, totals.total, rw) cable_counts"
        " WHERE cable_counts.project_id = {} OR cable_counts.project_id = {}"
        " GROUP BY rw) test_projects"
        " ON test_projects.rw=predictions.rw"
        " WHERE predictions.cables_per_week IS NOT NULL AND test_projects.cables_per_week IS NOT NULL"
        " ORDER BY rw"
        .format(s1,s2,s1,s2)
        )
    data = np.array(cursor.fetchall()).astype('float32')
    x = data[:, 0]
    preds = data[:, 1]
    y_test = data[:, -1]
    #plt.plot(x, preds, 'r')
    #plt.plot(x, y_test, 'bo')
    #plt.show()
    score = mape(y_test, preds)
    scores.append(score)

scores = np.array(scores)
table_rows.append(('Mean', 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)

0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Mean,,"-421,8%",,"316,0%",,"9,3%",,"-1302,3%",


In [21]:
# Whole ship cabling progress (monthly)
# Predictors:
# GT
# Algorithms:
# Mean

# Get the data

#get ship data
cursor.execute("SELECT project_id, sum(amount) FROM routed GROUP BY project_id")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
scores = []
for train_i, test_i in lpo.split(ships):
    s1 = ships[test_i[0]]
    s2 = ships[test_i[1]]

    cursor.execute(
        "SELECT predictions.rm, predictions.cables_per_week as preds, test_projects.cables_per_week as reals FROM"
        " (SELECT rm, avg(cable_count) as cables_per_week FROM("
        " SELECT rm, ship_readiness.project_id, sum(amount)/cast(totals.total as float) as cable_count FROM ship_readiness"
        " LEFT JOIN (SELECT project_id, sum(amount) as total FROM routed WHERE running LIKE 'Y%' GROUP BY project_id) totals"
        " ON totals.project_id=ship_readiness.project_id"
        " WHERE rm IS NOT NULL AND running LIKE 'Y%' GROUP BY ship_readiness.project_id, totals.total, rm) cable_counts"
        " WHERE cable_counts.project_id <> {} AND cable_counts.project_id <> {}"
        " GROUP BY rm) predictions"
        " LEFT JOIN ("
        " SELECT rm, avg(cable_count) as cables_per_week FROM("
        " SELECT rm, ship_readiness.project_id, sum(amount)/cast(totals.total as float) as cable_count FROM ship_readiness"
        " LEFT JOIN (SELECT project_id, sum(amount) as total FROM routed WHERE running LIKE 'Y%' GROUP BY project_id) totals"
        " ON totals.project_id=ship_readiness.project_id"
        " WHERE rm IS NOT NULL AND running LIKE 'Y%' GROUP BY ship_readiness.project_id, totals.total, rm) cable_counts"
        " WHERE cable_counts.project_id = {} OR cable_counts.project_id = {}"
        " GROUP BY rm) test_projects"
        " ON test_projects.rm=predictions.rm"
        " WHERE predictions.cables_per_week IS NOT NULL AND test_projects.cables_per_week IS NOT NULL"
        " ORDER BY rm"
        .format(s1,s2,s1,s2)
        )
    data = np.array(cursor.fetchall()).astype('float32')
    x = data[:, 0]
    preds = data[:, 1]
    y_test = data[:, -1]
    #plt.plot(x, preds, 'r')
    #plt.plot(x, y_test, 'bo')
    #plt.show()
    score = mape(y_test, preds)
    scores.append(score)

scores = np.array(scores)
table_rows.append(('Mean', 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)

0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Mean,,"-1432,2%",,"2737,2%",,"37,9%",,"-12063,6%",


In [22]:
# Whole ship design progress (weekly)
# Predictors:
# GT
# Algorithms:
# Mean

# Get the data

#get ship data
cursor.execute("SELECT project_id, sum(amount) FROM routed GROUP BY project_id")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
scores = []
for train_i, test_i in lpo.split(ships):
    s1 = ships[test_i[0]]
    s2 = ships[test_i[1]]

    cursor.execute(
        "SELECT predictions.pw, predictions.cables_per_week as preds, test_projects.cables_per_week as reals FROM"
        " (SELECT pw, avg(cable_count) as cables_per_week FROM("
        " SELECT pw, ship_readiness.project_id, sum(amount)/cast(totals.total as float) as cable_count FROM ship_readiness"
        " LEFT JOIN (SELECT project_id, sum(amount) as total FROM routed GROUP BY project_id) totals"
        " ON totals.project_id=ship_readiness.project_id"
        " WHERE pw IS NOT NULL GROUP BY ship_readiness.project_id, totals.total, pw) cable_counts"
        " WHERE cable_counts.project_id <> {} AND cable_counts.project_id <> {}"
        " GROUP BY pw) predictions"
        " LEFT JOIN ("
        " SELECT pw, avg(cable_count) as cables_per_week FROM("
        " SELECT pw, ship_readiness.project_id, sum(amount)/cast(totals.total as float) as cable_count FROM ship_readiness"
        " LEFT JOIN (SELECT project_id, sum(amount) as total FROM routed GROUP BY project_id) totals"
        " ON totals.project_id=ship_readiness.project_id"
        " WHERE pw IS NOT NULL GROUP BY ship_readiness.project_id, totals.total, pw) cable_counts"
        " WHERE cable_counts.project_id = {} OR cable_counts.project_id = {}"
        " GROUP BY pw) test_projects"
        " ON test_projects.pw=predictions.pw"
        " WHERE predictions.cables_per_week IS NOT NULL AND test_projects.cables_per_week IS NOT NULL"
        " ORDER BY pw"
        .format(s1,s2,s1,s2)
        )
    data = np.array(cursor.fetchall()).astype('float32')
    x = data[:, 0]
    preds = data[:, 1]
    y_test = data[:, -1]
    #plt.plot(x, preds, 'r')
    #plt.plot(x, y_test, 'bo')
    #plt.show()
    score = mape(y_test, preds)
    scores.append(score)

scores = np.array(scores)
table_rows.append(('Mean', 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)

0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Mean,,"-1299,7%",,"3105,8%",,"9,3%",,"-16264,5%",


In [23]:
# Whole ship design progress (monthly)
# Predictors:
# GT
# Algorithms:
# Mean

# Get the data

#get ship data
cursor.execute("SELECT project_id, sum(amount) FROM routed GROUP BY project_id")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
scores = []
for train_i, test_i in lpo.split(ships):
    s1 = ships[test_i[0]]
    s2 = ships[test_i[1]]

    cursor.execute(
        "SELECT predictions.pm, predictions.cables_per_week as preds, test_projects.cables_per_week as reals FROM"
        " (SELECT pm, avg(cable_count) as cables_per_week FROM("
        " SELECT pm, ship_readiness.project_id, sum(amount)/cast(totals.total as float) as cable_count FROM ship_readiness"
        " LEFT JOIN (SELECT project_id, sum(amount) as total FROM routed GROUP BY project_id) totals"
        " ON totals.project_id=ship_readiness.project_id"
        " WHERE pm IS NOT NULL GROUP BY ship_readiness.project_id, totals.total, pm) cable_counts"
        " WHERE cable_counts.project_id <> {} AND cable_counts.project_id <> {}"
        " GROUP BY pm) predictions"
        " LEFT JOIN ("
        " SELECT pm, avg(cable_count) as cables_per_week FROM("
        " SELECT pm, ship_readiness.project_id, sum(amount)/cast(totals.total as float) as cable_count FROM ship_readiness"
        " LEFT JOIN (SELECT project_id, sum(amount) as total FROM routed GROUP BY project_id) totals"
        " ON totals.project_id=ship_readiness.project_id"
        " WHERE pm IS NOT NULL GROUP BY ship_readiness.project_id, totals.total, pm) cable_counts"
        " WHERE cable_counts.project_id = {} OR cable_counts.project_id = {}"
        " GROUP BY pm) test_projects"
        " ON test_projects.pm=predictions.pm"
        " WHERE predictions.cables_per_week IS NOT NULL AND test_projects.cables_per_week IS NOT NULL"
        " ORDER BY pm"
        .format(s1,s2,s1,s2)
        )
    data = np.array(cursor.fetchall()).astype('float32')
    x = data[:, 0]
    preds = data[:, 1]
    y_test = data[:, -1]
    #plt.plot(x, preds, 'r')
    #plt.plot(x, y_test, 'bo')
    #plt.show()
    score = mape(y_test, preds)
    scores.append(score)

scores = np.array(scores)
table_rows.append(('Mean', 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)

0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Mean,,"-4115,1%",,"19612,7%",,"48,4%",,"-145312,3%",


In [66]:
# Whole ship cabling progress (monthly)
# Predictors:
# GT
# Algorithms:
# K-NN Regressor

#get ship data
cursor.execute("SELECT project_id, gross_tonnage FROM projects")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]
ship_gt = dict(zip(ships, data[:, 1]))

# init regression models
models = dict()
models['1-NN Regression'] = KNeighborsRegressor(1, weights='distance')
models['3-NN Regression'] = KNeighborsRegressor(3, weights='distance')
#models['5-NN Regression'] = KNeighborsRegressor(5, weights='distance')


# init cable intervals to zero
cursor.execute("UPDATE progress SET y_true=0")
cursor.commit()
# set real cable intervals
cursor.execute(
    "UPDATE progress SET progress.y_true=intervals.cable_count FROM ("
    " SELECT rw, ship_readiness.project_id, sum(amount) as cable_count FROM ship_readiness"
    " LEFT JOIN (SELECT project_id, sum(amount) as total FROM routed GROUP BY project_id) totals"
    " ON totals.project_id=ship_readiness.project_id"
    " WHERE ship_readiness.rw IS NOT NULL GROUP BY ship_readiness.project_id, totals.total, rw) intervals"
    " WHERE progress.project_id=intervals.project_id AND progress.wk=intervals.rw"
)
cursor.commit()

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
for model_name in models.keys():
    model = models[model_name]
    scores = []
    for train_i, test_i in lpo.split(ships):

        X_train = []
        y_train = []
        for i in train_i:
            train_ship = ships[i]
            cursor.execute(
                "SELECT y_true FROM progress WHERE project_id = {}"
                .format(train_ship)
            )
            data = np.array(cursor.fetchall())
            X_train.append(ship_gt[train_ship])
            y_train.append(data[:, 0])
        X_train = np.array(X_train).reshape((-1, 1))
        y_train = np.array(y_train)
        y_train = y_train/y_train.sum(axis=1)[:, None]
        scaler = pp.MinMaxScaler()
        X_train = scaler.fit_transform(X_train) #normalize features

        X_test = []
        y_test = []
        for i in test_i:
            test_ship = ships[i]
            cursor.execute(
                "SELECT y_true FROM progress WHERE project_id = {}"
                .format(test_ship)
            )
            data = np.array(cursor.fetchall())
            X_test.append(ship_gt[train_ship])
            y_test.append(data[:, 0])
        X_test = np.array(X_test).reshape((-1, 1))
        y_test = np.array(y_test)
        y_test = y_test/y_test.sum(axis=1)[:, None]
        X_test = scaler.transform(X_test) #normalize features
        
        # fit the model
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        #plt.plot(y_test[1, :], 'r')
        #plt.plot(preds[1, :], 'b')
        #plt.show()
        # evaluate
        score = np.abs((y_test - preds)).sum()/4.0
        scores.append(score)

    scores = np.array(scores)
    table_rows.append((model_name, 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)

0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
1-NN Regression,,"43,1%",,"15,5%",,"69,2%",,"6,3%",
3-NN Regression,,"42,7%",,"15,3%",,"69,2%",,"6,3%",
