In [1]:
import pyodbc
from meyerDB import cable_connection
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML, display
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeavePOut
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_absolute_error as mae
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
import sklearn.preprocessing as pp

# init db connection
conn = pyodbc.connect(cable_connection)
cursor = conn.cursor()
print('Database connection ok')

def display_table(data):
    html = "<table>"
    for row in data:
        html += "<tr>"
        for field in row:
            try:
                value = str(round(100*field, 1)).replace('.', ',') + '%'
            except:
                value = field
            html += "<td><h4>%s</h4><td>"%(value)
        html += "</tr>"
    html += "</table>"
    display(HTML(html))

Database connection ok


In [11]:
# Whole ship design progress (weekly)
# Predictors:
# Nothing
# Algorithms:
# Mean

#get ship data
cursor.execute("SELECT project_id, gross_tonnage FROM projects")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# set real counts
cursor.execute("UPDATE progress SET y_true=0")
cursor.commit()
cursor.execute(
    " update area_progress set y_true=t2.reals from"
    " (SELECT project_id, area, pw, count(*) as reals from area_dataset"
    " GROUP BY project_id, area, pw ) t2"
    " WHERE area_progress.wk=t2.pw AND area_progress.project_id=t2.project_id AND area_progress.area=t2.area"
)
cursor.commit()
# normalize values
cursor.execute(
    "UPDATE area_progress SET y_true=y_true/ysum"
    " FROM (SELECT sum(y_true) as ysum FROM area_progress GROUP BY project_id, area) sums"
)
cursor.commit()


# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
scores = []
for train_i, test_i in lpo.split(ships):

    s1 = ships[test_i[0]]
    s2 = ships[test_i[1]]

    # set mean predictions
    cursor.execute("UPDATE progress SET pred=0")
    cursor.commit()
    cursor.execute(
        " update area_progress set pred=CASE WHEN avg_count IS NULL THEN 0 ELSE avg_count END from ("
        " SELECT pw, avg(reals) avg_count FROM"
        " (SELECT area, pw, count(*) as reals from area_dataset"
        " WHERE project_id<>{} AND project_id<>{}"
        " GROUP BY project_id, area, pw ) t2  GROUP BY pw) t3"
        " WHERE t3.pw=area_progress.wk"
        .format(s1, s2)
    )
    cursor.commit()
    # normalize values
    cursor.execute(
        "UPDATE area_progress SET pred=pred/predsum"
        " FROM (SELECT sum(pred) as predsum FROM area_progress GROUP BY project_id, area) sums"
    )
    cursor.commit()

    # get data
    cursor.execute(
        "SELECT pred, y_true FROM area_progress"
        " WHERE project_id={} OR project_id={}"
        .format(s1, s2)
    )
    data = np.array(cursor.fetchall()).astype('float32')
    n = data.shape[0]/101 # number of areas
    preds = data[:, 0]
    y_test = data[:, -1]

    # evaluate
    score = np.abs((y_test - preds)).sum()/(2*n)
    scores.append(score)

scores = np.array(scores)
table_rows.append(('Mean', 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)

0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Mean,,"31,7%",,"5,0%",,"39,9%",,"20,7%",


In [17]:
# Whole ship design progress (monthly)
# Predictors:
# Nothing
# Algorithms:
# Mean

#get ship data
cursor.execute("SELECT project_id, gross_tonnage FROM projects")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# set real counts
cursor.execute("UPDATE progress SET y_true=0")
cursor.commit()
cursor.execute(
    " update area_progress set y_true=t2.reals from"
    " (SELECT project_id, area, pm, count(*) as reals from area_dataset"
    " GROUP BY project_id, area, pm ) t2"
    " WHERE area_progress.mt=t2.pm AND area_progress.project_id=t2.project_id AND area_progress.area=t2.area"
)
cursor.commit()
# normalize values
cursor.execute(
    "UPDATE area_progress SET y_true=y_true/ysum"
    " FROM (SELECT sum(y_true) as ysum FROM area_progress GROUP BY project_id, area) sums"
)
cursor.commit()


# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
scores = []
for train_i, test_i in lpo.split(ships):

    s1 = ships[test_i[0]]
    s2 = ships[test_i[1]]

    # set mean predictions
    cursor.execute("UPDATE progress SET pred=0")
    cursor.commit()
    cursor.execute(
        " update area_progress set pred=CASE WHEN avg_count IS NULL THEN 0 ELSE avg_count END from ("
        " SELECT pm, avg(reals) avg_count FROM"
        " (SELECT area, pm, count(*) as reals from area_dataset"
        " WHERE project_id<>{} AND project_id<>{}"
        " GROUP BY project_id, area, pm ) t2  GROUP BY pm) t3"
        " WHERE t3.pm=area_progress.mt"
        .format(s1, s2)
    )
    cursor.commit()
    # normalize values
    cursor.execute(
        "UPDATE area_progress SET pred=pred/predsum"
        " FROM (SELECT sum(pred) as predsum FROM area_progress GROUP BY project_id, area) sums"
    )
    cursor.commit()

    # get data
    cursor.execute(
        "SELECT pred, y_true FROM area_progress"
        " WHERE project_id={} OR project_id={}"
        .format(s1, s2)
    )
    data = np.array(cursor.fetchall()).astype('float32')
    n = data.shape[0]/101 # number of areas
    preds = data[:, 0]
    y_test = data[:, -1]

    # evaluate
    score = np.abs((y_test - preds)).sum()/(2*n)
    scores.append(score)

scores = np.array(scores)
table_rows.append(('Mean', 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)

0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Mean,,"37,9%",,"6,5%",,"47,3%",,"23,5%",


In [19]:
# Whole ship cabling progress (weekly)
# Predictors:
# Nothing
# Algorithms:
# Mean

#get ship data
cursor.execute("SELECT project_id, gross_tonnage FROM projects")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# set real counts
cursor.execute("UPDATE progress SET y_true=0")
cursor.commit()
cursor.execute(
    " update area_progress set y_true=t2.reals from"
    " (SELECT project_id, area, rw, count(*) as reals from area_dataset"
    " WHERE category='Y'"
    " GROUP BY project_id, area, rw ) t2"
    " WHERE area_progress.wk=t2.rw AND area_progress.project_id=t2.project_id AND area_progress.area=t2.area"
)
cursor.commit()
# normalize values
cursor.execute(
    "UPDATE area_progress SET y_true=y_true/ysum"
    " FROM (SELECT sum(y_true) as ysum FROM area_progress GROUP BY project_id, area) sums"
)
cursor.commit()


# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
scores = []
for train_i, test_i in lpo.split(ships):

    s1 = ships[test_i[0]]
    s2 = ships[test_i[1]]

    # set mean predictions
    cursor.execute("UPDATE progress SET pred=0")
    cursor.commit()
    cursor.execute(
        " update area_progress set pred=CASE WHEN avg_count IS NULL THEN 0 ELSE avg_count END from ("
        " SELECT rw, avg(reals) avg_count FROM"
        " (SELECT area, rw, count(*) as reals from area_dataset"
        " WHERE project_id<>{} AND project_id<>{} AND category='Y'"
        " GROUP BY project_id, area, rw ) t2  GROUP BY rw) t3"
        " WHERE t3.rw=area_progress.wk"
        .format(s1, s2)
    )
    cursor.commit()
    # normalize values
    cursor.execute(
        "UPDATE area_progress SET pred=pred/predsum"
        " FROM (SELECT sum(pred) as predsum FROM area_progress GROUP BY project_id, area) sums"
    )
    cursor.commit()

    # get data
    cursor.execute(
        "SELECT pred, y_true FROM area_progress"
        " WHERE project_id={} OR project_id={}"
        .format(s1, s2)
    )
    data = np.array(cursor.fetchall()).astype('float32')
    n = data.shape[0]/101 # number of areas
    preds = data[:, 0]
    y_test = data[:, -1]

    # evaluate
    score = np.abs((y_test - preds)).sum()/(2*n)
    scores.append(score)

scores = np.array(scores)
table_rows.append(('Mean', 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)

0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Mean,,"37,3%",,"4,6%",,"46,6%",,"28,5%",


In [20]:
# Whole ship cabling progress (monthly)
# Predictors:
# Nothing
# Algorithms:
# Mean

#get ship data
cursor.execute("SELECT project_id, gross_tonnage FROM projects")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# set real counts
cursor.execute("UPDATE progress SET y_true=0")
cursor.commit()
cursor.execute(
    " update area_progress set y_true=t2.reals from"
    " (SELECT project_id, area, rm, count(*) as reals from area_dataset"
    " WHERE category='Y'"
    " GROUP BY project_id, area, rm ) t2"
    " WHERE area_progress.mt=t2.rm AND area_progress.project_id=t2.project_id AND area_progress.area=t2.area"
)
cursor.commit()
# normalize values
cursor.execute(
    "UPDATE area_progress SET y_true=y_true/ysum"
    " FROM (SELECT sum(y_true) as ysum FROM area_progress GROUP BY project_id, area) sums"
)
cursor.commit()


# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
scores = []
for train_i, test_i in lpo.split(ships):

    s1 = ships[test_i[0]]
    s2 = ships[test_i[1]]

    # set mean predictions
    cursor.execute("UPDATE progress SET pred=0")
    cursor.commit()
    cursor.execute(
        " update area_progress set pred=CASE WHEN avg_count IS NULL THEN 0 ELSE avg_count END from ("
        " SELECT rm, avg(reals) avg_count FROM"
        " (SELECT area, rm, count(*) as reals from area_dataset"
        " WHERE project_id<>{} AND project_id<>{} AND category='Y'"
        " GROUP BY project_id, area, rm ) t2  GROUP BY rm) t3"
        " WHERE t3.rm=area_progress.mt"
        .format(s1, s2)
    )
    cursor.commit()
    # normalize values
    cursor.execute(
        "UPDATE area_progress SET pred=pred/predsum"
        " FROM (SELECT sum(pred) as predsum FROM area_progress GROUP BY project_id, area) sums"
    )
    cursor.commit()

    # get data
    cursor.execute(
        "SELECT pred, y_true FROM area_progress"
        " WHERE project_id={} OR project_id={}"
        .format(s1, s2)
    )
    data = np.array(cursor.fetchall()).astype('float32')
    n = data.shape[0]/101 # number of areas
    preds = data[:, 0]
    y_test = data[:, -1]

    # evaluate
    score = np.abs((y_test - preds)).sum()/(2*n)
    scores.append(score)

scores = np.array(scores)
table_rows.append(('Mean', 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)

0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Mean,,"42,1%",,"5,1%",,"51,2%",,"31,0%",
