In [2]:
import pyodbc
from meyerDB import cable_connection
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML, display
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeavePOut
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
import sklearn.preprocessing as pp
from sklearn.model_selection import GridSearchCV

# init db connection
conn = pyodbc.connect(cable_connection)
cursor = conn.cursor()
print('Database connection ok')

def display_table(data):
    html = "<table>"
    for row in data:
        html += "<tr>"
        for field in row:
            try:
                value = str(round(100*field, 1)).replace('.', ',') + '%'
            except:
                value = field
            html += "<td><h4>%s</h4><td>"%(value)
        html += "</tr>"
    html += "</table>"
    display(HTML(html))

Database connection ok


In [3]:
# Whole ship cable quantities
# Predictors:
# GT
# Algorithms:
# Linear regression
# KNN regression
# Mean

# Get the data
cursor.execute("SELECT gross_tonnage, sum(amount) FROM routed as r"
    " LEFT JOIN projects as p ON p.project_id=r.project_id"
    " GROUP BY r.project_id, gross_tonnage")
data = np.array(cursor.fetchall()).astype('float32')
print('Data array: ', data.shape)
X = data[:, 0].reshape((-1, 1))
print('Features', X.shape)
y = data[:, -1]


# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# init regression model
models = dict()
models['Linear regression'] = linear_model.LinearRegression()
models['NN Regression'] = KNeighborsRegressor()
models['Mean'] = DummyRegressor(strategy='mean')

scaler = pp.MinMaxScaler()

# train and evaluate all models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
for model_name in models.keys():
    model = models[model_name]
    scores = []
    for train_i, test_i in lpo.split(X):
        if model_name == 'NN Regression':
            clf = GridSearchCV(model, {'n_neighbors':[1, 3, 5], 'weights': ['uniform', 'distance']})
            X_train = scaler.fit_transform(X[train_i])
            clf.fit(X_train, y[train_i])
            #print(clf.best_params_)
            X_test = scaler.transform(X[test_i]) #normalize features
            preds = clf.predict(X_test)
            scores.append(mape(y[test_i], preds))
        else:
            model.fit(X[train_i], y[train_i])
            preds = model.predict(X[test_i])
            scores.append(mape(y[test_i], preds))
    scores = np.array(scores)
    table_rows.append((model_name, scores.mean(), scores.std(), scores.min(), scores.max()))
display_table(table_rows)


Data array:  (11, 2)
Features (11, 1)




0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Linear regression,,"7,1%",,"5,7%",,"1,2%",,"20,5%",
NN Regression,,"4,8%",,"15,2%",,"0,4%",,"115,8%",
Mean,,"56,1%",,"41,0%",,"12,0%",,"196,3%",


In [41]:
# Whole ship cable quantities
# Predictors:
# Ship squares
# Algorithms:
# Linear regression
# KNN regression

# Get the data
cursor.execute("SELECT ship_squares , sum(amount) FROM routed as r"
    " LEFT JOIN (SELECT project_id, sum(squares) as ship_squares FROM areas GROUP BY project_id) as a ON a.project_id=r.project_id"
    " GROUP BY r.project_id, ship_squares")
data = np.array(cursor.fetchall()).astype('float32')
print('Data array: ', data.shape)
X = data[:, 0].reshape((-1, 1))
print('Features', X.shape)
y = data[:, -1]


# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# init regression model
models = dict()
models['Linear regression'] = linear_model.LinearRegression()
models['NN Regression'] = KNeighborsRegressor()

scaler = pp.MinMaxScaler()

# train and evaluate all models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
for model_name in models.keys():
    model = models[model_name]
    scores = []
    for train_i, test_i in lpo.split(X):
        if model_name == 'NN Regression':
            clf = GridSearchCV(model, {'n_neighbors':[1, 3, 5], 'weights': ['uniform', 'distance']})
            X_train = scaler.fit_transform(X[train_i])
            clf.fit(X_train, y[train_i])
            #print(clf.best_params_)
            X_test = scaler.transform(X[test_i]) #normalize features
            preds = clf.predict(X_test)
            scores.append(mape(y[test_i], preds))
        else:
            model.fit(X[train_i], y[train_i])
            preds = model.predict(X[test_i])
            scores.append(mape(y[test_i], preds))
    scores = np.array(scores)
    table_rows.append((model_name, 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)



Data array:  (11, 2)
Features (11, 1)




0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Linear regression,,"89,0%",,"11,3%",,"99,3%",,"49,8%",
NN Regression,,"94,9%",,"14,3%",,"99,6%",,"-9,1%",


In [44]:
# Whole ship cable quantities
# Predictors:
# GT
# Ship squares
# Algorithms:
# Linear regression
# KNN regression


# Get the data
cursor.execute("SELECT gross_tonnage, ship_squares , sum(amount) FROM routed as r"
    " LEFT JOIN (SELECT project_id, sum(squares) as ship_squares FROM areas GROUP BY project_id) as a ON a.project_id=r.project_id"
    " LEFT JOIN projects as p ON p.project_id=r.project_id"
    " GROUP BY r.project_id, ship_squares, gross_tonnage")
data = np.array(cursor.fetchall()).astype('float32')
print('Data array: ', data.shape)
X = data[:, [0, 1]]
print('Features', X.shape)
y = data[:, -1]


# init regression model
models = dict()
models['Linear regression'] = linear_model.LinearRegression()
models['NN Regression'] = KNeighborsRegressor()

scaler = pp.MinMaxScaler()

# train and evaluate all models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
for model_name in models.keys():
    model = models[model_name]
    scores = []
    for train_i, test_i in lpo.split(X):
        if model_name == 'NN Regression':
            clf = GridSearchCV(model, {'n_neighbors':[1, 3, 5], 'weights': ['uniform', 'distance']})
            X_train = scaler.fit_transform(X[train_i])
            clf.fit(X_train, y[train_i])
            #print(clf.best_params_)
            X_test = scaler.transform(X[test_i]) #normalize features
            preds = clf.predict(X_test)
            scores.append(mape(y[test_i], preds))
        else:
            model.fit(X[train_i], y[train_i])
            preds = model.predict(X[test_i])
            scores.append(mape(y[test_i], preds))
    scores = np.array(scores)
    table_rows.append((model_name, 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)


Data array:  (11, 3)
Features (11, 2)




0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Linear regression,,"93,6%",,"3,7%",,"98,9%",,"81,8%",
NN Regression,,"94,8%",,"15,2%",,"99,6%",,"-15,8%",


In [4]:
# Whole ship cable quantities
# Predictors:
# Area type (and list of areas in the target ship)
# Algorithms:
# Mean

# Get the data
cursor.execute("SELECT DISTINCT project_id FROM routed")
data = np.array(cursor.fetchall()).astype('int32')
X = data[:, 0]
# init lpo split
p = 2 #ships
lpo = LeavePOut(p)
# train and evaluate model with lpo
scores = []
for train_i, test_i in lpo.split(X):
    s1 = X[test_i[0]]
    s2 = X[test_i[1]]
    cursor.execute(
        "SELECT predictions.cable_count as pred, sum(amount) as real_count FROM ("
        " SELECT project_id, sum(avg_count) as cable_count FROM areas LEFT JOIN ("
        " SELECT area_type, avg(area_cables) as avg_count FROM ("
        " SELECT project_id, area, sum(amount)/2 as area_cables FROM ("
        " SELECT project_id, start_area as area, sum(amount) as amount FROM routed"
        " WHERE project_id <> {} AND project_id <> {}"
        " GROUP BY start_area, project_id"
        " UNION"
        " SELECT project_id, end_area as area, sum(amount) as amount FROM routed"
        " WHERE project_id <> {} AND project_id <> {}"
        " GROUP BY end_area, project_id"
        " ) area_union GROUP BY project_id, area"
        " ) t_sums LEFT JOIN areas ON areas.project_id=t_sums.project_id AND areas.area=t_sums.area"
        " GROUP BY area_type"
        " ) avg_counts ON avg_counts.area_type=areas.area_type"
        " WHERE project_id={} OR project_id={}"
        " GROUP BY project_id"
        " ) predictions LEFT JOIN routed ON predictions.project_id=routed.project_id"
        " GROUP BY predictions.project_id, predictions.cable_count"
        .format(s1,s2,s1,s2,s1,s2)
    )
    data = np.array(cursor.fetchall())
    preds = data[:, 0]
    y_test = data[:, 1]
    scores.append(mape(y_test, preds))
    
scores = np.array(scores)
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
table_rows.append(('Mean', scores.mean(), scores.std(), scores.min(), scores.max()))
display_table(table_rows)



0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Mean,,"13,2%",,"9,3%",,"1,3%",,"35,6%",


In [49]:
# Whole ship cable quantities
# Predictors:
# Area squares
# Area type (and list of areas in the target ship)
# Algorithms:
# Linear regression

# Get the data

# get area types
cursor.execute("SELECT DISTINCT area_type FROM areas")
area_types = [item for t in cursor.fetchall() for item in t]

#get ship data
cursor.execute("SELECT project_id FROM routed GROUP BY project_id")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]

# init regression model
model = linear_model.LinearRegression()

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)
# train and evaluate model with lpo
scores = []
for train_i, test_i in lpo.split(ships):
    # update area squares linear model coefficients and predictions
    cursor.execute("UPDATE areas SET pred_count = NULL")
    cursor.commit()
    s1 = ships[test_i[0]]
    s2 = ships[test_i[1]]
    for area_type in area_types:
        cursor.execute(
            " SELECT squares, cable_count FROM ("
            " SELECT area_union.project_id, area_union.area, area_type, squares, sum(amount)/2 as cable_count FROM ("
            " SELECT project_id, start_area as area, sum(amount) as amount FROM routed"
            " WHERE project_id <> {} AND project_id <> {}"
            " GROUP BY start_area, project_id"
            " UNION"
            " SELECT project_id, end_area as area, sum(amount) as amount FROM routed"
            " WHERE project_id <> {} AND project_id <> {}"
            " GROUP BY end_area, project_id"
            " ) area_union LEFT JOIN areas ON areas.project_id=area_union.project_id AND areas.area=area_union.area"
            " GROUP BY area_union.project_id, area_union.area, area_type, squares"
            " ) cable_counts WHERE area_type='{}' AND squares IS NOT NULL"
            .format(s1,s2,s1,s2,area_type)
        )
        data = np.array(cursor.fetchall()).astype('float32')
        if data.shape[0] == 0:
            continue
        X = data[:, 0].reshape((-1, 1))
        y = data[:, 1]
        model.fit(X, y)
        cursor.execute("UPDATE areas SET pred_count=squares*{} + {} WHERE area_type='{}'".format(model.coef_[0], model.intercept_, area_type))
        cursor.commit()
    # calculate ship wide predictions
    cursor.execute(
        "SELECT sum(pred_count) as preds, real_count FROM areas"
        " LEFT JOIN (SELECT project_id, sum(amount) as real_count FROM routed GROUP BY project_id) as real_counts ON real_counts.project_id=areas.project_id"
        " WHERE areas.project_id={} OR areas.project_id={} GROUP BY areas.project_id, real_count"
        .format(s1, s2)
        )
    data = np.array(cursor.fetchall()).astype('float32')
    preds = data[:, 0]
    y_test = data[:, 1]
    scores.append(mape(y_test, preds))

scores = np.array(scores)
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
table_rows.append(('Linear regression', 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)



0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Linear regression,,"90,3%",,"6,7%",,"98,8%",,"70,9%",


In [3]:
# Whole ship cable quantities
# Predictors:
# Area squares
# GT
# Area type (and list of areas in the target ship)
# Algorithms:
# Linear regression

# Get the data

# get area types
cursor.execute("SELECT DISTINCT area_type FROM areas")
area_types = [item for t in cursor.fetchall() for item in t]

#get ship data
cursor.execute("SELECT project_id FROM routed GROUP BY project_id")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]

# init regression model
model = linear_model.LinearRegression()


# init lpo split
p = 2 #ships
lpo = LeavePOut(p)
# train and evaluate model with lpo
scores = []
for train_i, test_i in lpo.split(ships):
    # update area squares linear model coefficients and predictions
    cursor.execute("UPDATE areas SET pred_count = NULL")
    cursor.commit()
    s1 = ships[test_i[0]]
    s2 = ships[test_i[1]]
    for area_type in area_types:
        cursor.execute(
            " SELECT gross_tonnage, squares, cable_count FROM ("
            " SELECT area_union.project_id as proj, area_union.area, area_type, squares, sum(amount)/2 as cable_count FROM ("
            " SELECT project_id, start_area as area, sum(amount) as amount FROM routed"
            " WHERE project_id <> {} AND project_id <> {}"
            " GROUP BY start_area, project_id"
            " UNION"
            " SELECT project_id, end_area as area, sum(amount) as amount FROM routed"
            " WHERE project_id <> {} AND project_id <> {}"
            " GROUP BY end_area, project_id"
            " ) area_union LEFT JOIN areas ON areas.project_id=area_union.project_id AND areas.area=area_union.area"
            " GROUP BY area_union.project_id, area_union.area, area_type, squares"
            " ) cable_counts LEFT JOIN projects ON projects.project_id=proj"
            " WHERE area_type='{}' AND squares IS NOT NULL"
            .format(s1,s2,s1,s2,area_type)
        )
        data = np.array(cursor.fetchall()).astype('float32')
        if data.shape[0] == 0:
            continue
        X = data[:, [0, 1]]
        y = data[:, -1]
        model.fit(X, y)
        cursor.execute(
            "UPDATE areas SET areas.pred_count=projects.gross_tonnage*{} + areas.squares*{} + {}"
            " FROM projects"
            " WHERE areas.project_id=projects.project_id AND areas.area_type='{}'"
            .format(model.coef_[0], model.coef_[1], model.intercept_, area_type)
            )
        cursor.commit()
    # calculate ship wide predictions
    cursor.execute(
        "SELECT sum(pred_count) as preds, real_count FROM areas"
        " LEFT JOIN (SELECT project_id, sum(amount) as real_count FROM routed GROUP BY project_id) as real_counts ON real_counts.project_id=areas.project_id"
        " WHERE areas.project_id={} OR areas.project_id={} GROUP BY areas.project_id, real_count"
        .format(s1, s2)
        )
    data = np.array(cursor.fetchall()).astype('float32')
    preds = data[:, 0]
    y_test = data[:, 1]
    scores.append(mape(y_test, preds))

scores = np.array(scores)
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
table_rows.append(('Linear regression', 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)



0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Linear regression,,"93,7%",,"5,1%",,"99,5%",,"80,6%",


In [47]:
# Whole ship cable quantities
# Predictors:
# Area squares
# Area type (and list of areas in the target ship)
# Algorithms:
# K-NN Regression

# Get the data

# get area types
cursor.execute("SELECT DISTINCT area_type FROM areas")
area_types = [item for t in cursor.fetchall() for item in t]

#get ship data
cursor.execute("SELECT project_id, sum(amount) FROM routed GROUP BY project_id")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]
values = data[:, 1]
zips = zip(ships, values)
cable_counts = dict(zips)

# init regression models
models = dict()
models['KNN Regression'] = KNeighborsRegressor()


# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
for model_name in models.keys():
    model = models[model_name]
    scores = []
    for train_i, test_i in lpo.split(ships):
        # update area squares for training data
        cursor.execute("UPDATE areas SET pred_count = NULL")
        cursor.commit()
        s1 = ships[test_i[0]]
        s2 = ships[test_i[1]]
        pred_counts = {s1:0, s2:0}
        for area_type in area_types:
            cursor.execute(
                " SELECT squares, cable_count FROM ("
                " SELECT area_union.project_id, area_union.area, area_type, squares, sum(amount)/2 as cable_count FROM ("
                " SELECT project_id, start_area as area, sum(amount) as amount FROM routed"
                " WHERE project_id <> {} AND project_id <> {}"
                " GROUP BY start_area, project_id"
                " UNION"
                " SELECT project_id, end_area as area, sum(amount) as amount FROM routed"
                " WHERE project_id <> {} AND project_id <> {}"
                " GROUP BY end_area, project_id"
                " ) area_union LEFT JOIN areas ON areas.project_id=area_union.project_id AND areas.area=area_union.area"
                " GROUP BY area_union.project_id, area_union.area, area_type, squares"
                " ) cable_counts WHERE area_type='{}' AND squares IS NOT NULL"
                .format(s1,s2,s1,s2,area_type)
            )
            data = np.array(cursor.fetchall()).astype('float32')
            if data.shape[0] == 0:
                continue
            X_train = data[:, 0].reshape((-1, 1))
            y_train = data[:, 1]

            # normalize features
            scaler = pp.MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            if X_train.shape[0] < 5:
                continue
            clf = GridSearchCV(model, {'n_neighbors':[1, 3, 5], 'weights': ['uniform', 'distance']})
            clf.fit(X_train, y_train)
            # calculate ship predictions
            for ship in [s1, s2]:
                cursor.execute("SELECT squares FROM areas WHERE area_type='{}' AND project_id={} AND squares IS NOT NULL".format(area_type, ship))
                data = np.array(cursor.fetchall()).astype('float32')
                if data.shape[0] < clf.best_estimator_.n_neighbors: #check if enough samples
                    continue
                X_test = data[:, 0].reshape(-1, 1)
                X_test = scaler.transform(X_test) #normalize features
                preds = clf.predict(X_test)
                pred_counts[ship] += preds.sum()

        # calculate ship wide predictions and cv score
        y_test = []
        preds = []
        for ship in [s1, s2]:
            y_test.append(cable_counts[ship])
            preds.append(pred_counts[ship])
        score = mape(y_test, preds)
        scores.append(score)

    scores = np.array(scores)
    table_rows.append((model_name, 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)



Traceback (most recent call last):
  File "C:\Users\sainmi53\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\sainmi53\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 397, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "C:\Users\sainmi53\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 553, in score
    y_pred = self.predict(X)
  File "C:\Users\sainmi53\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\neighbors\_regression.py", line 208, in predict
    neigh_dist, neigh_ind = self.kneighbors(X)
  File "C:\Users\sainmi53\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\neighbors\_base.py", line 680, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_samples = 4, n_neighbors = 5

Tracebac

0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
KNN Regression,,"86,1%",,"5,4%",,"94,7%",,"75,4%",


In [7]:
# Whole ship cable quantities
# Predictors:
# GT
# Area squares
# Area type (and list of areas in the target ship)
# Algorithms:
# K-NN Regression

# Get the data

# get area types
cursor.execute("SELECT DISTINCT area_type FROM areas")
area_types = [item for t in cursor.fetchall() for item in t]

#get ship data
cursor.execute("SELECT project_id, sum(amount) FROM routed GROUP BY project_id")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]
values = data[:, 1]
zips = zip(ships, values)
cable_counts = dict(zips)

# init regression models
models = dict()
models['1-NN Regression'] = KNeighborsRegressor(1, weights='distance')
models['3-NN Regression'] = KNeighborsRegressor(3, weights='distance')

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
for model_name in models.keys():
    model = models[model_name]
    scores = []
    for train_i, test_i in lpo.split(ships):
        # update area squares for training data
        cursor.execute("UPDATE areas SET pred_count = NULL")
        cursor.commit()
        s1 = ships[test_i[0]]
        s2 = ships[test_i[1]]
        pred_counts = {s1:0, s2:0}
        for area_type in area_types:
            cursor.execute(
                " SELECT gross_tonnage, squares, cable_count FROM ("
                " SELECT area_union.project_id, area_union.area, area_type, squares, sum(amount)/2 as cable_count FROM ("
                " SELECT project_id, start_area as area, sum(amount) as amount FROM routed"
                " WHERE project_id <> {} AND project_id <> {}"
                " GROUP BY start_area, project_id"
                " UNION"
                " SELECT project_id, end_area as area, sum(amount) as amount FROM routed"
                " WHERE project_id <> {} AND project_id <> {}"
                " GROUP BY end_area, project_id"
                " ) area_union LEFT JOIN areas ON areas.project_id=area_union.project_id AND areas.area=area_union.area"
                " GROUP BY area_union.project_id, area_union.area, area_type, squares"
                " ) cable_counts "
                " LEFT JOIN projects ON projects.project_id=cable_counts.project_id"
                " WHERE area_type='{}' AND squares IS NOT NULL"
                .format(s1,s2,s1,s2,area_type)
            )
            data = np.array(cursor.fetchall()).astype('float32')
            if data.shape[0] == 0:
                continue
            X_train = data[:, [0, 1]]
            y_train = data[:, -1]

            # normalize features
            scaler = pp.MinMaxScaler()
            X_train = scaler.fit_transform(X_train)

            model.fit(X_train, y_train)
            # calculate ship predictions
            for ship in [s1, s2]:
                cursor.execute(
                    "SELECT gross_tonnage, squares FROM areas "
                    " LEFT JOIN projects ON projects.project_id=areas.project_id"
                    " WHERE area_type='{}' AND areas.project_id={} AND squares IS NOT NULL"
                    .format(area_type, ship)
                    )
                data = np.array(cursor.fetchall()).astype('float32')
                if data.shape[0] < model.n_neighbors: #check if enough samples
                    continue
                X_test = data[:, [0, 1]]
                X_test = scaler.transform(X_test) #normalize features
                preds = model.predict(X_test)
                pred_counts[ship] += preds.sum()

        # calculate ship wide predictions and cv score
        y_test = []
        preds = []
        for ship in [s1, s2]:
            y_test.append(cable_counts[ship])
            preds.append(pred_counts[ship])
        score = mape(y_test, preds)
        scores.append(score)

    scores = np.array(scores)
    table_rows.append((model_name, 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)



0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
1-NN Regression,,"8,7%",,"0,8%",,"9,5%",,"6,7%",
3-NN Regression,,"8,3%",,"0,8%",,"9,2%",,"6,3%",


In [3]:
# Whole ship cable quantities (pcs, for comparison)
# Predictors:
# GT
# Algorithms:
# Linear regression

# Get the data
cursor.execute("SELECT gross_tonnage, count(*) FROM routed as r"
    " LEFT JOIN projects as p ON p.project_id=r.project_id"
    " GROUP BY r.project_id, gross_tonnage")
data = np.array(cursor.fetchall()).astype('float32')
print('Data array: ', data.shape)
X = data[:, 0].reshape((-1, 1))
print('Features', X.shape)
y = data[:, -1]


# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# init regression model
models = dict()
models['Linear regression'] = linear_model.LinearRegression()

# train and evaluate all models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
for model_name in models.keys():
    model = models[model_name]
    scores = []
    for train_i, test_i in lpo.split(X):
        model.fit(X[train_i], y[train_i])
        preds = model.predict(X[test_i])
        scores.append(mape(y[test_i], preds))
    scores = np.array(scores)
    table_rows.append((model_name, 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)


Data array:  (11, 2)
Features (11, 1)


0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
Linear regression,,"91,7%",,"5,5%",,"99,4%",,"73,0%",


In [4]:
# Whole ship cable quantities (pcs, for comparison)
# Predictors:
# Area squares
# Area type (and list of areas in the target ship)
# Algorithms:
# K-NN Regression

# Get the data

# get area types
cursor.execute("SELECT DISTINCT area_type FROM areas")
area_types = [item for t in cursor.fetchall() for item in t]

#get ship data
cursor.execute("SELECT project_id, count(*) FROM routed GROUP BY project_id")
data = np.array(cursor.fetchall()).astype('int32')
ships = data[:, 0]
values = data[:, 1]
zips = zip(ships, values)
cable_counts = dict(zips)

# init regression models
models = dict()
models['1-NN Regression'] = KNeighborsRegressor(1, weights='distance')
#models['3-NN Regression'] = KNeighborsRegressor(3, weights='distance')

# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# train and evaluate models with lpo
table_rows = [['Model', 'mean', 'std', 'max', 'min']]
for model_name in models.keys():
    model = models[model_name]
    scores = []
    for train_i, test_i in lpo.split(ships):
        # update area squares for training data
        cursor.execute("UPDATE areas SET pred_count = NULL")
        cursor.commit()
        s1 = ships[test_i[0]]
        s2 = ships[test_i[1]]
        pred_counts = {s1:0, s2:0}
        for area_type in area_types:
            cursor.execute(
                " SELECT squares, cable_count FROM ("
                " SELECT area_union.project_id, area_union.area, area_type, squares, sum(amount)/2 as cable_count FROM ("
                " SELECT project_id, start_area as area, count(*) as amount FROM routed"
                " WHERE project_id <> {} AND project_id <> {}"
                " GROUP BY start_area, project_id"
                " UNION"
                " SELECT project_id, end_area as area, count(*) as amount FROM routed"
                " WHERE project_id <> {} AND project_id <> {}"
                " GROUP BY end_area, project_id"
                " ) area_union LEFT JOIN areas ON areas.project_id=area_union.project_id AND areas.area=area_union.area"
                " GROUP BY area_union.project_id, area_union.area, area_type, squares"
                " ) cable_counts WHERE area_type='{}' AND squares IS NOT NULL"
                .format(s1,s2,s1,s2,area_type)
            )
            data = np.array(cursor.fetchall()).astype('float32')
            if data.shape[0] == 0:
                continue
            X_train = data[:, 0].reshape((-1, 1))
            y_train = data[:, 1]

            # normalize features
            scaler = pp.MinMaxScaler()
            X_train = scaler.fit_transform(X_train)

            model.fit(X_train, y_train)
            # calculate ship predictions
            for ship in [s1, s2]:
                cursor.execute("SELECT squares FROM areas WHERE area_type='{}' AND project_id={} AND squares IS NOT NULL".format(area_type, ship))
                data = np.array(cursor.fetchall()).astype('float32')
                if data.shape[0] < model.n_neighbors: #check if enough samples
                    continue
                X_test = data[:, 0].reshape(-1, 1)
                X_test = scaler.transform(X_test) #normalize features
                preds = model.predict(X_test)
                pred_counts[ship] += preds.sum()

        # calculate ship wide predictions and cv score
        y_test = []
        preds = []
        for ship in [s1, s2]:
            y_test.append(cable_counts[ship])
            preds.append(pred_counts[ship])
        score = mape(y_test, preds)
        scores.append(score)

    scores = np.array(scores)
    table_rows.append((model_name, 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)


0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,max,,min,
1-NN Regression,,"94,4%",,"3,3%",,"98,7%",,"86,1%",
