In [1]:
import pyodbc
from meyerDB import cable_connection
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML, display
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeavePOut
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
import sklearn.preprocessing as pp
from sklearn.model_selection import GridSearchCV
from copy import deepcopy

# init db connection
# cable_connection is imported from the project's meyerDB module and is handed
# to pyodbc.connect unchanged.
conn = pyodbc.connect(cable_connection)
# the cursor is shared: every query cell below calls cursor.execute on it
cursor = conn.cursor()
print('Database connection ok')

def display_table(data):
    """Render rows of values as an HTML table in the notebook output.

    Every field is shown as a percentage with one decimal and a decimal
    comma (0.25 -> "25,0%"). Fields that cannot be scaled and rounded, such
    as the header strings and model names, are shown unchanged.

    Args:
        data: iterable of rows, each row being an iterable of fields.
    """
    parts = ["<table>"]
    for row in data:
        parts.append("<tr>")
        for field in row:
            try:
                value = str(round(100*field, 1)).replace('.', ',') + '%'
            except (TypeError, ValueError):
                # non-numeric field (e.g. a column title): display it as is
                value = field
            # each cell must be closed with </td>; a second opening <td>
            # makes the browser render an empty extra column after every value
            parts.append("<td><h4>%s</h4></td>" % (value,))
        parts.append("</tr>")
    parts.append("</table>")
    display(HTML("".join(parts)))

Database connection ok


In [2]:
# Whole ship cable quantities
# Predictors:
# GT
# Algorithms:
# Linear regression
# KNN regression
# Mean

# Get the data: one row per project (gross_tonnage, summed amount)
cursor.execute("SELECT gross_tonnage, sum(amount) FROM routed as r"
    " LEFT JOIN projects as p ON p.project_id=r.project_id"
    " GROUP BY r.project_id, gross_tonnage")
data = np.array(cursor.fetchall()).astype('float32')
print('Data array: ', data.shape)
X = data[:, 0].reshape((-1, 1))
print('Features', X.shape)
y = data[:, -1]


# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# init regression model
models = dict()
models['Linear regression'] = linear_model.LinearRegression()
models['KNN Regression'] = KNeighborsRegressor()
models['Mean'] = DummyRegressor(strategy='mean')

scaler = pp.MinMaxScaler()

# train and evaluate all models with lpo
table_rows = [['Model', 'mean', 'std', 'min', 'max']]

for model_name, model in models.items():
    scores = []
    for train_i, test_i in lpo.split(X):
        if model_name == 'KNN Regression':
            # KNN is tuned per split and gets min-max scaled features; the
            # scaler is fitted on the training rows only and then applied to
            # the held-out rows. The scaler returns new arrays, so no copy
            # is needed before fitting/predicting.
            clf = GridSearchCV(model, {'n_neighbors':[1, 3, 5], 'weights': ['uniform', 'distance']}, cv=3)
            X_train = scaler.fit_transform(X[train_i])
            clf.fit(X_train, y[train_i])
            X_test = scaler.transform(X[test_i]) #normalize features
            preds = clf.predict(X_test)
        else:
            model.fit(X[train_i], y[train_i])
            preds = model.predict(X[test_i])
        scores.append(mape(y[test_i], preds))
    scores = np.array(scores)
    stats = (scores.mean(), scores.std(), scores.min(), scores.max())
    table_rows.append((model_name, *stats))
    # the 'Mean' baseline is logged with '-' in the predictor column
    predictor = '-' if model_name == 'Mean' else 'GT'
    line = '{};Whole ship cable quantity;m;{};{};{};{};{}\n'.format(predictor, model_name, *stats)
    # append mode: every run of this cell adds new rows to the results file
    with open("static_whole_ship_cable_quantities.txt", "a") as file_object:
        file_object.write(line)
display_table(table_rows)


Data array:  (11, 2)
Features (11, 1)


0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,min,,max,
Linear regression,,"7,1%",,"5,7%",,"1,2%",,"20,5%",
KNN Regression,,"7,4%",,"16,2%",,"0,4%",,"115,8%",
Mean,,"56,1%",,"41,0%",,"12,0%",,"196,3%",


In [3]:
# Whole ship cable quantities
# Predictors:
# Ship squares
# Algorithms:
# Linear regression
# KNN regression

# Get the data: one row per project (ship_squares, summed amount)
cursor.execute(
    "SELECT ship_squares , sum(amount) FROM routed as r"
    " LEFT JOIN (SELECT project_id, sum(squares) as ship_squares FROM areas GROUP BY project_id) as a ON a.project_id=r.project_id"
    " GROUP BY r.project_id, ship_squares"
)
data = np.array(cursor.fetchall()).astype('float32')
print('Data array: ', data.shape)
X = data[:, 0].reshape((-1, 1))
print('Features', X.shape)
y = data[:, -1]


# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# init regression model
models = {
    'Linear regression': linear_model.LinearRegression(),
    'KNN Regression': KNeighborsRegressor(),
}

scaler = pp.MinMaxScaler()

# train and evaluate all models with lpo
table_rows = [['Model', 'mean', 'std', 'min', 'max']]
for model_name, model in models.items():
    scores = []
    for train_idx, test_idx in lpo.split(X):
        X_fit, X_eval = X[train_idx], X[test_idx]
        if model_name == 'KNN Regression':
            # hyper-parameters are searched per split on scaled features;
            # the scaler is fitted on the training rows only
            clf = GridSearchCV(model, {'n_neighbors':[1, 3, 5], 'weights': ['uniform', 'distance']}, cv=3)
            clf.fit(scaler.fit_transform(X_fit), y[train_idx])
            predicted = clf.predict(scaler.transform(X_eval))
        else:
            model.fit(X_fit, y[train_idx])
            predicted = model.predict(X_eval)
        scores.append(mape(y[test_idx], predicted))
    scores = np.array(scores)
    summary = (scores.mean(), scores.std(), scores.min(), scores.max())
    table_rows.append((model_name,) + summary)
    # rows are appended to the shared results file
    with open("static_whole_ship_cable_quantities.txt", "a") as out_file:
        out_file.write('Ship squares;Whole ship cable quantity;m;{};{};{};{};{}\n'.format(model_name, *summary))
display_table(table_rows)



Data array:  (11, 2)
Features (11, 1)


0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,min,,max,
Linear regression,,"11,0%",,"11,3%",,"0,7%",,"50,2%",
KNN Regression,,"6,7%",,"15,4%",,"0,4%",,"109,1%",


In [4]:
# Whole ship cable quantities
# Predictors:
# GT
# Ship squares
# Algorithms:
# Linear regression
# KNN regression


# Get the data: one row per project (gross_tonnage, ship_squares, summed amount)
cursor.execute("SELECT gross_tonnage, ship_squares , sum(amount) FROM routed as r"
    " LEFT JOIN (SELECT project_id, sum(squares) as ship_squares FROM areas GROUP BY project_id) as a ON a.project_id=r.project_id"
    " LEFT JOIN projects as p ON p.project_id=r.project_id"
    " GROUP BY r.project_id, ship_squares, gross_tonnage")
data = np.array(cursor.fetchall()).astype('float32')
print('Data array: ', data.shape)
X = data[:, [0, 1]]
print('Features', X.shape)
y = data[:, -1]


# init lpo split
# defined here, as in the other evaluation cells, so this cell does not
# depend on an `lpo` object left in the kernel by a previously run cell
p = 2 #ships
lpo = LeavePOut(p)

# init regression model
models = dict()
models['Linear regression'] = linear_model.LinearRegression()
models['KNN Regression'] = KNeighborsRegressor()

scaler = pp.MinMaxScaler()

# train and evaluate all models with lpo
table_rows = [['Model', 'mean', 'std', 'min', 'max']]
for model_name in models.keys():
    model = models[model_name]
    scores = []
    for train_i, test_i in lpo.split(X):
        if model_name == 'KNN Regression':
            # KNN is tuned per split on min-max scaled features; the scaler
            # is fitted on the training rows only
            clf = GridSearchCV(model, {'n_neighbors':[1, 3, 5], 'weights': ['uniform', 'distance']}, cv=3)
            X_train = scaler.fit_transform(X[train_i])
            clf.fit(X_train, y[train_i])
            X_test = scaler.transform(X[test_i]) #normalize features
            preds = clf.predict(X_test)
            scores.append(mape(y[test_i], preds))
        else:
            model.fit(X[train_i], y[train_i])
            preds = model.predict(X[test_i])
            scores.append(mape(y[test_i], preds))
    scores = np.array(scores)
    table_rows.append((model_name, scores.mean(), scores.std(), scores.min(), scores.max()))
    # append mode: every run of this cell adds new rows to the results file
    with open("static_whole_ship_cable_quantities.txt", "a") as file_object:
        line = 'GT, Ship squares;Whole ship cable quantity;m;{};{};{};{};{}\n'.format(model_name, scores.mean(), scores.std(), scores.min(), scores.max())
        file_object.write(line)
display_table(table_rows)


Data array:  (11, 3)
Features (11, 2)


0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,min,,max,
Linear regression,,"6,4%",,"3,7%",,"1,1%",,"18,2%",
KNN Regression,,"9,2%",,"16,4%",,"0,4%",,"115,2%",


In [11]:
# Whole ship cable quantities (pcs, for comparison)
# Predictors:
# GT
# Algorithms:
# Linear regression

# Get the data: one row per project (gross_tonnage, number of routed rows)
cursor.execute(
    "SELECT gross_tonnage, count(*) FROM routed as r"
    " LEFT JOIN projects as p ON p.project_id=r.project_id"
    " GROUP BY r.project_id, gross_tonnage"
)
data = np.array(cursor.fetchall()).astype('float32')
print('Data array: ', data.shape)
X = data[:, 0].reshape((-1, 1))
print('Features', X.shape)
y = data[:, -1]


# init lpo split
p = 2 #ships
lpo = LeavePOut(p)

# init regression model
models = {'Linear regression': linear_model.LinearRegression()}

# train and evaluate all models with lpo
table_rows = [['Model', 'mean', 'std', 'min', 'max']]
for model_name, model in models.items():
    scores = []
    for train_idx, test_idx in lpo.split(X):
        model.fit(X[train_idx], y[train_idx])
        predicted = model.predict(X[test_idx])
        scores.append(mape(y[test_idx], predicted))
    scores = np.array(scores)
    summary = (scores.mean(), scores.std(), scores.min(), scores.max())
    table_rows.append((model_name,) + summary)
    # rows are appended to the shared results file
    with open("static_whole_ship_cable_quantities.txt", "a") as out_file:
        out_file.write('GT;Whole ship cable quantity (for comparison);pcs;{};{};{};{};{}\n'.format(model_name, *summary))
display_table(table_rows)


Data array:  (11, 2)
Features (11, 1)


0,1,2,3,4,5,6,7,8,9
Model,,mean,,std,,min,,max,
Linear regression,,"8,3%",,"5,5%",,"0,6%",,"27,0%",
