In [None]:
import pyodbc
from meyerDB import cable_connection
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML, display
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeavePOut
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
import sklearn.preprocessing as pp

# init db connection
# Open the ODBC connection described by `cable_connection` (imported from
# meyerDB) and create the cursor that the query code further down executes on.
conn = pyodbc.connect(cable_connection)
cursor = conn.cursor()
# Reached only when connect() and cursor() did not raise.
print('Database connection ok')

def display_table(data):
    """Render ``data`` as an HTML table in the notebook output.

    Parameters
    ----------
    data : iterable of iterables
        Table rows. Every field for which ``round(100 * field, 1)`` succeeds
        is shown as a percentage with one decimal and a decimal comma
        (``0.5`` -> ``50,0%``); any other field (e.g. a header string) is
        shown as-is.

    Notes
    -----
    Field values are inserted into the markup without HTML escaping.
    """
    parts = ["<table>"]
    for row in data:
        parts.append("<tr>")
        for field in row:
            try:
                value = str(round(100*field, 1)).replace('.', ',') + '%'
            except (TypeError, ValueError, OverflowError):
                # Non-numeric field (strings, None, tuples, ...): keep it
                # untouched. Only conversion errors are caught so that
                # KeyboardInterrupt and real bugs still propagate.
                value = field
            # str.format (rather than "%s" % value) keeps tuple fields from
            # being unpacked as format arguments; the cell is closed with
            # </td> so the generated markup is well-formed.
            parts.append("<td><h4>{}</h4></td>".format(value))
        parts.append("</tr>")
    parts.append("</table>")
    display(HTML("".join(parts)))

In [None]:
# Whole ship cabling readiness
#   Predictors: GT
#   Algorithms: K-NN Regression
# NOTE(review): the header names GT as the predictor, but the only feature
# read further down in this cell is `squares` - confirm which one is intended.

# ---- Load the data ----

# Distinct area types: each result row is flattened into one flat list.
cursor.execute("SELECT DISTINCT area_type FROM areas")
area_types = []
for type_row in cursor.fetchall():
    area_types.extend(type_row)

# One row per project: (project_id, summed amount), cast to int32.
ship_totals_sql = "SELECT project_id, sum(amount) FROM routed GROUP BY project_id"
cursor.execute(ship_totals_sql)
data = np.array(cursor.fetchall()).astype('int32')

# Column 0 carries the project ids; these are the units split by the CV below.
ships = data[:, 0]


# Regression models under comparison, keyed by the label shown in the results
# table. Both are distance-weighted k-nearest-neighbour regressors.
models = {
    '1-NN Regression': KNeighborsRegressor(1, weights='distance'),
    '3-NN Regression': KNeighborsRegressor(3, weights='distance'),
}

# Leave-p-out splitter: every split holds out p ships as the test set.
p = 2 #ships
lpo = LeavePOut(p)

# train and evaluate models with lpo

# The inner query counts cables per (project, area); the outer query keeps a
# single area type and drops areas without a `squares` value. `{ship_filter}`
# is only ever filled with the fixed "IN (...)" / "NOT IN (...)" fragment built
# in fetch_area_samples; every value is bound through a `?` parameter instead
# of being formatted into the SQL text.
area_samples_sql = (
    " SELECT squares, cable_count FROM ("
    " SELECT area_dataset.project_id, area_dataset.area, area_dataset.area_type as area_type, areas.squares as squares, count(*) as cable_count"
    " FROM area_dataset"
    " LEFT JOIN areas ON areas.project_id=area_dataset.project_id AND areas.area=area_dataset.area"
    " WHERE area_dataset.project_id {ship_filter}"
    " GROUP BY area_dataset.project_id, area_dataset.area, area_dataset.area_type, squares"
    " ) cable_counts WHERE area_type=? AND squares IS NOT NULL"
)


def fetch_area_samples(held_out_ships, area_type, held_out=False):
    """Fetch (squares, cable_count) rows for one area type.

    Parameters
    ----------
    held_out_ships : list of int
        Project ids of the ships held out by the current split.
    area_type : value from ``area_types``
        Area type to select.
    held_out : bool
        ``True`` returns the rows of the held-out ships (test data),
        ``False`` the rows of all other ships (training data).

    Returns
    -------
    numpy.ndarray
        float32 array with the columns (squares, cable_count); when the query
        returns no rows its ``shape[0]`` is 0.
    """
    placeholders = ", ".join("?" * len(held_out_ships))
    operator = "IN" if held_out else "NOT IN"
    sql = area_samples_sql.format(
        ship_filter="{} ({})".format(operator, placeholders)
    )
    cursor.execute(sql, list(held_out_ships) + [area_type])
    return np.array(cursor.fetchall()).astype('float32')


# Clear stored predictions. Nothing in this cell writes pred_count, so a single
# reset before the loops leaves the table in the same state as resetting it on
# every split.
cursor.execute("UPDATE areas SET pred_count = NULL")
cursor.commit()

table_rows = [['Model', 'mean', 'std', 'max', 'min']]
for model_name, model in models.items():
    scores = []
    for train_i, test_i in lpo.split(ships):
        # Plain Python ints: numpy integer scalars are not reliably accepted
        # as pyodbc query parameters. Works for any p, not only p == 2.
        held_out_ships = [int(ships[i]) for i in test_i]
        preds = []
        y_test = []
        for area_type in area_types:
            # get training data
            train = fetch_area_samples(held_out_ships, area_type, held_out=False)
            if train.shape[0] < model.n_neighbors:
                # k-NN cannot look up k neighbours among fewer than k samples
                continue
            # normalize features (scaler is fitted on training data only)
            scaler = pp.MinMaxScaler()
            X_train = scaler.fit_transform(train[:, [0]])
            y_train = train[:, -1]
            # fit model
            model.fit(X_train, y_train)
            # get test data
            test = fetch_area_samples(held_out_ships, area_type, held_out=True)
            if test.shape[0] == 0:
                # Only an empty test set has to be skipped. Requiring
                # n_neighbors test rows would drop small area types for the
                # larger-k models and score the models on different data.
                continue
            X_test = scaler.transform(test[:, [0]])
            # predict
            preds.append(model.predict(X_test))
            y_test.append(test[:, -1])

        # calculate area predictions cv score for this split
        if y_test:
            # float64 keeps the score arithmetic in double precision
            score = mape(
                np.concatenate(y_test).astype('float64'),
                np.concatenate(preds).astype('float64'),
            )
            scores.append(score)

    scores = np.array(scores)
    if scores.size == 0:
        # No split produced a score: min()/max() would raise on an empty
        # array, so report the model as not evaluated instead.
        table_rows.append((model_name, 'n/a', 'n/a', 'n/a', 'n/a'))
        continue
    # Scores are error rates (MAPE); the table shows 1 - error, so the best
    # ("max") column comes from the smallest error and vice versa.
    table_rows.append((model_name, 1-scores.mean(), scores.std(), 1-scores.min(), 1-scores.max()))
display_table(table_rows)