In [1]:
#@title
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#@title
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import sklearn as skl
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from scipy import ndimage

# changes data into Pytorch tensors, so they can be used for models
def createTrainTensors(train):
    # Train data
    np_train_data = train.to_numpy()
    x = torch.from_numpy(np_train_data.astype(np.float32))
    return x


def createParamsTensors(train, params):
    # Params data
    np_params_data = params.to_numpy()
    np_params_data_reshape = np_params_data.reshape(len(train.T), 1)
    # np_params_data_reshape = np_params_data.reshape(500, 55,1)

    z = torch.from_numpy(np_params_data_reshape)  # .astype)#(np.float32))

    return z


def createParamsNumpy(train, params):
    np_params_data = params.to_numpy()
    y = np_params_data.reshape(len(train.T), 1)
    # y = np_params_data.reshape(500, 55 ,1)

    return y


def createTestTensors(test):
    # Test data
    if len(test) > 0:
        np_test_data = test.to_numpy()
        x_test = torch.from_numpy(np_test_data.astype(np.float32))


#Calculates median for dataframe
def calculateMedianValue(data):

    #medianData = data.T
    #medianData = medianData.rolling(3).median()
    #medianData = medianData.T
    #medianData = medianData.fillna(1)

    medianData = ndimage.median_filter(data, 3)
    return medianData

#first 30 values are replaced by the value of 1, based on the baseline solution
def replaceFirstXValues(data):
    for i in range(10):
        data[i] = 1
    return data

#removes first x values from dataframe
def removeFirstValues(data, to_remove):
    if type(data) != np.ndarray:
        for i in range(to_remove):
            data = data.drop(labels=i, axis=1)

    else:
        data = data[:, to_remove:]

    return data
#normalizes values for each measurement(1-300)
def normalizeDataFrame(data_frame):

    values = data_frame  # .values
    values = values.T
    min_max_scaler = preprocessing.MinMaxScaler()
    scaled_values = min_max_scaler.fit_transform(values)
    scaled_values = scaled_values.T

    norm_data = pd.DataFrame(scaled_values)
    return norm_data

def replaceValuesOverOne(data_frame):
    cleaned_list = []
    for element in data_frame:
        for value in element:
            if value > 1:
                cleaned_list.append(1)
            else:
                cleaned_list.append(value)
    np_cleaned = np.array(cleaned_list)
    np_cleaned = np_cleaned.reshape(len(np_cleaned),1)
    return np_cleaned

def zScoreDataFrame(data_frame, mean_value, standard_deviation):
    if type(data_frame) != np.ndarray:
       data_frame = data_frame.to_numpy()

    #mean_value = np.mean(data_frame)
    #standard_deviation = np.std(data_frame)

    scaled_data_frame = (data_frame - mean_value) / standard_deviation
    #print(f"Rescaling data with mean = {mean_value}, std = {standard_deviation}")
    return scaled_data_frame

def rescaleData(data_frame, mean_value, standard_deviation):
    data_unscaled = data_frame * standard_deviation + mean_value
    return data_unscaled

def preprocessData(data_frame, median, replaceValuesBiggerOne, replaceFirstXValues, removeFirstXValues,
                   normalizeData, zScore, zScore_mean, zScore_sd, number_to_remove):

    if median == True:
        data_frame = calculateMedianValue(data_frame)
    if replaceValuesBiggerOne == True:
        data_frame = data_frame.apply(replaceValuesOverOne)
    if replaceFirstXValues == True:
        data_frame = replaceFirstXValues(data_frame)
    if removeFirstXValues == True:
        data_frame = removeFirstValues(data_frame, number_to_remove)
    if normalizeData == True:
        data_frame = normalizeDataFrame(data_frame)
    if zScore == True:
        data_frame = zScoreDataFrame(data_frame, zScore_mean, zScore_sd)

    return data_frame

def preprocessParams(data_frame):
    data_frame = normalizeDataFrame(data_frame)

    return data_frame

def preprocessParameters(data):
    zscore_scaler = StandardScaler()
    data = zscore_scaler.fit_transform(data)

    return data


In [3]:
#@title
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow import keras
from tensorflow.keras import layers, activations
from tensorflow.keras.layers import Dropout


# Implements scoring system for evaluation
def scoringSystem(y_true, y_pred):
    #print(type(y_true))

    weight = tf.ones_like(y_true)
    absolute = tf.math.abs(y_pred - y_true)
    # neuer stand
    #print("absolut: ", absolute)
    counter = tf.math.reduce_sum((weight * 2 * y_true * absolute))
    denominator = tf.math.reduce_sum(weight)
    print(denominator)

    formula = 1e4 - (counter / denominator) * 1e6

    return formula

class simpleRegression(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(simpleRegression, self).__init__()
        self.l1 = nn.Linear(input_size, output_size)

    def forward(self, x):
        out = self.l1(x)
        return out


class feedForward(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(feedForward, self).__init__()
        #self.flatten = nn.Flatten()
        self.l1 = nn.Linear(input_size, 150)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(150, output_size)

    def forward(self, x):
        #out = self.flatten(x)
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        return out


def sequentialModel(inputs, lr_rate):
    model = keras.Sequential([
        keras.layers.InputLayer((55, 300)),
        keras.layers.Flatten(),
       # layers.Activation(activations.selu),
       # layers.Dense(2048),
       # layers.Activation(activations.selu),
       # layers.Dense(1024),
        #layers.Activation(activations.selu),
        #layers.Dense(512),
        layers.Activation(activations.selu),
       # layers.Dense(256),
       # layers.Activation(activations.selu),
        layers.Dense(128),
        layers.Activation(activations.selu),
        layers.Dense(64),
        layers.Activation(activations.selu),
        layers.Dense(55)
    ])

    optimizer = keras.optimizers.Adamax()

    model.compile(loss="mse",
                  optimizer=optimizer,
                  metrics=[scoringSystem])

    return model


def sequentialDropout(inputs, lr_rate):
    dropout_rate = 0.3
    model = keras.Sequential([
        layers.Dropout(dropout_rate),
        keras.layers.InputLayer((55, 300)),
        keras.layers.Flatten(),
        #layers.Dropout(dropout_rate),
        layers.Activation(activations.selu),
        layers.Dense(256),
        layers.Activation(activations.selu),
        #layers.Dropout(dropout_rate),
        layers.Dense(128),
        layers.Activation(activations.selu),
        #layers.Dropout(dropout_rate),
        layers.Dense(64),
        layers.Activation(activations.selu),
        #layers.Dropout(dropout_rate),
        #layers.Dense(32),
        #layers.Dropout(dropout_rate),
        layers.Dense(55)
    ])

    optimizer = keras.optimizers.Adam(learning_rate=lr_rate)

    model.compile(loss="mse",
                  optimizer=optimizer,
                  metrics=[scoringSystem])
    return model


def lstmModel(inputs):
    model = keras.Sequential([
        layers.LSTM(inputs),
        layers.LSTM(1)
    ])
    optimizer = keras.optimizers.Adam()

    model.compile(loss="mse",
                  optimizer=optimizer,
                  metrics=[scoringSystem])

    return model


def cnnModel(number_of_measurements):
    model = keras.Sequential([
        keras.layers.InputLayer(input_shape=(55, number_of_measurements)),
        layers.Conv1D(filters=32, kernel_size=5),
        layers.MaxPooling1D(pool_size=2),
        layers.Activation(activations.selu),
        layers.Conv1D(filters=32, kernel_size=5),
        layers.MaxPooling1D(pool_size=3),
        layers.Activation(activations.selu),
        layers.Conv1D(filters=32, kernel_size=5),
        layers.MaxPooling1D(pool_size=3),
        layers.Flatten(),
        layers.Dense(1024),
        layers.Activation(activations.selu),
        layers.Dense(1024),
        layers.Activation(activations.selu),
        layers.Dense(1024),
        layers.Activation(activations.selu),
        layers.Dense(1024),
        layers.Activation(activations.selu),
        layers.Dense(55),
    ])
    optimizer = keras.optimizers.Adam(learning_rate=0.0001)

    model.compile(loss="mse",
                  optimizer=optimizer,
                  metrics=[scoringSystem])

    return model


def cnnModelFilters(number_of_measurements):
    model = keras.Sequential([
        keras.layers.InputLayer(input_shape=(55, number_of_measurements)),
        layers.Conv1D(filters=32, kernel_size=5),
        layers.MaxPooling1D(pool_size=2),
        layers.Activation(activations.selu),
        layers.Conv1D(filters=32, kernel_size=5),
        layers.MaxPooling1D(pool_size=2),
        layers.Activation(activations.selu),
        layers.Conv1D(filters=32, kernel_size=5),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dense(1024),
        layers.Activation(activations.selu),
        layers.Dense(512),
        layers.Activation(activations.selu),
        layers.Dense(256),
        layers.Activation(activations.selu),
        layers.Dense(128),
        layers.Activation(activations.selu),
        layers.Dense(55),
    ])
    optimizer = keras.optimizers.Adam(learning_rate=0.0001)

    model.compile(loss="mae",
                  optimizer=optimizer,
                  metrics=[scoringSystem])

    return model

In [4]:
#@title
import pandas as pd
import numpy as np


def importTrainingData(files, stop_read, count):
    # Import Training files
    if count%2 == 0:
        print("Importing Training Data...", "Iteration ", count)
    else:
        print("Importing Validation Data...", "Iteration ", count-1)
    training_list = []

    # Add first n files to list
    for filename in files:
        df = pd.read_csv(filename, header=None, sep="\t", skiprows=6)
        training_list.append(df)
        count = count + 1
        if count == stop_read:
            break

    # Convert List content to Dataframe
    train_data = pd.concat(training_list, axis=0, ignore_index=True)

    return train_data


def importParamsData(files, stop_read, count):
    # Import Params
    if count % 2 == 0:
        print("Importing Training Params...", "Iteration ", count)
    else:
        print("Importing Validation Params...", "Iteration ", count - 1)
    params_list = []

    # Add first n files to list
    for filename in files:
        df = pd.read_csv(filename, header=None, sep="\t", skiprows=2)
        params_list.append(df)
        count = count + 1
        if count == stop_read:
            break

    # Convert List content to Dataframe
    params_data = pd.concat(params_list, axis=0, ignore_index=True)

    return params_data

def importTestData(test_files):
    li = []

    for filename in test_files:
        df = pd.read_csv(filename, header=None, sep="\t", skiprows=6)
        li.append(df)

    test_frame = pd.concat(li, axis=0, ignore_index=True)

    return test_frame

def importTargetParameter(files, stop_read, count):
    train_parameters = []
    # Add first n files to list
    for filename in files:
        df = pd.read_csv(filename, header=None, sep=" ", nrows=2)
        train_parameters.append(df)
        count = count + 1
        if count == stop_read:
            break

    # Convert List content to Dataframe
    parameter_data = pd.concat(train_parameters, axis=0, ignore_index=True)

    return parameter_data


def importTrainParameter(files, stop_read, count):
    train_parameters = []

    # Add first n files to list
    for filename in files:
        df = pd.read_csv(filename, header=None, sep=" ", nrows=6)
        for x in range(55):
            train_parameters.append(df)
        count = count + 1
        if count == stop_read:
            break

    # Convert List content to Dataframe
    parameter_data = pd.concat(train_parameters, axis=0, ignore_index=True)

    return parameter_data

def importTrainParameterSingle(files, stop_read, count):
    train_parameters = []

    # Add first n files to list
    for filename in files:
        df = pd.read_csv(filename, header=None, sep=" ", nrows=6)
        #for x in range(55):
        train_parameters.append(df)
        count = count + 1
        if count == stop_read:
            break

    # Convert List content to Dataframe
    parameter_data = pd.concat(train_parameters, axis=0, ignore_index=True)

    return parameter_data

def importEvalParameter(files):
    eval_parameters = []

    # Add first n files to list
    for filename in files:
        df = pd.read_csv(filename, header=None, sep=" ", nrows=6)
        for x in range(55):
            eval_parameters.append(df)

    # Convert List content to Dataframe
    parameter_data = pd.concat(eval_parameters, axis=0, ignore_index=True)

    return parameter_data

In [5]:
#@title
#Tensorflow
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras import layers, activations
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from sklearn.ensemble import RandomForestRegressor

#Pandas and numpy for data formats
import pandas as pd
import numpy as np

#glob for data import
import glob
import random
#PyTorch
import torch
import torch.nn as nn
#SK Learn
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
#from sklearn.utils import shuffle
import math
import sys
#MatPlotLib
import matplotlib.pyplot as plt

#importTrainData()
train_data = np.load("/content/drive/MyDrive/BA_data/Data 40000/train_40000.npy")
params_data = np.load("/content/drive/MyDrive/BA_data/Data 40000/train_target_40000.npy")
#train_parameter = np.load("../train_parameter_40000.npy")
#
validation_data = np.load("/content/drive/MyDrive/BA_data/Data 40000/val_train_40000.npy")
validation_params = np.load("/content/drive/MyDrive/BA_data/Data 40000/val_target_40000.npy")
#validation_parameter = np.load("../val_parameter_40000.npy")

#Create stats for zScore
mean_value_target = np.mean(params_data)
standard_deviation_target = np.std(params_data)

mean_value = np.mean(train_data)
standard_deviation = np.std(train_data)
measurements_to_remove = 30


median_train = False
replaceValuesBiggerOne_train=False
replaceFirstXValues_train=False
removeFirstXValues_train=True
normalizeData_train=False
zScore_train=True

train_data = preprocessData(train_data,
                          median=median_train,
                          replaceValuesBiggerOne = replaceValuesBiggerOne_train,
                          replaceFirstXValues = replaceFirstXValues_train,
                          removeFirstXValues = removeFirstXValues_train,
                          normalizeData = normalizeData_train,
                          zScore = zScore_train,
                          zScore_mean = mean_value,
                          zScore_sd = standard_deviation,
                          number_to_remove = measurements_to_remove)

train_target = preprocessData(params_data,
                          median=False,
                          replaceValuesBiggerOne=False,
                          replaceFirstXValues=False,
                          removeFirstXValues=False,
                          normalizeData=False,
                          zScore=False,
                          zScore_mean=mean_value_target,
                          zScore_sd=standard_deviation_target,
                          number_to_remove=measurements_to_remove)

validation_data = preprocessData(validation_data,
                          median = median_train,
                          replaceValuesBiggerOne = replaceValuesBiggerOne_train,
                          replaceFirstXValues = replaceFirstXValues_train,
                          removeFirstXValues = removeFirstXValues_train,
                          normalizeData = normalizeData_train,
                          zScore = zScore_train,
                          zScore_mean = mean_value,
                          zScore_sd = standard_deviation,
                          number_to_remove = measurements_to_remove)

validation_target = preprocessData(validation_params,
                          median=False,
                          replaceValuesBiggerOne=False,
                          replaceFirstXValues=False,
                          removeFirstXValues=False,
                          normalizeData=False,
                          zScore=False,
                          zScore_mean=mean_value_target,
                          zScore_sd=standard_deviation_target,
                          number_to_remove=measurements_to_remove)


number_of_measurements = 300
if removeFirstXValues_train == True:
    number_of_measurements = 300 - measurements_to_remove    
    
#reshaping train data
if type(train_data) != np.ndarray:
    train_data = train_data.to_numpy().reshape(-1, 55, number_of_measurements)
else:
    train_data = train_data.reshape(-1, 55, number_of_measurements)

#reshaping target data
if type(params_data) != np.ndarray:
    params_data = params_data.to_numpy()

#reshaping val data
if type(validation_data) != np.ndarray:
    validation_data = validation_data.to_numpy().reshape(-1, 55, number_of_measurements)
else:    
    validation_data = validation_data.reshape(-1, 55, number_of_measurements)

#reshaping val target
if type(validation_params) != np.ndarray:
    validation_params = validation_params.to_numpy()

    
def scaleToMaxToOne(dat, min, max):
    scaled_dat = dat / max
    return scaled_dat

def unscaleMaxToOne(scaled_dat, min, max):
    unscaled_dat = scaled_dat * max
    return unscaled_dat

min_target = np.min(train_target) 
max_target = np.max(train_target)
print(f"Rescaling targets with max = {max_target}")
train_target = scaleToMaxToOne(train_target, min_target, max_target)
val_target = scaleToMaxToOne(validation_target, min_target, max_target)    

prevent_overfitting = keras.callbacks.EarlyStopping(monitor="val_loss", patience = 20, restore_best_weights=True)
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=10, min_delta=0.0001)

Rescaling targets with max = 0.6713607926990712


In [6]:
train_data_reshaped = train_data.reshape(2200000, 270)

In [7]:
train_target_reshaped = train_target.reshape(2200000,)

In [8]:
validation_data_reshaped = validation_data.reshape(550000, 270)

##Model 1

In [9]:
rfr = RandomForestRegressor(n_estimators = 100, verbose=1, n_jobs = -1)

rfrModel = rfr.fit(train_data_reshaped, train_target_reshaped)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 238.3min finished


In [10]:
prediction = rfrModel.predict(validation_data_reshaped)

[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    2.9s finished


In [14]:
pred_final = unscaleMaxToOne(prediction, min_target, max_target)

pred_final_reshape = pred_final.reshape(10000,55)

score = scoringSystem(validation_target, pred_final_reshape)
print(score)

tf.Tensor(550000.0, shape=(), dtype=float64)
tf.Tensor(9135.374302616452, shape=(), dtype=float64)


##Model 2

In [16]:
rfr2 = RandomForestRegressor(n_estimators = 100, verbose=1, n_jobs = -1)

rfr2Model = rfr2.fit(train_data_reshaped, train_target_reshaped)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 224.0min finished


In [17]:
prediction2 = rfr2Model.predict(validation_data_reshaped)

[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 100 out of 100 | elapsed:    3.0s finished


In [18]:
pred_final2 = unscaleMaxToOne(prediction2, min_target, max_target)

pred_final_reshape2 = pred_final2.reshape(10000,55)

score2 = scoringSystem(validation_target, pred_final_reshape)
print(score2)

tf.Tensor(550000.0, shape=(), dtype=float64)
tf.Tensor(9135.374302616452, shape=(), dtype=float64)


##Model 3

In [None]:
rfr3 = RandomForestRegressor(n_estimators = 100, verbose=1, n_jobs = -1)

rfr3Model = rfr3.fit(train_data_reshaped, train_target_reshaped)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 40 concurrent workers.


In [None]:
prediction3 = rfr3Model.predict(validation_data_reshaped)

In [None]:
pred_final3 = unscaleMaxToOne(prediction3, min_target, max_target)

pred_final_reshape3 = pred_final3.reshape(10000,55)

score3 = scoringSystem(validation_target, pred_final_reshape)
print(score3)

##Evaluation

In [None]:
#per Hand