In [2]:
%load_ext autoreload
%autoreload 2

# DATA MANIPULATION
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# DATA VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns

# VIEWING OPTIONS IN THE NOTEBOOK
from sklearn import set_config; set_config(display='diagram')

In [11]:
data = pd.read_csv('../goodtrainbadtrain/data/data_for_model.csv')

data.head()
X = data[['zugnr', 'city', 'temp', 'coco', 'weekday', 'month', 'time_of_day']]
y = data['target_numeric']

X_test = X.loc[100]

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((91195, 7), (39084, 7), (91195,), (39084,))

In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer

def create_preproc_ordinal():
    feat_ordinal_dict = {
        # considers "missing" as the worse category
        "coco": ['good', 'medium', 'bad', 'extreme']
    }

    feat_ordinal = sorted(feat_ordinal_dict.keys())
    feat_ordinal_values_sorted = [feat_ordinal_dict[i] for i in feat_ordinal]

    encoder_ordinal = OrdinalEncoder(
        categories=feat_ordinal_values_sorted,
        handle_unknown="use_encoded_value",
        unknown_value=-1  # Considers unknown values as worse than "missing"
    )

    preproc_ordinal = make_pipeline(
        SimpleImputer(strategy="constant", fill_value="missing"),
        encoder_ordinal,
        MinMaxScaler()
    )

    return preproc_ordinal, feat_ordinal


def create_preproc_numerical():
    return make_pipeline(
        KNNImputer(),
        MinMaxScaler()
    )

def create_preproc_nominal():
    return make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(handle_unknown="ignore")
    )

In [14]:
def create_preproc(X, percentile = 75):
    """Create an (unfitted) sklearn preprocessing pipeline well suited for the Houses Kaggle Competition

    Args:
        X (pandas.DataFrame): Feature matrix
        percentile (float): Percentage of feature to keep after one hot encoding

    Returns:
        sklearn.pipeline.Pipeline: un-fitted preprocessor
    """
    preproc_ordinal, feat_ordinal = create_preproc_ordinal()
    
    preproc_numerical = create_preproc_numerical()
    feat_numerical = sorted(X.select_dtypes(
        include=["int64", "float64"]).columns)
    
    preproc_nominal = create_preproc_nominal()
    feat_nominal = sorted(
        list(set(X.columns) - set(feat_numerical) - set(feat_ordinal)))
    
    feature_transformer = ColumnTransformer(
        [
        ("numerical_encoder", preproc_numerical, feat_numerical),
        ("ordinal_encoder", preproc_ordinal, feat_ordinal),
        ("nominal_encoder", preproc_nominal, feat_nominal)
        ],
        remainder="drop")

    feature_selector = SelectPercentile(
        mutual_info_regression,
        percentile=percentile,  # keep the xx% most important features
    )

    preproc = make_pipeline(
        feature_transformer,
        feature_selector
    )
    return preproc

In [15]:
preproc = create_preproc(X_train)
preproc

In [16]:
# Fit the preprocessor on the train set
preproc.fit(X_train, y_train)


In [18]:
# Create the preprocessed versions of X_train and X_val
X_train_preproc = preproc.transform(X_train)
X_val_preproc = preproc.transform(X_val)

# Let's also already create the preprocessed version of X_test for our future predictions
#X_test_preproc = preproc.transform(X_test)

In [19]:
# Shapes before preprocessing
X_train.shape, X_val.shape, X_test.shape

((91195, 7), (39084, 7), (7,))

In [21]:
# Shapes after preprocessing
X_train_preproc.shape, X_val_preproc.shape #,X_test_preproc.shape

((91195, 121), (39084, 121))

In [22]:
from tensorflow.keras import Sequential, layers

def initialize_model(X):

    #################################
    #  1 - Model architecture       #
    #################################
    
    model = Sequential()
    
    # Input Layer
    model.add(layers.Dense(20, activation='relu', input_dim = X.shape[-1]))
    
    # Hidden Layers
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(20, activation='relu'))
    
    # Predictive Layer
    model.add(layers.Dense(1, activation='linear'))
    
    ##################################
    #  2 - Our recommended compiler  #
    ##################################
    
    model.compile(optimizer='adam',
                  loss='msle')      # directly optimize for the squared log error!
    
    return model

In [23]:
# 1. Initializing a NeuralNet with its architecture and its compilation method
model = initialize_model(X_train_preproc)
model.summary()

# 2. Training the model
epochs = 500
batch_size = 16

history = model.fit(X_train_preproc, 
                    y_train,
                    validation_data = (X_val_preproc, y_val),
                    epochs = epochs,         # Play with this until your validation loss overfit
                    batch_size = batch_size, # Let's keep a small batch size for faster iterations
                    verbose = 0)

# 3. Evaluating the model
res = model.evaluate(X_val_preproc, y_val, verbose = 0)
print(f"RMLSE achieved after {epochs} epochs = {round(res**0.5,3)}")

# 4. Looking at the lowest loss
minimium_rmlse_val = min(history.history['val_loss'])**0.5
optimal_momentum = np.argmin(history.history['val_loss'])

print(f"Lowest RMLSE achieved = {round(minimium_rmlse_val,3)}")
print(f"This was achieved at the epoch number {optimal_momentum}") 

2022-06-01 13:59:33.113498: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 20)                2440      
                                                                 
 dense_1 (Dense)             (None, 15)                315       
                                                                 
 dense_2 (Dense)             (None, 15)                240       
                                                                 
 dense_3 (Dense)             (None, 20)                320       
                                                                 
 dense_4 (Dense)             (None, 1)                 21        
                                                                 
Total params: 3,336
Trainable params: 3,336
Non-trainable params: 0
_________________________________________________________________


2022-06-01 13:59:35.498111: W tensorflow/core/framework/op_kernel.cc:1733] INVALID_ARGUMENT: TypeError: 'SparseTensor' object is not subscriptable
Traceback (most recent call last):

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 273, in __call__
    return func(device, token, args)

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 151, in __call__
    outputs = self._call(device, args)

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 158, in _call
    ret = self._func(*args)

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 649, in wrapper
    return func(*args, **kwargs)

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/keras/engine/data_adapter.py", line 476, in py

InvalidArgumentError:  TypeError: 'SparseTensor' object is not subscriptable
Traceback (most recent call last):

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 273, in __call__
    return func(device, token, args)

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 151, in __call__
    outputs = self._call(device, args)

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 158, in _call
    ret = self._func(*args)

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 649, in wrapper
    return func(*args, **kwargs)

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/keras/engine/data_adapter.py", line 476, in py_method
    return [slice_array(inp) for inp in flat_inputs]

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/keras/engine/data_adapter.py", line 476, in <listcomp>
    return [slice_array(inp) for inp in flat_inputs]

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/keras/engine/data_adapter.py", line 474, in slice_array
    return training_utils.slice_arrays(data, ind.numpy(),

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/keras/engine/training_utils.py", line 47, in slice_arrays
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/keras/engine/training_utils.py", line 47, in <listcomp>
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

  File "/Users/mariegramm/.pyenv/versions/lewagon/lib/python3.8/site-packages/keras/engine/training_utils.py", line 47, in <listcomp>
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

TypeError: 'SparseTensor' object is not subscriptable


	 [[{{node EagerPyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_845]

In [None]:
def plot_history(history):
    plt.plot(np.sqrt(history.history['loss']))
    plt.plot(np.sqrt(history.history['val_loss']))
    plt.title('Model Loss')
    plt.ylabel('RMSLE')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Val'], loc='best')
    plt.show()

# PREDICTION

In [None]:
#get X_test data
X_test_preproc = preproc.transform(X_test)