In [1]:
import csv
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers

In [2]:
# Directory that holds the competition CSV files.  Set the
# STORE_SALES_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset (or empty) the original local path is used.
# os.path.join(..., "") guarantees a trailing separator because later cells
# build file names with plain string concatenation (file_path + file_name).
file_path = os.path.join(
    os.environ.get("STORE_SALES_DATA_DIR")
    or "/Users/kiyoshitakeuchi/Desktop/Machine Learning/store-sales-time-series-forecasting/",
    "",
)
file_name = "train.csv"
test_file_name = "test.csv"

In [3]:
# Load the training CSV; file_path must end with a path separator because the
# file name is appended by plain string concatenation.
train_set = pd.read_csv(file_path + file_name)

In [4]:
# Name of the target column that is later popped out of the training features.
label = "sales"
# Load the test CSV from the same directory as the training file.
test_set = pd.read_csv(file_path + test_file_name)

In [5]:
def features_divider_names(data_frame):
    """Split the column names of ``data_frame`` into two lists.

    Returns ``[categorical, numerical]``.  ``categorical`` is always
    ``["store_nbr"]`` (whether or not the frame has that column);
    ``numerical`` is every other column name in frame order.  No dtype check
    is made, so string columns also land in the second list.
    """
    categorical = ["store_nbr"]
    numerical = []
    for column_name in data_frame.columns:
        if column_name in categorical:
            continue
        numerical.append(column_name)
    return [categorical, numerical]

# Module-level name lists.  `categorical` is read as a global inside
# preprocess_data; `numerical` is not used anywhere else in the visible cells.
categorical, numerical = features_divider_names(train_set)

In [6]:
def remove_nan(data_frame):
    """Fill missing values in ``data_frame`` in place; returns ``None``.

    Missing entries in numeric columns are replaced by that column's median.
    Whatever is still missing afterwards is replaced by the string ``"?"``.
    """
    numeric_columns = data_frame.select_dtypes(include='number')
    median_by_column = numeric_columns.median().to_dict()

    # First pass: per-column medians for the numeric columns.
    data_frame.fillna(value=median_by_column, inplace=True)
    # Second pass: placeholder for everything the medians did not cover.
    data_frame.fillna(value="?", inplace=True)

# Both frames are modified in place (remove_nan returns None), so re-running
# earlier display cells would show the filled values, not the raw CSV content.
remove_nan(test_set)
remove_nan(train_set)

In [7]:
# Columns that are to be handled as categories: map each one to the "str"
# dtype and cast the test frame accordingly.
numerical_categorical_features = ["store_nbr"]
ncf_dict = dict.fromkeys(numerical_categorical_features, "str")
test_set = test_set.astype(ncf_dict)

In [8]:
# Work on a copy so train_set keeps the target column, apply the same string
# cast used for the test frame, then split the target off into `labels`.
features = train_set.copy()
features = features.astype(ncf_dict)
labels = features.pop(label)

In [11]:
# Column name -> NumPy array for each frame; these dicts are what
# model.fit / model.predict receive.  np.array makes its own copy of every
# column, so the frames themselves are left untouched.
features_dict = {
    column_name: np.array(column)
    for column_name, column in features.items()
}
test_features_dict = {
    column_name: np.array(column)
    for column_name, column in test_set.items()
}

In [14]:
def preprocess_data(data_frame):
    """Build the Keras preprocessing model for the columns of ``data_frame``.

    One ``tf.keras.Input`` of shape ``(1,)`` is created per column:
    ``tf.string`` for object-dtype columns, ``tf.float32`` for all others.
    The float inputs are concatenated and normalised by a ``Normalization``
    layer adapted on those same columns of ``data_frame``.  Every other input
    (string inputs, plus any column named in the module-level ``categorical``
    list) goes through a ``StringLookup`` whose vocabulary is the column's
    unique values, followed by ``CategoryEncoding``.  All encoded pieces are
    concatenated into a single tensor.

    Args:
        data_frame: pandas DataFrame of feature columns.  It is only read;
            the caller's frame is not modified.

    Returns:
        ``[inputs, preprocessing]`` where ``inputs`` maps column name to its
        ``tf.keras.Input`` and ``preprocessing`` is the ``tf.keras.Model``
        from those inputs to the concatenated encoded tensor.
    """
    inputs = {}
    for name, column in data_frame.items():
        dtype = tf.string if column.dtype == object else tf.float32
        inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

    numeric_inputs = {name: tensor for name, tensor in inputs.items()
                      if tensor.dtype == tf.float32}

    x = layers.Concatenate()(list(numeric_inputs.values()))
    norm = layers.Normalization()
    # Adapt on the frame that was passed in (not on a notebook global), so the
    # statistics always match the columns the inputs were created from.
    norm.adapt(np.array(data_frame[list(numeric_inputs)]))
    all_numeric_inputs = norm(x)

    preprocessed_inputs = [all_numeric_inputs]

    for name, tensor in inputs.items():
        # Plain numeric columns were already handled by the normaliser.
        if tensor.dtype == tf.float32 and name not in categorical:
            continue

        # Cast on a local Series so the caller's frame is not written to.
        # NOTE(review): a categorical column that is not object dtype still has
        # a float32 Input at this point; the notebook avoids that by casting
        # store_nbr to str before calling this function - confirm for new
        # categorical columns.
        values = data_frame[name]
        if name in categorical:
            values = values.astype(str)

        lookup = tf.keras.layers.StringLookup(vocabulary=np.unique(values))
        one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())
        preprocessed_inputs.append(one_hot(lookup(tensor)))

    preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

    preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

    return [inputs, preprocessing]

In [15]:
def model(data_frame):
    """Assemble and compile the regression network for ``data_frame``.

    The per-column inputs and the preprocessing model come from
    ``preprocess_data``.  The preprocessed tensor feeds three 64-unit ReLU
    dense layers and a single-unit output layer.  The returned
    ``tf.keras.Model`` is compiled with mean-absolute-error loss and an Adam
    optimizer constructed with 0.01.
    """
    hidden_layers = [layers.Dense(64, activation='relu') for _ in range(3)]
    body = tf.keras.Sequential(hidden_layers + [layers.Dense(1)])

    inputs, preprocessing_head = preprocess_data(data_frame)
    predictions = body(preprocessing_head(inputs))

    network = tf.keras.Model(inputs, predictions)
    network.compile(
        optimizer=tf.keras.optimizers.Adam(0.01),
        loss='mean_absolute_error',
    )
    return network

In [16]:
# NOTE(review): this rebinds the name `model` from the builder function to the
# compiled Keras model, so the cell only works once per definition of the
# function; re-run the cell that defines `model` before running this again.
model = model(features)

In [17]:
# Train for 10 epochs, holding out 20% of the rows for validation.
# NOTE(review): no batch_size is passed; the recorded run shows 75023 steps per
# epoch and was interrupted, so a larger batch size may be worth trying.
# Presumably the validation rows are taken from the end of the arrays - confirm
# against the Keras Model.fit documentation.
model.fit(x=features_dict, y=labels, validation_split = 0.2, epochs=10)

Epoch 1/10
 2005/75023 [..............................] - ETA: 17:14 - loss: 137.2235

KeyboardInterrupt: 

In [None]:
# Predictions for the test rows; the submission cell reads element[0] of each
# row, i.e. it expects one value per row in a 2-D result.
array = model.predict(test_features_dict)

In [None]:
# Write the submission file: one row per test sample with its id and the
# predicted sales value.  Ids come from the test frame itself instead of a
# hard-coded starting number, so the file stays correct if the test set
# changes.  (`csv` is imported in the import cell at the top of the notebook.)
if len(test_set["id"]) != len(array):
    raise ValueError(
        f"expected {len(test_set['id'])} predictions, got {len(array)}"
    )

with open('venezuela_solution_8.csv', 'w', newline='') as file:
    fieldnames = ['id', 'sales']
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(
        {'id': row_id, 'sales': prediction[0]}
        for row_id, prediction in zip(test_set["id"], array)
    )