In [33]:
import pandas as pd
import tensorflow.keras.layers as tfl
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [34]:
# Read the prepared train and test tables from the local data/ directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_prepared.csv", "data/test_prepared.csv")
)

In [35]:
# Display the column labels of the training frame.
train.columns

Index(['id', 'Rating', 'maincateg', 'platform', 'price1', 'actprice1',
       'norating1', 'noreviews1', 'star_5f', 'star_4f', 'star_3f', 'star_2f',
       'star_1f', 'fulfilled1', 'combo', 'category'],
      dtype='object')

In [36]:
# Remove the "id" column from both frames (in place); the test ids are kept
# in `test_id` before the column is dropped.
train.drop(columns="id", inplace=True)
test_id = test["id"]
test.drop(columns="id", inplace=True)

# "price1" is the regression target; every other column is a feature.
y = train["price1"]
X = train.drop(columns=["price1"])
assert len(X.columns) == len(test.columns), "X and test have different number of columns"

In [37]:
# Columns handed to the one-hot encoder step of the preprocessor.
columns_to_one_hot = [
    "maincateg",
    "category",
    "platform",
    "combo",
]

# Columns earmarked for standard scaling.
columns_to_standardize = [
    "actprice1",
    "norating1",
    "noreviews1",
    "star_5f",
    "star_4f",
    "star_3f",
    "star_2f",
    "star_1f",
]

In [38]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split the same between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
# Instantiated but not wired into the preprocessor below.
label_encoder = LabelEncoder()

# Dense one-hot output. handle_unknown="ignore" encodes a category that was not
# present when the encoder was fitted as an all-zero row instead of raising, so
# transforming data other than the fitted split cannot fail on unseen values.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 (and `sparse` was removed in 1.4); try the new name first and
# fall back to the old one on releases that do not know it.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
standardizer = StandardScaler()

# One-hot encode the categorical columns; every other column is passed through
# unchanged. The standardizer step is left disabled.
preprocessor = ColumnTransformer(
    [
        ("one_hot_encoder", one_hot_encoder, columns_to_one_hot),
        # ("standardizer", standardizer, columns_to_standardize)
    ],
    remainder="passthrough",
)

In [41]:
# Fit the preprocessor on the training split only, then apply the same fitted
# transform to the training split, the validation split and the competition
# test set. The three names are rebound to the transformed outputs.
preprocessor.fit(X_train)
X_train, X_test, test = (
    preprocessor.transform(frame) for frame in (X_train, X_test, test)
)

In [42]:
# Sanity check: the transformed test set must have as many columns as the
# transformed training split.
assert test.shape[1] == X_train.shape[1], "X and test have different number of columns"

In [43]:
# Number of feature columns after preprocessing; used as the model's input width.
input_shape = X_test.shape[-1]

In [44]:
# Seed TensorFlow's global random generator so that runs from a fresh kernel
# are repeatable; 42 matches the random_state used for the train/validation split.
tf.random.set_seed(42)

# Widths of the ReLU hidden layers, from first to last.
hidden_units = (256, 128, 64, 32, 16, 8)

# Fully connected regression network: input -> hidden stack -> one linear output.
input_ = tfl.Input(shape=(input_shape,))
x = input_
for units in hidden_units:
    x = tfl.Dense(units=units, activation="relu")(x)
output = tfl.Dense(units=1, activation="linear")(x)
model = Model(inputs=input_, outputs=output)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 27)]              0         
                                                                 
 dense_13 (Dense)            (None, 256)               7168      
                                                                 
 dense_14 (Dense)            (None, 128)               32896     
                                                                 
 dense_15 (Dense)            (None, 64)                8256      
                                                                 
 dense_16 (Dense)            (None, 32)                2080      
                                                                 
 dense_17 (Dense)            (None, 16)                528       
                                                                 
 dense_18 (Dense)            (None, 8)                 136 

In [45]:
# Train with Adam on mean squared error; MSE and MAE are also reported as metrics.
model.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mse", "mae"],
)

In [46]:
# Train for 20 epochs, evaluating on the held-out split after each epoch.
# The returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    validation_data=(X_test, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [28]:
# Predict on the preprocessed competition test set.
preds = model.predict(test)
# Drop only the trailing single-unit output axis. Squeezing without an axis
# would also remove the batch axis when there is a single row, producing a
# 0-d value instead of a 1-D array.
preds = tf.squeeze(preds, axis=-1).numpy()
preds



array([416.96716, 315.97168, 493.39142, ..., 432.57184, 260.3942 ,
       657.8312 ], dtype=float32)

In [29]:
# Load the sample submission file; it is used as the template for the output CSV.
sample = pd.read_csv("data/Sample__submission.csv")

In [31]:
# Store the predictions in the "price1" column. Item assignment is used instead
# of attribute assignment because `sample.price1 = ...` does not create a column
# when one is missing (pandas only sets a plain attribute), which would leave
# the written CSV without predictions.
# NOTE(review): assumes the rows of `sample` are in the same order as the test
# set that produced `preds` - confirm.
sample["price1"] = preds
sample.to_csv('data/submission_final.csv', index=False)

In [32]:
# Upload data/submission_final.csv to the "sa2022" competition with the Kaggle
# CLI; the submission message (-m) is left empty.
!kaggle competitions submit -c sa2022 -f data/submission_final.csv -m ""

Successfully submitted to Final Capstone Project



  0%|          | 0.00/82.3k [00:00<?, ?B/s]
 10%|▉         | 8.00k/82.3k [00:00<00:01, 60.3kB/s]
100%|██████████| 82.3k/82.3k [00:05<00:00, 14.4kB/s]
