In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.activations import linear, relu, sigmoid

In [None]:
# new data: competition training set

data_new = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')


# old data: the additional used-cars dataset that is appended to the training set

data_old = pd.read_csv('/kaggle/input/old-data/used_cars.csv')


# reformatting old data

def _strip_and_parse(column, tokens):
    """Remove each literal token in `tokens` from a string column and parse it as numbers.

    `regex=False` is passed explicitly: the default of `Series.str.replace`
    differs between pandas versions, and under regex semantics '$' matches
    end-of-string (removing nothing) while the '.' in 'mi.' matches any
    character.
    """
    cleaned = column
    for token in tokens:
        cleaned = cleaned.str.replace(token, '', regex=False)
    # One vectorised parse instead of calling pd.to_numeric once per element.
    return pd.to_numeric(cleaned.str.strip())


data_old['milage'] = _strip_and_parse(data_old['milage'], ['mi.', ','])
data_old['price'] = _strip_and_parse(data_old['price'], ['$', ','])

# Give the old rows ids that continue after the last competition id. The range
# is derived from the data (rather than hard-coded) so that the id column always
# has exactly one value per old row, whatever the sizes of the two files.
first_new_id = int(data_new['id'].max()) + 1
new_id = pd.DataFrame(
    {'id': np.arange(first_new_id, first_new_id + len(data_old))},
    index=data_old.index,
)
data_old = pd.concat([new_id, data_old], axis=1)

# final data: competition rows first, old rows appended at the end

data = pd.concat([data_new, data_old], axis=0, ignore_index=True)
data.head()

In [None]:
# one-hot encoding
#
# Split the merged frame into the numeric columns (scaled in a later cell) and
# the categorical columns, then expand the categorical ones into indicator columns.

numeric_columns = ['milage', 'model_year']
categorical_columns = ['fuel_type', 'accident', 'clean_title', 'brand', 'transmission', 'engine']

numerical_data = data[numeric_columns]
categorical_data = data[categorical_columns]

# handle_unknown='ignore': categories not seen during fit are encoded as
# all-zero rows at transform time instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(categorical_data)

# The encoder returns a sparse matrix; densify it so it can be wrapped in a DataFrame.
ohe_data = ohe.transform(categorical_data).toarray()

In [None]:
# normalize numerical data
#
# The Normalization layer learns the per-column mean and variance in `adapt`
# and applies them when it is called.

numeric_names = ['milage', 'model_year']

norm_l = tf.keras.layers.Normalization(axis=-1)
norm_l.adapt(numerical_data.to_numpy())  # learns mean, variance

scaled_values = norm_l(numerical_data)
numerical_data_norm = pd.DataFrame(scaled_values, columns=numeric_names)
numerical_data_norm

In [None]:
# Feature matrix: the two scaled numeric columns followed by the one-hot columns
# (the one-hot columns get integer labels 0..k-1 from the DataFrame constructor).
one_hot_frame = pd.DataFrame(ohe_data)
X = pd.concat([numerical_data_norm, one_hot_frame], axis=1)
X

In [None]:
# input shape: (rows, feature columns) of X; the column count is what the
# network's Input layer is sized from

X.shape

In [None]:
# output: the regression target; the double brackets keep it as a one-column
# DataFrame rather than a Series

y=data[['price']]

# output shape: (rows, 1)

y.shape

In [None]:
# NN: fully connected regressor with three ReLU hidden layers and a single linear output

tf.random.set_seed(1234)  # fixed TensorFlow seed, applied to achieve consistent results

n_features = X.shape[1]
hidden_widths = (256, 128, 32)

model = Sequential(
    [tf.keras.Input(shape=(n_features,))]
    + [Dense(width, activation='relu') for width in hidden_widths]
    + [Dense(1, activation='linear')]
)

In [None]:
# Print the network's layers with their output shapes and parameter counts.
model.summary()

In [None]:
# Stop when the validation loss has not improved for 10 consecutive epochs and
# roll the model back to the weights of its best epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)


model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

# `validation_split` holds out the LAST 20% of the rows it is given, and it does
# so before any shuffling. The rows of X/y are the competition data followed by
# the much smaller old dataset, so without this step every old row would end up
# in the validation set and none of them would ever be trained on. Shuffle the
# rows once, with a fixed seed, so the hold-out is a random sample of both sources.
shuffle_order = np.random.default_rng(1234).permutation(len(X))
X_train = X.iloc[shuffle_order]
y_train = y.iloc[shuffle_order]

history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=100,
    callbacks=[callback],
)

In [None]:
# plotting history: validation and training loss per epoch on one set of axes

curves = [('val_loss', 'validation'), ('loss', 'training')]

fig, ax = plt.subplots()
for history_key, _ in curves:
    ax.plot(history.history[history_key])

ax.set_title('model: loss vs epoch ')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend([label for _, label in curves], loc='upper left')
plt.show()

In [None]:
# prediction on unlabeled data

unlabeled_data = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')
test_ids = unlabeled_data['id']

# Test-time features get their own names so the training-time `X`,
# `numerical_data`, `categorical_data`, ... are not overwritten; re-running an
# earlier cell after this one then still sees the training data.
numerical_test = unlabeled_data[['milage', 'model_year']]
categorical_test = unlabeled_data[['fuel_type', 'accident', 'clean_title', 'brand', 'transmission', 'engine']]

# Reuse the encoder and the normalization layer fitted on the training data;
# categories unseen at fit time become all-zero rows (handle_unknown='ignore').
ohe_test = ohe.transform(categorical_test).toarray()
numerical_test_norm = pd.DataFrame(norm_l(numerical_test), columns=['milage', 'model_year'])

X_test = pd.concat([numerical_test_norm, pd.DataFrame(ohe_test)], axis=1)

predictions = model.predict(X_test)


# submission file

print('Generating submission.csv file...')

# Write the submission file. The value column is named after the target the
# model was trained on (`price`) instead of the previous `label`, and the
# predictions are written as floats rather than being truncated to integers.
# NOTE(review): header assumed to match the competition's sample submission
# (`id,price`) - confirm against sample_submission.csv.
submission = pd.DataFrame({'id': test_ids, 'price': predictions.flatten()})
submission.to_csv('submission.csv', index=False)

# Look at the first few lines of the written file (header row plus the first
# predictions). `!` is IPython's shell escape, so this needs a `head` command
# on the host.
!head submission.csv