In [1]:
import tensorflow as tf

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
from tensorflow.keras.layers import Normalization, Dense, InputLayer
from tensorflow.keras.losses import MeanSquaredError, Huber, MeanAbsoluteError
from tensorflow.keras.metrics import RootMeanSquaredError,

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/ManonYa09/Machine-_Learning_G4/refs/heads/main/Dataset/train%202.csv')

In [5]:
df.head()

Unnamed: 0,v.id,on road old,on road now,years,km,rating,condition,economy,top speed,hp,torque,current price
0,1,535651,798186,3,78945,1,2,14,177,73,123,351318.0
1,2,591911,861056,6,117220,5,9,9,148,74,95,285001.5
2,3,686990,770762,2,132538,2,8,15,181,53,97,215386.0
3,4,573999,722381,4,101065,4,3,11,197,54,116,244295.5
4,5,691388,811335,6,61559,3,9,12,160,53,105,531114.5


In [6]:
# sns.pairplot(df[['years', 'km', 'rating', 'condition', 'economy', 'top speed', 'hp', 'torque', 'current price']], diag_kind='kde')

In [7]:
tf.constant(df)

<tf.Tensor: shape=(1000, 12), dtype=float64, numpy=
array([[1.000000e+00, 5.356510e+05, 7.981860e+05, ..., 7.300000e+01,
        1.230000e+02, 3.513180e+05],
       [2.000000e+00, 5.919110e+05, 8.610560e+05, ..., 7.400000e+01,
        9.500000e+01, 2.850015e+05],
       [3.000000e+00, 6.869900e+05, 7.707620e+05, ..., 5.300000e+01,
        9.700000e+01, 2.153860e+05],
       ...,
       [9.980000e+02, 6.463440e+05, 8.427330e+05, ..., 1.130000e+02,
        8.900000e+01, 4.058710e+05],
       [9.990000e+02, 5.355590e+05, 7.324390e+05, ..., 1.120000e+02,
        1.280000e+02, 7.439800e+04],
       [1.000000e+03, 5.901050e+05, 7.797430e+05, ..., 9.900000e+01,
        9.600000e+01, 4.149385e+05]])>

In [8]:
df1 = tf.random.shuffle(df )

In [9]:
y = df1[:, -1]

In [10]:
x = df1[:, 3:-1]

In [11]:
x

<tf.Tensor: shape=(1000, 8), dtype=float64, numpy=
array([[2.00000e+00, 7.16460e+04, 3.00000e+00, ..., 1.67000e+02,
        6.10000e+01, 1.36000e+02],
       [3.00000e+00, 1.32274e+05, 4.00000e+00, ..., 1.47000e+02,
        5.40000e+01, 1.00000e+02],
       [5.00000e+00, 7.60960e+04, 5.00000e+00, ..., 1.62000e+02,
        9.10000e+01, 8.30000e+01],
       ...,
       [5.00000e+00, 9.59210e+04, 2.00000e+00, ..., 1.95000e+02,
        1.01000e+02, 9.60000e+01],
       [5.00000e+00, 5.85480e+04, 3.00000e+00, ..., 1.82000e+02,
        1.08000e+02, 8.50000e+01],
       [7.00000e+00, 7.06000e+04, 1.00000e+00, ..., 1.61000e+02,
        5.30000e+01, 7.50000e+01]])>

### Cross Vidalidation with Tensorflow

In [13]:
train_tatio = 0.8
val_ratio = 0.1
test_ratio = 0.1
dataset_size = len(x)

### 1. Data Preprocessing

#### 1.1 Prepare data

##### 1.1.1 Data Training

- `tf.data.Dataset.from_tensor_slices` slices the input tensors (or arrays) element-wise to create a dataset.
- Each slice corresponds to one training example (x, y) where x is a feature vector, and y is its label.
- It is commonly used to load data into TensorFlow models in an efficient and organized way.

`batch(32)` :

- Purpose: Combines consecutive elements of the dataset into batches of size 32.
Why?:
    - Training models with batches allows for more efficient computation (parallel processing on GPUs/TPUs).
    - It also balances computational speed and memory usage.


`prefetch(tf.data.AUTOTUNE)`

- Purpose: Optimizes the data pipeline by preloading the next batch of data while the current batch is being processed by the model.
- tf.data.AUTOTUNE: Automatically determines the optimal number of batches to prefetch based on system performance (CPU, GPU, etc.).
- This ensures that the model is not waiting for data to be loaded, keeping the GPU/TPU utilized.

In [20]:
# Correctly splitting the data for train and validation sets
x_train = x[:int(train_tatio * dataset_size)]
y_train = y[:int(train_tatio * dataset_size)]
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=8, reshuffle_each_iteration=True).batch(32).prefetch(tf.data.AUTOTUNE)

# Correct validation split


##### 1.1.2 Data Validation

In [22]:
x_val = x[int(train_tatio * dataset_size):int((train_tatio + val_ratio) * dataset_size)]
y_val = y[int(train_tatio * dataset_size):int((train_tatio + val_ratio) * dataset_size)]
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.shuffle(buffer_size=8, reshuffle_each_iteration=True).batch(32).prefetch(tf.data.AUTOTUNE)

##### 1.1.3 data Testing

In [24]:
x_test = x[int(dataset_size*(train_tatio+val_ratio)):]
y_test = y[int(dataset_size*(train_tatio+val_ratio)):]
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_dataset.shuffle(buffer_size = 8, reshuffle_each_iteration = True).batch(32).prefetch(tf.data.AUTOTUNE)

#### 1.2 Data Scaling with Standard Scaling

In [3]:
normalizer = Normalization()
normalizer.adapt(x_train)

NameError: name 'Normalization' is not defined

### Model creation and training

In [28]:
model = tf.keras.Sequential([
                             InputLayer(input_shape = (8,)),
                             normalizer,
                             Dense(128, activation = "relu"),
                             Dense(128, activation = "relu"),
                             Dense(128, activation = "relu"),
                             Dense(1),])

In [29]:
# tf.keras.utils.plot_model(model, to_file = "model.png", show_shapes=True)

In [30]:
model.compile(loss='mean_absolute_error',
              metrics=['root_mean_squared_error'])

In [31]:
val_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 8), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>

In [32]:
train_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 8), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>

In [33]:
# train_dataset.shape

In [34]:
type(train_dataset)

tensorflow.python.data.ops.prefetch_op._PrefetchDataset

In [35]:
type(val_dataset)

tensorflow.python.data.ops.prefetch_op._PrefetchDataset

An `epoch` consists of passing a dataset through the algorithm completely. Each `Epoch` consists of many weight update steps.

In [36]:
model.fit(train_dataset, validation_data=val_dataset, epochs=300)

Epoch 1/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 311379.1875 - root_mean_squared_error: 337048.9062 - val_loss: 303070.0312 - val_root_mean_squared_error: 328407.5938
Epoch 2/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 443us/step - loss: 311120.1250 - root_mean_squared_error: 336804.4375 - val_loss: 302859.0938 - val_root_mean_squared_error: 328213.3438
Epoch 3/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 509us/step - loss: 310799.2188 - root_mean_squared_error: 336451.8438 - val_loss: 302410.7188 - val_root_mean_squared_error: 327801.1250
Epoch 4/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 481us/step - loss: 310964.3438 - root_mean_squared_error: 336638.6250 - val_loss: 301624.0000 - val_root_mean_squared_error: 327077.5625
Epoch 5/300
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 445us/step - loss: 310009.9062 - root_mean_squared_error: 335860.6562 - va

<keras.src.callbacks.history.History at 0x31a8e2810>