# Load Packages

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import layers

tf.__version__

ModuleNotFoundError: No module named 'tensorflow'

In [3]:
# import dataset 
raw_dataset = pd.read_csv("car_prices.csv", on_bad_lines="skip")


In [4]:
df = raw_dataset.copy()

In [5]:
df.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,"kia motors america, inc",20500,21500,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,"kia motors america, inc",20800,21500,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,4.5,1331.0,gray,black,financial services remarketing (lease),31900,30000,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,4.1,14282.0,white,black,volvo na rep/world omni,27500,27750,Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,4.3,2641.0,gray,black,financial services remarketing (lease),66000,67000,Thu Dec 18 2014 12:30:00 GMT-0800 (PST)


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558811 entries, 0 to 558810
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          558811 non-null  int64  
 1   make          548510 non-null  object 
 2   model         548412 non-null  object 
 3   trim          548160 non-null  object 
 4   body          545616 non-null  object 
 5   transmission  493458 non-null  object 
 6   vin           558811 non-null  object 
 7   state         558811 non-null  object 
 8   condition     547017 non-null  float64
 9   odometer      558717 non-null  float64
 10  color         558062 non-null  object 
 11  interior      558062 non-null  object 
 12  seller        558811 non-null  object 
 13  mmr           558811 non-null  int64  
 14  sellingprice  558811 non-null  int64  
 15  saledate      558811 non-null  object 
dtypes: float64(2), int64(3), object(11)
memory usage: 68.2+ MB


In [7]:
# drop column with too many missing values
df = df.drop(['transmission'], axis=1)

In [8]:
# drop remaining row with one missing value
df = df.dropna()

In [9]:
# Drop irrelevant features
df = df.drop(['trim', 'vin', 'mmr', 'saledate'], axis=1)

In [10]:
# rename columns
df = df.rename(columns={
"make" : "brand",
"body" : "type",
"odometer" : "miles"} 
    )

In [11]:
# transform into lowercase
df["brand"] = df["brand"].str.lower()
df["model"] = df["model"].str.lower()
df["type"] = df["type"].str.lower()

## Define label

In [12]:
y_label = 'sellingprice'

## Data format

In [13]:
# Make a dictionary with int64 featureumns as keys and np.int32 as values
int_32 = dict.fromkeys(df.select_dtypes(np.int64).columns, np.int32)
# Change all columns from dictionary
df = df.astype(int_32)

# Make a dictionary with float64 columns as keys and np.float32 as values
float_32 = dict.fromkeys(df.select_dtypes(np.float64).columns, np.float32)
df = df.astype(float_32)

In [14]:
int_32

{'year': numpy.int32, 'sellingprice': numpy.int32}

In [15]:
# Convert to categorical

# make a list of all categorical variables
cat_convert = ["brand", "model", "type", "state", "color", "interior", "seller"]

# convert variables
for i in cat_convert:
    df[i] = df[i].astype("string")

In [16]:
# Convert to category
df['year'] = df['year'].astype("category")
df['condition'] = df['condition'].astype("category")

In [17]:
# Make list of all numerical data (except label)
list_num = df.drop(columns=[y_label]).select_dtypes(include=[np.number]).columns.tolist()

# Make list of all categorical data which is stored as integers (except label)
list_cat_int = df.drop(columns=[y_label]).select_dtypes(include=['category']).columns.tolist()

# Make list of all categorical data which is stored as string (except label)
list_cat_string = df.drop(columns=[y_label]).select_dtypes(include=['string']).columns.tolist()

In [18]:
list_num

['miles']

In [19]:
list_cat_int

['year', 'condition']

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533660 entries, 0 to 558810
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   year          533660 non-null  category
 1   brand         533660 non-null  string  
 2   model         533660 non-null  string  
 3   type          533660 non-null  string  
 4   state         533660 non-null  string  
 5   condition     533660 non-null  category
 6   miles         533660 non-null  float32 
 7   color         533660 non-null  string  
 8   interior      533660 non-null  string  
 9   seller        533660 non-null  string  
 10  sellingprice  533660 non-null  int32   
dtypes: category(2), float32(1), int32(1), string(7)
memory usage: 37.7 MB


In [21]:
df.head()

Unnamed: 0,year,brand,model,type,state,condition,miles,color,interior,seller,sellingprice
0,2015,kia,sorento,suv,ca,5.0,16639.0,white,black,"kia motors america, inc",21500
1,2015,kia,sorento,suv,ca,5.0,9393.0,white,beige,"kia motors america, inc",21500
2,2014,bmw,3 series,sedan,ca,4.5,1331.0,gray,black,financial services remarketing (lease),30000
3,2015,volvo,s60,sedan,ca,4.1,14282.0,white,black,volvo na rep/world omni,27750
4,2014,bmw,6 series gran coupe,sedan,ca,4.3,2641.0,gray,black,financial services remarketing (lease),67000


In [29]:
#df["seller"].unique()

<StringArray>
[                           'kia motors america, inc',
             'financial services remarketing (lease)',
                            'volvo na rep/world omni',
 'enterprise vehicle exchange / tra / rental / tulsa',
                              'the hertz corporation',
                                 'audi mission viejo',
                                 'd/m auto sales inc',
                                  'desert auto trade',
                                 'kia motors finance',
                              'audi north scottsdale',
 ...
                          'larry h. miller chevrolet',
                            'eynon pontiac buick inc',
                                'auto revolution llc',
                            't&s california auto inc',
                                  'magnum motors llc',
                                 'a-1 auto group llc',
                        'g brothers auto brokers inc',
                         'maserati north ameri

## Data Splitting

In [21]:
# Make validation data
df_val = df.sample(frac=0.2, random_state=1337)

# Create training data
df_train = df.drop(df_val.index)

In [22]:
print(
    "Using %d samples for training and %d for validation"
    % (len(df_train), len(df_val))
)

Using 426928 samples for training and 106732 for validation


## Transform to Tensors

In [23]:
# Define a function to create our tensors

def dataframe_to_dataset(dataframe, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels = df.pop(y_label) #y_label rausziehen und löschen
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels)) #ds für tensoren
    if shuffle:
        ds = ds.shuffle(buffer_size=10000) #len(dataframe)
    ds = ds.batch(batch_size)
    df = ds.prefetch(batch_size)
    return ds

In [24]:
batch_size = 32

ds_train = dataframe_to_dataset(df_train, shuffle=True, batch_size=batch_size)
ds_val = dataframe_to_dataset(df_val, shuffle=True, batch_size=batch_size)

2022-05-11 13:20:06.340603: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  raise TypeError("Could not build a `TypeSpec` for {} with type {}".format(
  raise TypeError("Could not build a `TypeSpec` for {} with type {}".format(


In [25]:
ds_train

<BatchDataset shapes: ({year: (None,), brand: (None,), model: (None,), type: (None,), state: (None,), condition: (None,), miles: (None,), color: (None,), interior: (None,), seller: (None,)}, (None,)), types: ({year: tf.int64, brand: tf.string, model: tf.string, type: tf.string, state: tf.string, condition: tf.float64, miles: tf.float32, color: tf.string, interior: tf.string, seller: tf.string}, tf.int32)>

# Feature preprocessing
### Numerical preprocessing function

In [26]:
# Define numerical preprocessing function
def get_normalization_layer(name, dataset):
    
    # Create a Normalization layer for our feature
    normalizer = layers.Normalization(axis=None)

    # Prepare a dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the statistics of the data
    normalizer.adapt(feature_ds)

    # Normalize the input feature
    return normalizer

### Categorical preprocessing function

In [27]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens) #, output_mode='multi_hot'

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))

### Data preprocessing

In [28]:
all_inputs = []
encoded_features = []

### Numercial preprocessing

In [29]:
# Numerical features
for feature in list_num:
  numeric_feature = tf.keras.Input(shape=(1,), name=feature)
  normalization_layer = get_normalization_layer(feature, ds_train)
  encoded_numeric_feature = normalization_layer(numeric_feature)
  all_inputs.append(numeric_feature)
  encoded_features.append(encoded_numeric_feature)

In [30]:
encoded_features

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization')>]

### Categorical preprocessing

In [31]:
for feature in list_cat_int:
  categorical_feature = tf.keras.Input(shape=(1,), name=feature, dtype='int32')
  encoding_layer = get_category_encoding_layer(name=feature,
                                               dataset=ds_train,
                                               dtype='int32',
                                               max_tokens=None)
  encoded_categorical_feature = encoding_layer(categorical_feature)
  all_inputs.append(categorical_feature)
  encoded_features.append(encoded_categorical_feature)

In [32]:
for feature in list_cat_string:
  categorical_feature = tf.keras.Input(shape=(1,), name=feature, dtype='string')
  encoding_layer = get_category_encoding_layer(name=feature,
                                               dataset=ds_train,
                                               dtype='string',
                                               max_tokens=None)
  encoded_categorical_feature = encoding_layer(categorical_feature)
  all_inputs.append(categorical_feature)
  encoded_features.append(encoded_categorical_feature)

In [33]:
#Merge
all_features = layers.concatenate(encoded_features)

In [34]:
all_features


<KerasTensor: shape=(None, 12699) dtype=float32 (created by layer 'concatenate')>

In [35]:
# First layer
x = layers.Dense(32, activation="relu")(all_features)

# Dropout to prevent overvitting - soll sich auf die wichtigsten konzentrieren
x = layers.Dropout(0.5)(x)

# Output layer
output = layers.Dense(1)(x) #sigmoid nur für Classifikation // bei regression keine activation

# Group all layers 
model = tf.keras.Model(all_inputs, output)

In [36]:
model.compile(optimizer="adam", 
              loss ="mse", 
              metrics=["mean_absolute_error"])
              
              #regression Metrics verwenden!!!!

In [37]:
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


## Training

In [40]:
model.fit(ds_train, epochs=50, validation_data=ds_val)
#Anzahl der Epochen: sobald val_accuracy nicht mehr gesteigert werden kann
#4 Epochen sind genug

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f9de085ae80>

In [41]:
#im "echten" die testdaten nehmen
loss, accuracy = model.evaluate(ds_val)

print("MAE", round(accuracy, 2))

MAE 1814.64


## Perform inference

In [42]:
model.save('my_car_model-mean-absolute')

2022-05-11 14:16:36.616569: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: my_car_model-mean-absolute/assets


In [43]:
reloaded_model = tf.keras.models.load_model('my_car_model-mean-absolute')

In [44]:
df.head()

Unnamed: 0,year,brand,model,type,state,condition,miles,color,interior,seller,sellingprice
0,2015,kia,sorento,suv,ca,5.0,16639.0,white,black,"kia motors america, inc",21500
1,2015,kia,sorento,suv,ca,5.0,9393.0,white,beige,"kia motors america, inc",21500
2,2014,bmw,3 series,sedan,ca,4.5,1331.0,gray,black,financial services remarketing (lease),30000
3,2015,volvo,s60,sedan,ca,4.1,14282.0,white,black,volvo na rep/world omni,27750
4,2014,bmw,6 series gran coupe,sedan,ca,4.3,2641.0,gray,black,financial services remarketing (lease),67000


In [49]:
sample = {
    "year": 2015,
    "brand": "kia",
    "model": "sorento",
    "type": "suv",
    "state": "ca",
    "condition": 5.0,
    "miles": 9393.0,
    "color": "white",
    "interior": "black",
    "seller": "kia motors america, inc",
}

In [50]:
input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}

In [51]:
predictions = reloaded_model.predict(input_dict)

In [52]:
predictions

array([[21473.633]], dtype=float32)


```{toctree}
:hidden:
:titlesonly:


05_deployment
```
