# 0. Meta

## 0.1. Packages

In [10]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

## 0.2. Functions

# 1. Data Import

In [11]:
# Read the raw CSV into a DataFrame; the path is relative to the notebook's working directory.
df_raw = pd.read_csv(filepath_or_buffer="../data/raw/autoscout24.csv")

A copy of the raw data is created so that all pre-processing changes are applied to the copy (`df_interim`) while the original raw data (`df_raw`) remains unchanged.

In [12]:
# Independent (deep) copy: later edits to df_interim do not touch df_raw.
df_interim = df_raw.copy(deep=True)

Create lists storing target ('price'), as well as continuous ('mileage', 'hp', 'year'), ordinal ('offerType') and nominal ('make', 'model', 'fuel', 'gear') features.

In [13]:
# Column-name groups used by the pre-processing steps below.
target = ["price"]                                  # prediction target
continuous_features = ["mileage", "hp", "year"]     # continuous features
ordinal_features = ["offerType"]                    # ordinal (ranked) categorical feature
nominal_features = ["make", "model", "fuel", "gear"]  # nominal (unranked) categorical features

# 2. Data Cleaning

## 2.1. Missing Entries 

Initial data exploration revealed that the number of missing entries is negligible. Rows containing missing values are therefore removed, on the assumption that dropping this small proportion has only a minimal impact on the overall quality of the dataset.

In [14]:
# Drop every row that contains at least one missing value and renumber the
# index from 0 (ignore_index=True).
# The result is reassigned instead of using `inplace=True`, so the step is an
# ordinary expression that can be chained and inspected; the resulting frame
# is the same.
df_interim = df_interim.dropna(ignore_index=True)

# 3. Categorical Feature Encoding

## 3.1. Ordinal Encoding

Initial data exploration indicates that the "offerType" variable follows an ordinal scale, with the following hierarchical order: 'Used' < "Employee's car" < 'Demonstration' < 'Pre-registered' < 'New'. Thus, ordinal encoding will be applied accordingly.

In [15]:
# Ranking of the 'offerType' categories, lowest first; the encoder assigns
# codes according to each category's position in this list.
offer_type_order = ["Used", "Employee's car", "Demonstration", "Pre-registered", "New"]
ordinal_encoder = OrdinalEncoder(categories=[offer_type_order])

# Keep the un-encoded 'offerType' column in a separate DataFrame
df_ordinal_full = pd.DataFrame(df_interim[ordinal_features])

# Fit the encoder, then overwrite the column in df_interim with its encoded values
# (no new DataFrame is created here; df_interim itself is updated).
ordinal_encoder.fit(df_interim[ordinal_features])
df_interim[ordinal_features] = ordinal_encoder.transform(df_interim[ordinal_features])


## 3.2. One Hot Encoding

Perform one hot encoding for the nominal features.

In [16]:
print(f"Shape of data before One Hot Encoding: {df_interim.shape}")

# Encode only the nominal columns that are still present in df_interim.
# The previous guard used `any(...)` but then indexed with the full
# `nominal_features` list, which raises a KeyError when only some of the
# columns exist. Selecting the present subset keeps the cell safe to re-run
# (nothing left to encode -> no-op) and handles the partial case.
nominal_present = [col for col in nominal_features if col in df_interim.columns]

if nominal_present:
    # New DataFrame holding only the one-hot (0/1 integer) indicator columns
    df_one_hot = pd.get_dummies(df_interim[nominal_present], columns=nominal_present, dtype=int)

    # Remove the original nominal columns, then append the indicator columns
    # side by side (rows are aligned on the index)
    df_interim = df_interim.drop(columns=nominal_present)
    df_interim = pd.concat([df_interim, df_one_hot], axis=1)

print(f"Shape of data after One Hot Encoding: {df_interim.shape}")

Shape of data before One Hot Encoding: (46071, 9)
Shape of data after One Hot Encoding: (46071, 925)


# 4. Train Test Split

# 5. Continuous Variable Transformation

# 6. Feature Scaling