# Import everything needed



In [1]:
import json
import pickle
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


# Data preparation
If you want to use your own wristwatch data, see my GitHub. It contains all the Python scripts you need to prepare the data so that it is ready to use here.

For now, we will stick to the already prepared data, which is available here (in the `sample_data` folder).

In [2]:
# Read the prepared wristwatch records from JSON and load them into a DataFrame.
with open(
    "sample_data/converted_wristwatch_data.json", mode="r", encoding="utf-8"
) as f:
    data = json.load(f)

df = pd.DataFrame(data)

### Define features for the training

In [3]:
# Numeric columns: copied into the feature matrix without any encoding.
numeric_features = [
    "Vodotěsnost",
    "Rozměr pouzdra",
    "Hmotnost",
]

# Columns whose cells are treated as lists of labels (multi-hot encoded).
multi_label_features = [
    "Funkce",
]

# Columns reduced to one label per row and then one-hot encoded.
single_label_features = [
    "Značka",
    "Určení",
    "Pohon",
    "Materiál pouzdra",
    "Materiál sklíčka",
    "Číselník",
    "Tvar pouzdra",
    "Baterie",
    "Původ",
    "Strojek",
    "Akumulátor",
    "Barva náramku",
    "Barva číselníku",
]

### Clean and normalize

In [4]:
def _as_label_list(value):
    """Return ``value`` unchanged if it is a list, otherwise an empty list."""
    if isinstance(value, list):
        return value
    return []


def _first_label(value):
    """Return the first item of a non-empty list, otherwise ``None``."""
    if isinstance(value, list) and value:
        return value[0]
    return None


# Multi-label columns: every cell must be a list, so non-list cells
# (e.g. missing values) are replaced by an empty list.
for col in multi_label_features:
    df[col] = df[col].apply(_as_label_list)

# Single-label columns: keep only the first list item; empty lists and
# non-list cells become None.
for col in single_label_features:
    df[col] = df[col].apply(_first_label)


### Use MultiLabelBinarizer to encode the multi-label features

In [5]:
# Multi-hot encode "Funkce": one 0/1 column per distinct label found in the lists.
mlb_functions = MultiLabelBinarizer()
functions_encoded = pd.DataFrame(
    mlb_functions.fit_transform(df["Funkce"]),
    columns=mlb_functions.classes_,
    # Reuse df's index so the later column-wise pd.concat pairs every encoded
    # row with its source row. Without it this frame gets a fresh 0..n-1 index
    # and would misalign (introducing NaN rows) whenever df's index differs.
    index=df.index,
)


### Fill the empty records with a placeholder
Missing values are replaced with the placeholder "Neznámé" ("Unknown") to ensure OneHotEncoder can handle them properly.

In [6]:
# Give every missing single-label value an explicit placeholder category.
for col in single_label_features:
    df[col] = df[col].fillna("Neznámé")


### Use OneHotEncoder to encode the single-label features

In [7]:
# Fit the encoder on all single-label columns. With handle_unknown="ignore",
# a category not seen during fitting is encoded as all zeros at transform
# time instead of raising an error.
ohe = OneHotEncoder(handle_unknown="ignore")
encoded_array = ohe.fit_transform(df[single_label_features])

# Wrap the result in a DataFrame that shares df's index, with one named
# column per (feature, category) pair.
ohe_column_names = ohe.get_feature_names_out(single_label_features)
encoded_single_labels = pd.DataFrame(
    data=encoded_array.toarray(),  # sparse matrix -> dense array
    index=df.index,
    columns=ohe_column_names,
)

### Merge the input features all together and set target

In [8]:
# Feature matrix: numeric columns, then the multi-hot "Funkce" columns, then
# the one-hot single-label columns, joined side by side on the row index.
feature_blocks = [
    df[numeric_features],
    functions_encoded,
    encoded_single_labels,
]
X = pd.concat(feature_blocks, axis="columns")

# Regression target: the "Cena" column.
y = df["Cena"]

### Split the data

In [10]:
# Hold out 20 % of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the Model
You can change `n_estimators` if you want; 100 worked best for me.

In [16]:
# Hyperparameters in one place; the fixed seed makes training reproducible.
forest_params = {
    "n_estimators": 100,
    "random_state": 42,
}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

# Evaluation
We can see how well our model did on the test data.

In [17]:
# Predict on the held-out rows and score the predictions against the truth.
test_predictions = model.predict(X_test)

test_mae = mean_absolute_error(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_r2 = r2_score(y_test, test_predictions)

# One metric per line, rounded to two decimals.
print(
    f"Testing Mean Absolute Error: {test_mae:.2f}",
    f"Testing Mean Squared Error: {test_mse:.2f}",
    f"Testing R2 Score: {test_r2:.2f}",
    sep="\n",
)

Testing Mean Absolute Error: 1617.22
Testing Mean Squared Error: 24891634.78
Testing R2 Score: 0.92


# Export the model using pickle library
We also need to save/export the used encoders.

In [25]:
# Bundle the fitted model with everything needed to rebuild the same feature
# matrix later: the column order, the feature-name lists and both fitted
# encoders.
model_data = {
    "model": model,
    "features": list(X.columns),
    "numeric_features": numeric_features,
    "categorical_features": single_label_features,
    "mlb_funkce": mlb_functions,
    "ohe": ohe
}

# Create the output directory first: opening a file for writing inside a
# missing directory raises FileNotFoundError (e.g. on a fresh clone, because
# git does not track empty directories).
model_path = Path("trained models") / "rfr_wristwatch_price_model.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, "wb") as f:
    pickle.dump(model_data, f)