# Import everything needed

In [13]:
import json
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

# Data preparation
If you want to have your own wristwatch data, feel free to follow see my GitHub. It contains the all Python scripts you need for the data to be fully prepared and ready to use here.

Currently we will stick to already prepared ones which are here (sample_data folder).

In [14]:
with open("sample_data/converted_wristwatch_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)
df = pd.DataFrame(data)

### Define features for the training

In [15]:
numeric_features = ["Vodotěsnost", "Rozměr pouzdra", "Hmotnost"]
multi_label_features = ["Použití", "Funkce"]
single_label_features = [
    "Značka", "Určení", "Pohon", "Materiál pouzdra", "Materiál sklíčka",
    "Číselník", "Tvar pouzdra", "Baterie", "Původ", "Strojek",
    "Akumulátor", "Barva náramku", "Barva číselníku"
]

### Clean and normalize

In [16]:
# Multi-label: ensure they are lists
for col in multi_label_features:
    df[col] = df[col].apply(lambda x: x if isinstance(x, list) else [x])

# Single-label: unwrap single string values from lists
for col in single_label_features:
    df[col] = df[col].apply(lambda x: x[0] if isinstance(x, list) and x else None)

### Use MultiLabelBinazier to encode the Multi-labeled features

In [17]:
mlb_functions = MultiLabelBinarizer()
functions_encoded = pd.DataFrame(mlb_functions.fit_transform(df["Funkce"]), columns=mlb_functions.classes_)

mlb_usage = MultiLabelBinarizer()
usage_encoded = pd.DataFrame(mlb_usage.fit_transform(df["Použití"]), columns=mlb_usage.classes_)

### Fill the empty records with NaNs
To ensure OneHotEncoder can handle them properly.

In [18]:
df[single_label_features] = df[single_label_features].fillna("Neznámé")

### Use OneHotencoder to encode the Single-labeled features

In [19]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_array = ohe.fit_transform(df[single_label_features])

# Convert the encoded array to a DataFrame, ensuring the column names match the features
encoded_single_labels = pd.DataFrame(
    encoded_array,
    columns=ohe.get_feature_names_out(single_label_features),
    index=df.index
)

### Merge the input features all together and set target

In [20]:
X = pd.concat(
    [df[numeric_features], functions_encoded, encoded_single_labels],
    axis=1
)
y = usage_encoded


### Split the data

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Model
You can change the n_estimators if wanted. 100 worked the best for me the same as it was in price prediction model.

In [24]:
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evalutation
We can see how well our model did on the test data.

In [26]:
test_predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, test_predictions)
f1 = f1_score(y_test, test_predictions, average='micro')
report = classification_report(y_test, test_predictions, zero_division=0)

print(f"Testing Accuracy: {accuracy:.2f}")
print(f"Testing F1 Score: {f1:.2f}")
print("Classification Report:\n", report)

Testing Accuracy: 0.93
Testing F1 Score: 0.96
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2
           2       1.00      0.40      0.57        10
           3       1.00      0.50      0.67         4
           4       1.00      1.00      1.00         1
           5       0.00      0.00      0.00         1
           6       0.98      0.91      0.94        55
           7       0.00      0.00      0.00         1
           8       1.00      0.50      0.67         2
           9       0.00      0.00      0.00         1
          10       1.00      0.94      0.97        17
          11       1.00      1.00      1.00         1
          12       0.99      0.98      0.98       274
          13       0.98      0.97      0.98       149
          14       1.00      1.00      1.00         5
          15       1.00      1.00      1.00         2
          1

# Export the model using pickle library
We also need to save/export the used encoders.

In [27]:
model_data = {
    "model": model,
    "features": list(X.columns),
    "numeric_features": numeric_features,
    "categorical_features": single_label_features,
    "mlb_funkce": mlb_functions,
    "mlb_pouziti": mlb_usage,
    "ohe": ohe
}

with open("trained models/rfc_wristwatch_usage_model.pkl", "wb") as f:
    pickle.dump(model_data, f)