In [1]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
# Input: gzip-compressed JSON file that is loaded into the `products` DataFrame.
products_path = "../data/raw/products.json.gz"

# Output folder and file names for the pre-processed text and label CSVs.
data_pre_processing_folder = "../data/pre-processing/"
data_pre_processing_name = "data.csv"
label_pre_processing_name = "label.csv"

# Destination of the pickled LabelEncoder.
label_encoder_path = "../models/encoder/label_encoder.pkl"

In [3]:
# Read the gzip-compressed JSON file at `products_path` into a DataFrame.
products = pd.read_json(
    products_path,
    compression="gzip",
)

In [4]:
# Each "category" value is indexed as a sequence of mappings; keep the "id" of
# the last entry (presumably the most specific category — TODO confirm).
products["category_id"] = products["category"].map(
    lambda category_path: category_path[-1]["id"]
)

In [5]:
# Categories with fewer than `min_value` products are relabelled as `other_cat_value`.
min_value = 100
other_cat_value = "other"

# Placeholder written into missing feature values.
fill_nan_values_string = "null"

In [6]:
# Absolute number of products per category id.
unique_category = products["category_id"].value_counts()

# Keep only the categories with at least `min_value` products.
valid_category = unique_category.loc[unique_category.ge(min_value)]

In [7]:
# Keep the category id as the label when it is a valid (frequent enough)
# category; every other row gets the catch-all label.
products["label"] = products["category_id"].where(
    products["category_id"].isin(valid_category.index),
    other=other_cat_value,
)

In [8]:
# Text columns that are cleaned and later concatenated into the model input.
features = [
    "name",
    "description",
]

In [9]:
# Print each feature's dtype, then replace its missing values with the
# placeholder string.
# The loop variable is `dtype` rather than `type`: notebook cells run in the
# global namespace, so a loop variable named `type` would keep shadowing the
# builtin for every cell executed afterwards.
for feature, dtype in products[features].dtypes.items():
    print(feature, dtype)
    products[feature] = products[feature].fillna(fill_nan_values_string)

name object
description object


In [10]:
# Model input: product name and description joined by a single space.
products["text"] = products["name"].str.cat(products["description"], sep=" ")

In [11]:
# Encoder used to turn the label values into integer class codes.
le = LabelEncoder()

In [12]:
# Learn the set of classes from the label column (valid category ids plus
# the `other_cat_value` bucket).
le.fit(products["label"])

In [13]:
# Replace the label values with their codes from the fitted encoder.
# NOTE(review): this overwrites the column that `le` was fitted on, so
# re-running this cell on its own would pass already-encoded values to
# `transform` — re-run from the cell that builds "label" instead.
products["label"] = le.transform(products["label"])

In [14]:
# Split the frame into the model input (text) and the encoded target.
data, label = products["text"], products["label"]

In [15]:
# Display both shapes side by side.
(data.shape, label.shape)

((51646,), (51646,))

In [16]:
# Sanity checks before export: same number of rows, no missing values.
assert len(data) == len(label)
assert not data.isna().any()
assert not label.isna().any()

In [17]:
# Create the output folder if it is missing. `exist_ok=True` replaces the
# separate existence check, which could race with another process creating
# the folder between the check and the call.
os.makedirs(data_pre_processing_folder, exist_ok=True)

# os.path.join keeps the paths valid whether or not the folder name ends with
# a separator. index=False writes only the header and values, not the row index.
data.to_csv(
    os.path.join(data_pre_processing_folder, data_pre_processing_name),
    index=False,
)
label.to_csv(
    os.path.join(data_pre_processing_folder, label_pre_processing_name),
    index=False,
)

In [18]:
# Make sure the destination folder exists before writing; opening the file
# would otherwise fail on a fresh checkout where the folder is absent.
encoder_dir = os.path.dirname(label_encoder_path)
if encoder_dir:
    os.makedirs(encoder_dir, exist_ok=True)

# Persist the fitted encoder. The context manager closes the file even if
# pickling raises, so the handle is never leaked.
with open(label_encoder_path, "wb") as output:
    pickle.dump(le, output)