<a href="https://colab.research.google.com/github/LuGor17/product-category-classifier/blob/main/analysis_and_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

df = pd.read_csv("products.csv")
df.head()

Unnamed: 0,product ID,Product Title,Merchant ID,Category Label,_Product Code,Number_of_Views,Merchant Rating,Listing Date
0,1,apple iphone 8 plus 64gb silver,1.0,Mobile Phones,QA-2276-XC,860.0,2.5,5/10/2024
1,2,apple iphone 8 plus 64 gb spacegrau,2.0,Mobile Phones,KA-2501-QO,3772.0,4.8,12/31/2024
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3.0,Mobile Phones,FP-8086-IE,3092.0,3.9,11/10/2024
3,4,apple iphone 8 plus 64gb space grey,4.0,Mobile Phones,YI-0086-US,466.0,3.4,5/2/2022
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5.0,Mobile Phones,NZ-3586-WP,4426.0,1.6,4/12/2023


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29231 entries, 0 to 29230
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   product ID       29231 non-null  int64  
 1   Product Title    29092 non-null  object 
 2   Merchant ID      29230 non-null  float64
 3    Category Label  29192 non-null  object 
 4   _Product Code    29154 non-null  object 
 5   Number_of_Views  29219 non-null  float64
 6   Merchant Rating  29092 non-null  float64
 7    Listing Date    29179 non-null  object 
dtypes: float64(3), int64(1), object(4)
memory usage: 1.8+ MB


In [3]:
df.isnull().sum()

Unnamed: 0,0
product ID,0
Product Title,139
Merchant ID,1
Category Label,39
_Product Code,77
Number_of_Views,12
Merchant Rating,139
Listing Date,52


In [4]:
df["Product Title"] = (
    df["Product Title"]
    .astype(str)
    .str.lower()
    .str.replace(r"[^a-z0-9 ]", " ", regex=True)
)

In [8]:
df = df.dropna(subset=[" Category Label"])

In [9]:
df["title_length"] = df["Product Title"].apply(len)
df["word_count"] = df["Product Title"].apply(lambda x: len(x.split()))
df["has_number"] = df["Product Title"].str.contains(r"\d", regex=True).astype(int)
df["longest_word"] = df["Product Title"].apply(lambda x: max(len(w) for w in x.split()))

In [11]:
X = df["Product Title"]
y = df[" Category Label"]

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

model = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        max_features=20000
    )),
    ("clf", RandomForestClassifier(n_estimators=300))
])

In [15]:
model.fit(X_train, y_train)

In [16]:
from sklearn.metrics import classification_report, accuracy_score

preds = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

Accuracy: 0.9623223154649769
                  precision    recall  f1-score   support

             CPU       0.00      0.00      0.00        17
            CPUs       0.98      0.99      0.98       754
 Digital Cameras       1.00      0.99      0.99       539
     Dishwashers       0.95      0.94      0.95       684
        Freezers       0.94      0.92      0.93       442
 Fridge Freezers       0.97      0.94      0.96       600
      Microwaves       0.98      0.96      0.97       468
    Mobile Phone       0.00      0.00      0.00        11
   Mobile Phones       0.96      0.99      0.97       804
             TVs       0.96      0.98      0.97       713
Washing Machines       0.94      0.96      0.95       807

        accuracy                           0.96      5839
       macro avg       0.79      0.79      0.79      5839
    weighted avg       0.96      0.96      0.96      5839



In [17]:
import pickle

with open("product_category_model.pkl", "wb") as f:
    pickle.dump(model, f)