In [None]:
import pickle
from pathlib import Path

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Read the CSV at data/products.csv into a DataFrame; the bare `head()` call
# at the end of the cell renders a preview of the first rows.
products_csv = "data/products.csv"
df = pd.read_csv(products_csv)
df.head()


In [None]:
# Keep only the text feature ('Product Title') and the target
# ('Category Label'), drop rows where either is missing, and lower-case the
# titles.
#
# Written as a single non-mutating chain: selecting columns and then calling
# `dropna(inplace=True)` / assigning a column on the selection modifies a
# derived frame, which triggers SettingWithCopyWarning on pandas versions
# without copy-on-write. `dropna()` and `assign()` each return a new frame, so
# nothing is mutated in place and the cell gives the same result when re-run.
df = (
    df[['Product Title', 'Category Label']]
    .dropna()
    # The column name contains a space, so it is passed to `assign` via a dict.
    .assign(**{'Product Title': lambda frame: frame['Product Title'].str.lower()})
)

df.head()


In [None]:
# Feature (product title text) and target (category label) columns.
X, y = df['Product Title'], df['Category Label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Text classifier: TF-IDF features over unigrams and bigrams (vocabulary
# capped at 50,000 terms) feeding a logistic regression that is allowed up to
# 1,000 solver iterations.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)
clf_step = LogisticRegression(max_iter=1000)
model = Pipeline(steps=[('tfidf', tfidf_step), ('clf', clf_step)])

# Fit vectorizer and classifier together on the training split.
model.fit(X_train, y_train)


In [None]:
# Score the fitted pipeline on the held-out split.
predictions = model.predict(X_test)

# Overall accuracy first, then the per-class report.
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", test_accuracy)

per_class_report = classification_report(y_test, predictions)
print(per_class_report)


In [None]:
# Quick sanity check: predict the category of a single hand-written title.
sample_titles = ["iphone 7 32gb gold"]
model.predict(sample_titles)


In [None]:
# Persist the fitted pipeline (vectorizer + classifier) to disk.
model_path = Path("models/product_category_model.pkl")

# Create the output directory first: opening the file for writing raises
# FileNotFoundError when `models/` does not exist yet (e.g. a fresh checkout).
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this artifact from a trusted location.
with model_path.open("wb") as f:
    pickle.dump(model, f)
