In [None]:
import re, string, nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# === Setup NLP tools ===
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(s: str) -> str:
    """Lowercase, remove punctuation, remove stopwords, lemmatize."""
    s = str(s).lower()
    s = re.sub(f"[{re.escape(string.punctuation)}]", " ", s)
    toks = [lemmatizer.lemmatize(w) for w in s.split() if w not in stop_words]
    return " ".join(toks)

# === Load Dataset ===
df = pd.read_csv('/workspace/medhansh/ikarus/intern_data_ikarus.csv')
df.fillna('', inplace=True)

# Combine relevant text fields
text_fields = ['title', 'brand', 'description', 'categories', 'material', 'color']
df['combined_text'] = df[text_fields].astype(str).agg(' '.join, axis=1)

# Clean text using NLP preprocessing
df['cleaned_text'] = df['combined_text'].apply(clean_text)

print("Text cleaned. Example:")
print(df['cleaned_text'].head(3).tolist())

# === Create Semantic Text Embeddings ===
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
text_embeddings = model.encode(df['cleaned_text'].tolist(), show_progress_bar=True, normalize_embeddings=True)

# Save for backend use
np.save('/workspace/medhansh/ikarus/text_embeddings.npy', text_embeddings)
df.to_csv('/workspace/medhansh/ikarus/products_text.csv', index=False)

print(f"\n Generated {text_embeddings.shape[0]} embeddings of dimension {text_embeddings.shape[1]}")

# === NLP Grouping (KMeans Clustering) ===
k = 10  # number of semantic groups
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(text_embeddings)

# View top few items from each cluster
for i in range(k):
    print(f"\nCluster {i}:")
    print(df[df['cluster'] == i]['title'].head(3).tolist())

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Text cleaned. Example:
['goymfk 1pc free standing shoe rack multi layer metal shoe cap rack 8 double hook living room bathroom hallway goymfk multiple shoe coat hat item easy assemble includes necessary hardware instruction easy assembly versatile perfect use living room bathroom hallway home kitchen storage organization clothing closet storage shoe organizer free standing shoe rack metal white', 'subrtex leather ding room dining chair set 2 black subrtex subrtex dining chair set 2 home kitchen furniture dining room furniture chair sponge black', 'plant repotting mat muyetol waterproof transplanting mat indoor 26 8 x 26 8 portable square foldable easy clean gardening work mat soil changing mat succulent plant transplanting mat garden gift muyetol patio lawn garden outdoor décor doormat polyethylene green']



atches: 100%|██████████| 10/10 [00:00<00:00, 43.01it/s]


✅ Generated 312 embeddings of dimension 384

Cluster 0:
['Kingston Brass BA1752BB Heritage 18-Inch Towel-Bar, Brushed Brass', 'Chief Mfg.Swing-Arm Wall Mount Hardware Mount Black (TS218SU)', 'LASCO 35-5019 Hallmack Style 24-Inch Towel Bar Accessory, All Metal Construction, Chrome Plated Finish']

Cluster 1:
['Plant Repotting Mat MUYETOL Waterproof Transplanting Mat Indoor 26.8" x 26.8" Portable Square Foldable Easy to Clean Gardening Work Mat Soil Changing Mat Succulent Plant Transplanting Mat Garden Gifts', 'Pickleball Doormat, Welcome Doormat Absorbent Non-Slip Floor Mat Bathroom Mat 16x24', 'Plant Repotting Mat MUYETOL Waterproof Transplanting Mat Indoor 26.8" x 26.8" Portable Square Foldable Easy to Clean Gardening Work Mat Soil Changing Mat Succulent Plant Transplanting Mat Garden Gifts']

Cluster 2:
['JOIN IRON Foldable TV Trays for Eating Set of 4 with Stand,Folding TV/Snack Tray Table Set,Folding TV Dinner Tables for Small Space,(Grey)', 'JOIN IRON Foldable TV Trays for Eating

In [2]:
# Quick semantic similarity example
query = "modern wooden chair"
q_vec = model.encode([query], normalize_embeddings=True)
similarities = cosine_similarity(q_vec, text_embeddings)[0]
top_idx = np.argsort(similarities)[::-1][:5]
df.iloc[top_idx][['title', 'brand', 'price']]

Unnamed: 0,title,brand,price
158,"Leather At Home, Decorative 13 Inch Rounded Pi...",Leather At Home Store,$26.49
72,Adeco Euro Style Fabric Arm Bench Chair Footst...,Adeco Store,
201,PONTMENT Foot Stool Leather Footstool Solid Wo...,PONTMENT,$95.99
43,Black Leather Office Chair Mid Back Leather De...,Arts wish Store,$89.98
148,AnRui Folding Floor Chair with Adjustable Back...,AnRui Store,$52.99


In [3]:
import os

# --- Clean up any bad hidden unicode characters ---
def clean_env(var):
    v = os.getenv(var, "")
    if v:
        v = v.encode("ascii", "ignore").decode("ascii").strip()
        os.environ[var] = v

# Reset both Pinecone env vars
os.environ["PINECONE_API_KEY"] = "pcsk_23wW6M_2RHGa3MT5rjdyi2q4oLp3MGYrdF2M5KV3xijynkXFV2DgjYWskCr1AgNw7zaPdk".strip()
os.environ["PINECONE_ENV"] = "us-east-1"

clean_env("PINECONE_API_KEY")
clean_env("PINECONE_ENV")


In [None]:
# === Store text embeddings in Pinecone ===

from pinecone import Pinecone, ServerlessSpec
import numpy as np
import pandas as pd
import os

# Load dataset and embeddings
df = pd.read_csv('/workspace/medhansh/ikarus/products_text.csv').fillna('')
text_embeddings = np.load('/workspace/medhansh/ikarus/text_embeddings.npy')

# Connect to Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Define index name
index_name = "ikarus3d"

# Create index if not exists
if index_name not in [i['name'] for i in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=text_embeddings.shape[1],
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=os.getenv("PINECONE_ENV", "us-east-1"))
    )

# Connect to the index
index = pc.Index(index_name)

# Prepare and upload data (batch-wise for safety)
vectors = []
for i, row in df.iterrows():
    meta = {
        "uniq_id": str(row.get("uniq_id", i)),
        "title": str(row.get("title", "")),
        "brand": str(row.get("brand", "")),
        "price": str(row.get("price", "")),
        "categories": str(row.get("categories", "")),
        "material": str(row.get("material", "")),
        "color": str(row.get("color", "")),
    }
    vectors.append({
        "id": meta["uniq_id"],
        "values": text_embeddings[i].tolist(),
        "metadata": meta
    })

# Upload in chunks
batch_size = 100
for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i+batch_size]
    index.upsert(vectors=batch)
    print(f"Uploaded {i+len(batch)}/{len(vectors)} vectors")

print("All embeddings uploaded to Pinecone index:", index_name)

✅ Uploaded 100/312 vectors
✅ Uploaded 200/312 vectors
✅ Uploaded 300/312 vectors
✅ Uploaded 312/312 vectors
🎯 All embeddings uploaded to Pinecone index: ikarus3d


In [6]:
query = "chair"
q_vec = model.encode([query], normalize_embeddings=True)[0]

# Search top 5
result = index.query(vector=q_vec.tolist(), top_k=5, include_metadata=True)

for match in result["matches"]:
    print(f"\nScore: {match['score']:.3f}")
    meta = match["metadata"]
    print(f"Title: {meta['title']}")
    print(f"Brand: {meta['brand']}")
    print(f"Price: {meta['price']}")


Score: 0.540
Title: AnRui Folding Floor Chair with Adjustable Back Support, Comfortable, Semi-Foldable, and Versatile, for Meditation, Seminars, Reading, TV Watching or Gaming, Suitable for Home Or Office
Brand: AnRui Store
Price: $52.99

Score: 0.535
Title: UTONE Gaming Chair Computer Chair Breathable Fabric Office Chair Cloth with Backrest Desk Chair with Footrest, Lumbar Support Swivel Recliner Task Chair Ergonomic Video Game Chair Height Adjustable
Brand: UTONE
Price: $199.99

Score: 0.524
Title: Leather At Home, Decorative 13 Inch Rounded Pillow Handmade from Full Grain Leather - Chair Seat, Confortable Sitting for Round Wooden/Metal Stools - Bourbon Brown
Brand: Leather At Home Store
Price: $26.49

Score: 0.521
Title: BOOSDEN Padded Folding Chair 2 Pack, Foldable Chair with Thick Cushion, Heavy Duty Metal Folding Chair for Outdoor & Indoor & Dining & Party, Red
Brand: BOOSDEN Store
Price: $119.00

Score: 0.521
Title: MoNiBloom Massage Gaming Recliner Chair with Speakers PU Leath

In [None]:
# ============================================
# 1. Imports
# ============================================
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import ast
import numpy as np
import pandas as pd
import requests
from io import BytesIO
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

print("TensorFlow version:", tf.__version__)
print("GPU available:", tf.config.list_physical_devices('GPU'))

# ============================================
# 2. Load Dataset
# ============================================
df = pd.read_csv("/workspace/medhansh/ikarus/products_text.csv")

# Filter top 5 categories
top5 = df['categories'].value_counts().nlargest(5).index
df = df[df['categories'].isin(top5)].reset_index(drop=True)
print("Using", len(df), "products from top 5 categories:")
print(df['categories'].unique())

# ============================================
# 3. Download & preprocess images
# ============================================
images = []
labels = []

print("\n📸 Downloading and processing images...")
for idx, row in tqdm(df.iterrows(), total=len(df)):
    raw_urls = row['images']
    label = row['categories']

    # Parse list of URLs if needed
    try:
        url_list = ast.literal_eval(raw_urls) if isinstance(raw_urls, str) else raw_urls
        if isinstance(url_list, list) and len(url_list) > 0:
            url = url_list[0].strip()  # pick first URL
        else:
            continue
    except Exception:
        continue

    # Download image
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        img = Image.open(BytesIO(r.content)).convert("RGB")
        img = img.resize((224, 224))  # ResNet50 input size
        img_arr = np.array(img)
        images.append(img_arr)
        labels.append(label)
    except Exception as e:
        print(f"Failed to load {url}: {e}")

X = np.array(images)
y = np.array(labels)

print("Loaded image array:", X.shape)

# ============================================
# 4. Encode labels
# ============================================
if len(y) == 0:
    raise ValueError("No images were loaded. Check URLs or dataset format.")

le = LabelEncoder()
y_enc = le.fit_transform(y)
num_classes = len(le.classes_)
print("Classes:", le.classes_)

# ============================================
# 5. Train/Test Split
# ============================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# Preprocess for ResNet50
X_train = preprocess_input(X_train)
X_test = preprocess_input(X_test)

# One-hot encode labels
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes)

print("Train set:", X_train.shape, "Test set:", X_test.shape)

# ============================================
# 6. Build Transfer Learning Model (ResNet50)
# ============================================
base_model = ResNet50(
    weights='imagenet', include_top=False, input_shape=(224, 224, 3)
)
base_model.trainable = False  # Freeze base layers

model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# ============================================
# 7. Save Best Model Callback
# ============================================
checkpoint_cb = ModelCheckpoint(
    filepath="resnet50_best.h5",
    monitor="val_accuracy",
    save_best_only=True,
    mode="max",
    verbose=1
)

# ============================================
# 8. Data Augmentation
# ============================================
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True
)
datagen.fit(X_train)

# ============================================
# 9. Train the Model
# ============================================
history = model.fit(
    datagen.flow(X_train, y_train, batch_size=8),
    validation_data=(X_test, y_test_cat),
    epochs=100,
    callbacks=[checkpoint_cb]
)

# ============================================
# 10. Evaluate & Classification Report
# ============================================
loss, acc = model.evaluate(X_test, y_test_cat)
print(f"Test Accuracy: {acc*100:.2f}%")

# Predict classes
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Classification report
print("\n Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# ============================================
# 11. Save Final Model
# ============================================
model.save("resnet50_product_classifier_final.h5")
print("Final model saved as resnet50_product_classifier_final.h5")
print("Best model saved as resnet50_best.h5")

TensorFlow version: 2.20.0
GPU available: []
✅ Using 102 products from top 5 categories:
["['Patio, Lawn & Garden', 'Outdoor Décor', 'Doormats']"
 "['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Tables', 'End Tables']"
 "['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Ottomans']"
 "['Home & Kitchen', 'Home Décor Products', 'Mirrors', 'Wall-Mounted Mirrors']"
 "['Home & Kitchen', 'Furniture', 'Game & Recreation Room Furniture', 'Home Bar Furniture', 'Barstools']"]

📸 Downloading and processing images...



00%|██████████| 102/102 [00:06<00:00, 15.61it/s]

✅ Loaded image array: (102, 224, 224, 3)
✅ Classes: ["['Home & Kitchen', 'Furniture', 'Game & Recreation Room Furniture', 'Home Bar Furniture', 'Barstools']"
 "['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Ottomans']"
 "['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Tables', 'End Tables']"
 "['Home & Kitchen', 'Home Décor Products', 'Mirrors', 'Wall-Mounted Mirrors']"
 "['Patio, Lawn & Garden', 'Outdoor Décor', 'Doormats']"]
✅ Train set: (81, 224, 224, 3) Test set: (21, 224, 224, 3)


Epoch 1/100


  self._warn_if_super_not_called()



Epoch 1: val_accuracy improved from None to 0.66667, saving model to resnet50_best.h5
0.3568 - loss: 1.8905



[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 720ms/step - accuracy: 0.4444 - loss: 1.6197 - val_accuracy: 0.6667 - val_loss: 0.7976
Epoch 2/100

Epoch 2: val_accuracy improved from 0.66667 to 0.80952, saving model to resnet50_best.h5
850 - loss: 0.4685



[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 459ms/step - accuracy: 0.8642 - loss: 0.4353 - val_accuracy: 0.8095 - val_loss: 0.3659
Epoch 3/100

Epoch 3: val_accuracy did not improve from 0.80952
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 425ms/step - accuracy: 0.9506 - loss: 0.2464 - val_accuracy: 0.7143 - val_loss: 0.6729
Epoch 4/100

Epoch 4: val_accuracy did not improve from 0.80952
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 425ms/step - accuracy: 0.9383 - loss: 0.2301 - val_accuracy: 0.7619 - val_loss: 0.6399
Epoch 5/100

Epoch 5: val_accuracy improved from 0.80952 to 0.90476, saving model to resnet50_best.h5
870 - loss: 0.0594



[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 461ms/step - accuracy: 0.9506 - loss: 0.1357 - val_accuracy: 0.9048 - val_loss: 0.3936
Epoch 6/100

Epoch 6: val_accuracy did not improve from 0.90476
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 445ms/step - accuracy: 1.0000 - loss: 0.0791 - val_accuracy: 0.7619 - val_loss: 0.4961
Epoch 7/100

Epoch 7: val_accuracy did not improve from 0.90476
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 424ms/step - accuracy: 0.9753 - loss: 0.1005 - val_accuracy: 0.8095 - val_loss: 0.4679
Epoch 8/100

Epoch 8: val_accuracy did not improve from 0.90476
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 425ms/step - accuracy: 0.9753 - loss: 0.0622 - val_accuracy: 0.8095 - val_loss: 0.5232
Epoch 9/100

Epoch 9: val_accuracy did not improve from 0.90476
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 465ms/step - accuracy: 0.9877 - loss: 0.0434 - val_accuracy: 0.7619 - val_lo



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step





📊 Classification Report:
                                                                                                        precision    recall  f1-score   support

['Home & Kitchen', 'Furniture', 'Game & Recreation Room Furniture', 'Home Bar Furniture', 'Barstools']       0.80      1.00      0.89         4
                                  ['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Ottomans']       0.57      1.00      0.73         4
                      ['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Tables', 'End Tables']       1.00      0.25      0.40         4
                          ['Home & Kitchen', 'Home Décor Products', 'Mirrors', 'Wall-Mounted Mirrors']       1.00      1.00      1.00         4
                                                 ['Patio, Lawn & Garden', 'Outdoor Décor', 'Doormats']       1.00      0.80      0.89         5

                                                                                              accuracy      

In [None]:
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report
import numpy as np

# Load best checkpoint
best = load_model("/workspace/medhansh/ikarus/resnet50_best.h5")

# Preprocess test set (ResNet50)
X_test_pp = preprocess_input(X_test.astype(np.float32))

# Evaluate accuracy
y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=len(le.classes_))
loss, acc = best.evaluate(X_test_pp, y_test_cat, verbose=0)
print(f"Best checkpoint — Test accuracy: {acc*100:.2f}%")

# Classification report
y_pred = best.predict(X_test_pp, batch_size=8)
y_pred_labels = y_pred.argmax(axis=1)
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred_labels, target_names=list(le.classes_), digits=4))



✅ Best checkpoint — Test accuracy: 85.71%




[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step   

📊 Classification Report:
                                                                                                        precision    recall  f1-score   support

['Home & Kitchen', 'Furniture', 'Game & Recreation Room Furniture', 'Home Bar Furniture', 'Barstools']     1.0000    1.0000    1.0000         4
                                  ['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Ottomans']     0.6667    0.5000    0.5714         4
                      ['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Tables', 'End Tables']     1.0000    1.0000    1.0000         4
                          ['Home & Kitchen', 'Home Décor Products', 'Mirrors', 'Wall-Mounted Mirrors']     0.6667    1.0000    0.8000         4
                                                 ['Patio, Lawn & Garden', 'Outdoor Décor', 'Doormats']     1.0000    0.8000    0.8889         5

                                   

In [None]:
import numpy as np
from tensorflow.keras.models import load_model

model = load_model("resnet50_best.h5")
classes = np.load("label_classes.npy", allow_pickle=True)
print("Loaded model and classes:", classes)



✅ Loaded model and classes: ["['Home & Kitchen', 'Furniture', 'Game & Recreation Room Furniture', 'Home Bar Furniture', 'Barstools']"
 "['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Ottomans']"
 "['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Tables', 'End Tables']"
 "['Home & Kitchen', 'Home Décor Products', 'Mirrors', 'Wall-Mounted Mirrors']"
 "['Patio, Lawn & Garden', 'Outdoor Décor', 'Doormats']"]


In [None]:
from PIL import Image
import numpy as np
from tensorflow.keras.applications.resnet50 import preprocess_input

# Path to new image
img_path = "/workspace/medhansh/ikarus/chair.jpeg"

# Load & preprocess the image
img = Image.open(img_path).convert("RGB")
img = img.resize((224, 224))
img_array = np.array(img)

# Add batch dimension and preprocess
img_array = np.expand_dims(img_array, axis=0)
img_array = preprocess_input(img_array)

# Predict
pred = model.predict(img_array)
pred_class_index = np.argmax(pred, axis=1)[0]
pred_class_name = classes[pred_class_index]

print("Predicted class:", pred_class_name)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
✅ Predicted class: ['Home & Kitchen', 'Furniture', 'Living Room Furniture', 'Ottomans']
