In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import requests
import re
from mapping import *


load_dotenv()

# Ollama_URL = os.getenv("Ollama_URL")
# print(f"Ollama URL: {Ollama_URL}")

True

## 2. Load Full House_Babylon CSV

In [2]:
# Read full CSV
file_path = "Draft/House_Babylon/House_Babylon.csv"
df = pd.read_csv(file_path)

# Clean column names (handle header row)
df.columns = df.iloc[0]
df = df.drop([0, 1]).reset_index(drop=True)

# Keep only needed columns
needed_columns = ["Item (EN)", "Description (EN)", "Category/Department (EN)", "Variant Name"]
df = df[needed_columns]

print(f"Full dataset shape: {df.shape}")


# df = df.iloc[::5].reset_index(drop=True)  # Sample every 5th row

df = df.iloc[[0, -1]].reset_index(drop=True)

df

Full dataset shape: (70, 4)


Unnamed: 0,Item (EN),Description (EN),Category/Department (EN),Variant Name
0,Albergo Stripe Set - Off White,A summer uniform reimagined in breathable Egyp...,Poplin Sets,XS/XS
1,Yemin W Shemal Socks,Crafted from a soft and durable blend of 80% c...,Socks,


In [5]:
import requests

API_URL = "http://100.75.237.4:11434/api/generate"
# OLLAMA_URL = "http://127.0.0.1:11434/v1/completions"

MODELS = {
    # "llama": ("llama3.1:8b", OLLAMA_URL, "choices", "text"),
    "mistral": ("mistral", API_URL, "response", None),
    "phi4": ("phi4:latest", API_URL, "response", None),
    "aya": ("aya:8b", API_URL, "response", None),
}

def run_model(model_name, prompt):
    model, url, key, subkey = MODELS[model_name]
    payload = {"model": model, "prompt": prompt, "max_tokens": 200, "stream": False}
    r = requests.post(url, json=payload)
    r.raise_for_status()
    data = r.json()
    return data[key][0][subkey].strip() if subkey else data[key].strip()

# Tests
# print("LLaMA Output:", run_model("llama", "hi"))
print("=" * 35)
print("Mistral Output:", run_model("mistral", "hi"))
print("=" * 35)
print("Phi4 Output:", run_model("phi4", "hi"))
print("=" * 35)
print("aya Output:", run_model("aya", "hi"))

Mistral Output: Hello! How can I help you today? Is there something specific you would like to know or talk about? I'm here to assist with any questions or topics you have in mind. Let me know if you need help with a project, need advice, or just want to chat about something interesting. 😊
Phi4 Output: Hello! How can I assist you today? Feel free to ask me anything. 😊
aya Output: Hi! How can I help you today?


## 5. Level 1: Shopping Category Classification (with Confidence)

In [8]:
def classify_shopping_category(item_name, description, vendor_category):
    """Classify item into shopping category (Level 1) with confidence"""
    text = f"""
    Item: {item_name}
    Description: {description}
    Vendor Category: {vendor_category}

    Task: Choose the best suited shopping category for this item.

    Allowed categories:
    {shoppingCategory}

    Rules:
    - Return the category name exactly as written, followed by your confidence
    - Format: category_name|confidence:XX% (where XX is 0-100)
    - Example: fashion|confidence:95%
    - If none fit, return empty|confidence:0%
    - Do not add quotes, extra text, or explanations
    """
    result = run_model("phi4", text)
    result = result.lower().replace("'", "").replace('"', "").replace(":", "").strip()
    
    # Parse category and confidence
    if "|confidence" in result:
        parts = result.split("|confidence")
        category = parts[0].strip()
        confidence = parts[1].strip().replace("%", "").strip()
        try:
            confidence = int(confidence)
        except:
            confidence = 0
    else:
        category = result.strip()
        confidence = 0
    
    # Validate category
    if category not in shoppingCategory:
        category = ""
        confidence = 0
    
    return category, confidence

In [9]:
# Apply classification to the whole DataFrame
df[["shoppingCategory", "confidence"]] = df.apply(lambda row: pd.Series(classify_shopping_category(row["Item (EN)"],row["Description (EN)"],row["Category/Department (EN)"])),axis=1)


In [None]:
# Import all mappings from Mapping.py
from Mapping import (
    shoppingCategory,
    shoppingSubcategory_map,
    itemCategory_map,
    itemSubcategory_map
)

## 6. Level 2: Shopping Subcategory Classification (with Confidence)

In [11]:
def classify_shopping_subcategory(shopping_category, item_name, description, vendor_category):
    """Classify item into shopping subcategory (Level 2) based on shopping category"""
    
    # If shopping_category is empty or not in map, return empty
    if shopping_category == "" or shopping_category not in shoppingSubcategory_map:
        return "", 0
    
    # Get the appropriate subcategory list
    subcategory_list = shoppingSubcategory_map[shopping_category]
    
    text = f"""
    Item: {item_name}
    Description: {description}
    Vendor Category: {vendor_category}
    Shopping Category: {shopping_category}
1
    Task: Choose the best suited shopping subcategory for this item.

    Allowed subcategories for {shopping_category}:
    {subcategory_list}

    Rules:
    - Return the subcategory name exactly as written, followed by your confidence
    - Format: subcategory_name|confidence:XX% (where XX is 0-100)
    - Example: casual wear|confidence:95%
    - If none fit, return empty string|confidence:0%
    - Do not add quotes, extra text, or explanations
    """
    result = run_model("phi4", text)
    result = result.lower().replace("'", "").replace('"', "").replace(":", "").strip()
    
    # Parse subcategory and confidence
    if "|confidence" in result:
        parts = result.split("|confidence")
        subcategory = parts[0].strip()
        confidence = parts[1].strip().replace("%", "").strip()
        try:
            confidence = int(confidence)
        except:
            confidence = 0
    else:
        subcategory = result.strip()
        confidence = 0
    
    # Validate subcategory
    if subcategory not in subcategory_list:
        subcategory = ""
        confidence = 0
    
    return subcategory, confidence

In [12]:
# Apply level 2 classification to the whole DataFrame
df[["shoppingSubcategory", "subcategory_confidence"]] = df.apply(lambda row: pd.Series(classify_shopping_subcategory(row["shoppingCategory"],row["Item (EN)"],row["Description (EN)"],row["Category/Department (EN)"])),axis=1)

In [13]:
df

Unnamed: 0,Item (EN),Description (EN),Category/Department (EN),Variant Name,shoppingCategory,confidence,shoppingSubcategory,subcategory_confidence
0,Albergo Stripe Set - Off White,A summer uniform reimagined in breathable Egyp...,Poplin Sets,XS/XS,fashion,0,,0
1,Yemin W Shemal Socks,Crafted from a soft and durable blend of 80% c...,Socks,,fashion,0,footwear,0


## 7. Level 3: Item Category Classification (with Confidence)

In [14]:
def classify_item_category(shopping_category, shopping_subcategory, item_name, description, vendor_category):
    """Classify item into item category (Level 3) based on shopping category and subcategory"""
    
    # If shopping_category or shopping_subcategory is empty or not in map, return empty
    if shopping_category == "" or shopping_subcategory == "":
        return "", 0
    
    if shopping_category not in itemCategory_map:
        return "", 0
    
    if shopping_subcategory not in itemCategory_map[shopping_category]:
        return "", 0
    
    # Get the appropriate item category list
    item_category_list = itemCategory_map[shopping_category][shopping_subcategory]
    
    text = f"""
    Item: {item_name}
    Description: {description}
    Vendor Category: {vendor_category}
    Shopping Category: {shopping_category}
    Shopping Subcategory: {shopping_subcategory}

    Task: Choose the best suited item category for this item.

    Allowed item categories for {shopping_category} > {shopping_subcategory}:
    {item_category_list}

    Rules:
    - Return the item category name exactly as written, followed by your confidence
    - Format: category_name|confidence:XX% (where XX is 0-100)
    - Example: t-shirt|confidence:95%
    - If none fit, return empty|confidence:0%
    - Do not add quotes, extra text, or explanations
    """
    result = run_model("phi4", text)
    result = result.lower().replace("'", "").replace('"', "").replace(":", "").strip()
    
    # Parse item category and confidence
    if "|confidence" in result:
        parts = result.split("|confidence")
        item_category = parts[0].strip()
        confidence = parts[1].strip().replace("%", "").strip()
        try:
            confidence = int(confidence)
        except:
            confidence = 0
    else:
        item_category = result.strip()
        confidence = 0
    
    # Validate item category
    if item_category not in item_category_list:
        item_category = ""
        confidence = 0
    
    return item_category, confidence

In [15]:
# Apply level 3 classification to the whole DataFrame
df[["itemCategory", "itemCategory_confidence"]] = df.apply(lambda row: pd.Series(classify_item_category(row["shoppingCategory"],row["shoppingSubcategory"],row["Item (EN)"],row["Description (EN)"],row["Category/Department (EN)"])),axis=1)

In [16]:
df

Unnamed: 0,Item (EN),Description (EN),Category/Department (EN),Variant Name,shoppingCategory,confidence,shoppingSubcategory,subcategory_confidence,itemCategory,itemCategory_confidence
0,Albergo Stripe Set - Off White,A summer uniform reimagined in breathable Egyp...,Poplin Sets,XS/XS,fashion,0,,0,,0
1,Yemin W Shemal Socks,Crafted from a soft and durable blend of 80% c...,Socks,,fashion,0,footwear,0,sock,100


## 8. Level 4: Item Subcategory Classification (with Confidence)

In [17]:
def classify_item_subcategory(shopping_category, shopping_subcategory, item_category, item_name, description, vendor_category):
    """Classify item into item subcategory (Level 4) based on shopping category and item category"""

    # If any previous level is empty, return empty
    if shopping_category == "" or shopping_subcategory == "" or item_category == "":
        return "", 0

    if shopping_category not in itemSubcategory_map:
        return "", 0

    if item_category not in itemSubcategory_map[shopping_category]:
        return "", 0

    # Get the appropriate item subcategory list
    item_subcategory_list = itemSubcategory_map[shopping_category][item_category]

    text = f"""
You are a product classification assistant. Analyze the following item and classify it into the most appropriate subcategory.

Item Name: {item_name}
Description: {description}
Vendor Category: {vendor_category}
Current Classification Path:
- Shopping Category: {shopping_category}
- Shopping Subcategory: {shopping_subcategory}
- Item Category: {item_category}

TASK: Select ONE subcategory from the allowed list below that BEST matches this item.

ALLOWED SUBCATEGORIES (choose exactly one):
{item_subcategory_list}

IMPORTANT INSTRUCTIONS:
1. You MUST choose one option from the allowed list above
2. Focus on the item name and description to make your choice
3. Return ONLY the subcategory name followed by |confidence:XX%
4. The subcategory name must match EXACTLY as shown in the list (lowercase, no quotes)
5. Confidence should be between 0-100
6. ONLY return empty|confidence:0% if absolutely no option fits

OUTPUT FORMAT (choose one):
subcategory_name|confidence:XX%

Example: sweatshirt|confidence:90%

Your answer:"""
    result = run_model("phi4", text)
    result = result.lower().replace("'", "").replace('"', "").replace(":", "").strip()

    # Parse item subcategory and confidence
    if "|confidence" in result:
        parts = result.split("|confidence")
        item_subcategory = parts[0].strip()
        confidence = parts[1].strip().replace("%", "").strip()
        try:
            confidence = int(confidence)
        except:
            confidence = 0
    else:
        item_subcategory = result.strip()
        confidence = 0

    # Validate item subcategory
    if item_subcategory not in item_subcategory_list:
        item_subcategory = ""
        confidence = 0

    return item_subcategory, confidence

In [18]:
# Apply level 4 classification to the whole DataFrame
df[["itemSubcategory", "itemSubcategory_confidence"]] = df.apply(lambda row: pd.Series(
    classify_item_subcategory(row["shoppingCategory"],row["shoppingSubcategory"],
                              row["itemCategory"],row["Item (EN)"],row["Description (EN)"],
                              row["Category/Department (EN)"])),axis=1)

In [19]:
df

Unnamed: 0,Item (EN),Description (EN),Category/Department (EN),Variant Name,shoppingCategory,confidence,shoppingSubcategory,subcategory_confidence,itemCategory,itemCategory_confidence,itemSubcategory,itemSubcategory_confidence
0,Albergo Stripe Set - Off White,A summer uniform reimagined in breathable Egyp...,Poplin Sets,XS/XS,fashion,0,,0,,0,,0
1,Yemin W Shemal Socks,Crafted from a soft and durable blend of 80% c...,Socks,,fashion,0,footwear,0,sock,100,,0


## 9. SKW - Search Keywords Generation

In [20]:
def generate_skw(item_name, description, vendor_category):
    """Generate Search Keywords (SKW) for the item"""
    
    # Create input text from available data
    input_text = f"Item: {item_name}\nDescription: {description}\nCategory: {vendor_category}"
    
    prompt = f"""Generate 5 shopping keyword phrases for this item.

Item data: {input_text}

Rules:
- Do not use numbering or extra text, only list the phrases separated by commas
- The first phrase must be only the product category (example: "Ring", "Necklace", "Shirt")
- All other phrases must end with the product category word
- Each phrase must be maximum 3 words only. Never exceed 3 words
- Format = modifier + modifier + product category
- Modifiers = tangible features or proper nouns
- Remove sentiments, numbers, dates, symbols
- Return phrases in lowercase
- Output format: phrase1, phrase2, phrase3, phrase4, phrase5

Your answer:"""
    
    result = run_model("phi4", prompt)
    result = result.strip().lower()
    
    return result

In [21]:
# Apply SKW generation to the whole DataFrame
df["SKW"] = df.apply(lambda row: generate_skw(row["Item (EN)"], row["Description (EN)"], row["Category/Department (EN)"]), axis=1)

In [22]:
df[["Item (EN)", "SKW"]]

Unnamed: 0,Item (EN),SKW
0,Albergo Stripe Set - Off White,"poplin sets, breathable summer uniform, egypti..."
1,Yemin W Shemal Socks,"socks, cotton blend socks, elastane stretchy s..."


## 10. DSW - Description Search Words Generation

In [23]:
def generate_dsw(item_name, description, vendor_category):
    """Generate Description Search Words (DSW) for the item"""
    
    # Create input text from available data
    input_text = f"Item: {item_name}\nDescription: {description}\nCategory: {vendor_category}"
    
    prompt = f"""Generate 5-10 shopping keyword phrases for this item.

Item data: {input_text}

Rules:
- Do not use numbering, bullets, or extra comments, only list the phrases separated by commas
- Each phrase must end with the main product word
- Format (≤3 words) = modifier + modifier + main product
- Modifiers = tangible features, functional attributes, or proper nouns
- Include exactly one phrase with only the main product (example: "Ring", "Shirt")
- Remove sentiments, opinions, numbers, dates, symbols, and abbreviations
- Return phrases in lowercase
- Output format: phrase1, phrase2, phrase3, ...

Your answer:"""
    
    result = run_model("phi4", prompt)
    result = result.strip().lower()
    
    return result

In [24]:
# Apply DSW generation to the whole DataFrame
df["DSW"] = df.apply(lambda row: generate_dsw(row["Item (EN)"], row["Description (EN)"], row["Category/Department (EN)"]), axis=1)

In [25]:
df[["Item (EN)", "DSW"]]

Unnamed: 0,Item (EN),DSW
0,Albergo Stripe Set - Off White,"european summer albergostripe set, breathable ..."
1,Yemin W Shemal Socks,"stretchy cotton blend socks, elastane-infused ..."


## 11. AI-Attributes Extraction

In [26]:
def extract_ai_attributes(item_name, description, vendor_category, shopping_category, shopping_subcategory, item_category):
    """Extract AI Attributes for the item"""
    
    # Create input text from available data
    input_text = f"""Item Name: {item_name}
Description: {description}
Vendor Category: {vendor_category}
Shopping Category: {shopping_category}
Shopping Subcategory: {shopping_subcategory}
Item Category: {item_category}"""
    
    prompt = f"""Extract all relevant attributes for this item based on the data provided.

{input_text}

INSTRUCTIONS:
- Identify all attributes of this item
- Only fill attributes that can be clearly inferred from the data
- Leave unknowns empty (do not guess or use placeholders like "N/A" or "Unknown")
- Do not include extra statements, explanations, or brackets
- Use short, precise values in English
- For Gender, choose STRICTLY from this list ONLY:
  ["Women", "Men", "Unisex women, Unisex men", "Girls", "Boys", "Unisex girls, unisex boys"]
  Do not invent or modify values (e.g., "Unisex" alone is invalid)
- Generic Name should use the item category if possible
- Color should be inferred from the item name or description if present
- Product Name should be concise and represent the product, not copied verbatim

OUTPUT FORMAT (fill only what you know, leave rest empty):

Gender: 
Age: 
Brand: 
Generic Name: 
Product Name: 
Size: 
Measurements: 
Features: 
Types of Fashion Styles: 
Gem Stones: 
Birth Stones: 
Material: 
Color: 
Pattern: 
Occasion: 
Activity: 
Season: 
Country of origin: 

Your answer:"""
    
    result = run_model("phi4", prompt)
    result = result.strip()
    
    return result

In [27]:
# Apply AI-Attributes extraction to the whole DataFrame
df["AI_Attributes"] = df.apply(lambda row: extract_ai_attributes(
    row["Item (EN)"], 
    row["Description (EN)"], 
    row["Category/Department (EN)"],
    row["shoppingCategory"],
    row["shoppingSubcategory"],
    row["itemCategory"]
), axis=1)

In [28]:
# Display first row's AI attributes
print(df.loc[0, "AI_Attributes"])
print("\n" + "="*80 + "\n")
# Display full dataframe
df

Gender: Unisex women, Unisex men  
Age:  
Brand: Albergo  
Generic Name: Poplin Set  
Product Name: Albergo Stripe Off White Set  
Size:  
Measurements:  
Features: Breathable Egyptian cotton poplin, short-sleeved shirt with spread collar and single chest pocket, tailored shorts with elastic waistband, drawstring, discreet side pockets, single back pocket, relaxed fit  
Types of Fashion Styles:  
Gem Stones:  
Birth Stones:  
Material: Egyptian cotton poplin  
Color: Off White  
Pattern: Stripe  
Occasion:  
Activity:  
Season: Summer  
Country of origin: Egypt




Unnamed: 0,Item (EN),Description (EN),Category/Department (EN),Variant Name,shoppingCategory,confidence,shoppingSubcategory,subcategory_confidence,itemCategory,itemCategory_confidence,itemSubcategory,itemSubcategory_confidence,SKW,DSW,AI_Attributes
0,Albergo Stripe Set - Off White,A summer uniform reimagined in breathable Egyp...,Poplin Sets,XS/XS,fashion,0,,0,,0,,0,"poplin sets, breathable summer uniform, egypti...","european summer albergostripe set, breathable ...","Gender: Unisex women, Unisex men \nAge: \nBr..."
1,Yemin W Shemal Socks,Crafted from a soft and durable blend of 80% c...,Socks,,fashion,0,footwear,0,sock,100,,0,"socks, cotton blend socks, elastane stretchy s...","stretchy cotton blend socks, elastane-infused ...","Gender: Unisex women, Unisex men \nAge: \nB..."


## 12. Arabic Translation (EN to AR)

In [None]:
import pandas as pd
import requests

API_URL = "http://100.75.237.4:11434/api/generate"

MODELS = {
    "aya": ("aya:8b", API_URL, "response", None),
}

def run_model(model_name, prompt):
    model, url, key, subkey = MODELS[model_name]
    payload = {"model": model, "prompt": prompt, "max_tokens": 200, "stream": False}
    r = requests.post(url, json=payload)
    r.raise_for_status()
    data = r.json()
    return data[key][0][subkey].strip() if subkey else data[key].strip()


def translate_to_arabic(text):
    """Translate English text to Arabic using Aya model"""
    if pd.isna(text) or text == "" or text == "empty":
        return ""

    if isinstance(text, list):
        text = ' '.join(str(x) for x in text)
    else:
        text = str(text)

    try:
        system_prompt = (
            "You are an English to Arabic translator for an E-commerce site. "
            "Translate the following title, description, or category into Arabic. "
            "Respond only with the translated text, nothing else.\n\n"
        )
        prompt = f"{system_prompt}{text}"
        translated_text = run_model("aya", prompt)
        return translated_text.strip()
    except Exception as e:
        print(f"Translation error for '{text}': {e}")
        return ""
    
    
# Example usage
print(translate_to_arabic("Hello, how are you?"))



مرحباً، كيف حالك؟


In [None]:
# Apply translation function to each of the target columns
target_columns = ["Item (EN)", "Description (EN)", "Category/Department (EN)"]

for col in target_columns:
    arabic_col = col.replace("(EN)", "(AR)").strip()  # e.g. "Item (AR)"
    print(f"Translating column: {col} → {arabic_col}")
    df[arabic_col] = df[col].apply(translate_to_arabic)

# Optional: display results
print(df[[*target_columns, *(col.replace("(EN)", "(AR)").strip() for col in target_columns)]].head())


In [37]:
df[['Item (EN)','Item (AR)', 'Description (EN)', 'Description (AR)', 'Category/Department (EN)','Category/Department (AR)']]

Unnamed: 0,Item (EN),Item (AR),Description (EN),Description (AR),Category/Department (EN),Category/Department (AR)
0,Albergo Stripe Set - Off White,مجموعة ألبرجو ستريب - كريم فاتح,A summer uniform reimagined in breathable Egyp...,موحد صيفي بإعادة تصميم مصنوع من القطن المصري ا...,Poplin Sets,مجموعة بوبلين
1,Yemin W Shemal Socks,"جوارب ""يمن ويشمال""",Crafted from a soft and durable blend of 80% c...,صنعت هذه الجوارب من مزيج ناعم ودائم من القطن ب...,Socks,الجوارب


In [38]:
df

Unnamed: 0,Item (EN),Description (EN),Category/Department (EN),Variant Name,shoppingCategory,confidence,shoppingSubcategory,subcategory_confidence,itemCategory,itemCategory_confidence,itemSubcategory,itemSubcategory_confidence,SKW,DSW,AI_Attributes,Item (AR),Description (AR),Category/Department (AR)
0,Albergo Stripe Set - Off White,A summer uniform reimagined in breathable Egyp...,Poplin Sets,XS/XS,fashion,0,,0,,0,,0,"poplin sets, breathable summer uniform, egypti...","european summer albergostripe set, breathable ...","Gender: Unisex women, Unisex men \nAge: \nBr...",مجموعة ألبرجو ستريب - كريم فاتح,موحد صيفي بإعادة تصميم مصنوع من القطن المصري ا...,مجموعة بوبلين
1,Yemin W Shemal Socks,Crafted from a soft and durable blend of 80% c...,Socks,,fashion,0,footwear,0,sock,100,,0,"socks, cotton blend socks, elastane stretchy s...","stretchy cotton blend socks, elastane-infused ...","Gender: Unisex women, Unisex men \nAge: \nB...","جوارب ""يمن ويشمال""",صنعت هذه الجوارب من مزيج ناعم ودائم من القطن ب...,الجوارب


### Save Results to CSV

In [None]:
# Save enriched data to CSV
# output_file = "enriched_results.csv"
# df.to_csv(output_file, index=False, encoding='utf-8-sig')
# print(f"Results saved to {output_file}")