<a href="https://colab.research.google.com/github/Naomie25/Hackaton-Fashion-Description-Generator/blob/main/Fashion_Description_Generator_Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Define the Task & Pipeline Overview

Input (keyword or image) → Generation Model → Quality-Check Module → (Optional) Image Generator → Ethical Filter → Final Output

In [1]:
!pip install transformers torch sentencepiece
!pip install schedule
!pip install --upgrade datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.7.0
    Uninstalling fsspec-2025.7.0:
      Successfully uninstalled fsspec-2025.7.0
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset
import difflib
import re
import random
import schedule
import time

# ============================
# 1. Define the Task & Pipeline Overview
# ============================
# Objective: Generate fashion product descriptions, assess their quality via summarization, apply ethical filtering, and output the final results.
# Workflow: prompt → text generation → summary (quality check) → ethical filter → final output.
# Input (keyword or image) → Generation Model → Quality-Check Module → (Optional) Image Generator → Ethical Filter → Final Output

In [3]:
# Pin every model in this notebook to the CPU; the models loaded below are moved to this device.
device = torch.device("cpu")  # Force the use of the CPU
print("Device set to use", device)

# ============================
# 2. Select Your Generation Method
# ============================
# A lightweight Transformer model (distilgpt2) is used for text generation,
# and a BART-base model is used for the summary.

# ============================
# 3. Pick Specific Pre-trained Models
# ============================
# Load the distilled GPT-2 tokenizer and model (small and fast).
gpt2_model_name = "distilgpt2"
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name) # load the tokenizer
gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_model_name).to(device) # load the pretrained model and move it to `device`

# Load the BART tokenizer and model used for summarization (quality check).
# NOTE(review): "facebook/bart-base" looks like a base checkpoint rather than a
# summarization fine-tune - confirm it is suitable for the quality-check step.
bart_model_name = "facebook/bart-base"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name).to(device)

# Fashion-related keywords; score_description awards a bonus for each one
# that appears in a generated description.
fashion_keywords = [
    "elegant", "stylish", "refined", "modern", "vintage", "casual",
    "minimalist", "chic", "versatile", "comfort", "premium", "crafted",
    "tailored", "cut", "fit", "fabric", "soft", "bold", "timeless"
]

# ============================
# 4. Prepare & Subsample Your Dataset
# ============================

def load_and_subsample_dataset(subsample_ratio=0.05, seed=42, dataset_name="imdb", split="train"):
    """Load a dataset split and return a reproducible random subsample of it.

    Args:
        subsample_ratio: Fraction of the split to keep, between 0 and 1.
        seed: Seed for the private random generator that picks the rows.
        dataset_name: Dataset identifier passed to ``load_dataset``.
            NOTE(review): the default "imdb" looks like a placeholder for a
            fashion dataset - confirm.
        split: Split name passed to ``load_dataset``.

    Returns:
        The object returned by ``dataset.select(indices)`` for the sampled rows.

    Raises:
        ValueError: If ``subsample_ratio`` is outside the range [0, 1].
    """
    # Validate first so a bad ratio fails before the dataset is loaded.
    if not 0 <= subsample_ratio <= 1:
        raise ValueError(
            f"subsample_ratio must be between 0 and 1, got {subsample_ratio!r}"
        )

    dataset = load_dataset(dataset_name, split=split)
    total = len(dataset)
    sample_size = int(total * subsample_ratio)

    # A private generator draws the same indices as seeding the module-level
    # RNG would, without resetting random state used by the rest of the notebook.
    rng = random.Random(seed)
    indices = rng.sample(range(total), sample_size)

    subsampled_dataset = dataset.select(indices)
    print(f"Original size: {total}, subsampled size: {len(subsampled_dataset)}")
    return subsampled_dataset

# ============================
# 5. Text generation module
# ============================
def generate_descriptions(keyword, num_variants=5):
    """Generate, score and filter several product descriptions for a keyword.

    Args:
        keyword: Product keyword inserted into the generation prompt.
        num_variants: Number of candidate descriptions to sample.

    Returns:
        A list of ``(description, score)`` tuples sorted by descending score,
        after ``clean_descriptions`` has removed unusable candidates. The list
        is empty when ``num_variants`` is less than 1.
    """
    if num_variants < 1:
        return []

    prompt = f"Write a stylish, concise, and elegant product description focusing on fabric, cut, and style for: {keyword}.\n\n"
    input_ids = gpt2_tokenizer.encode(prompt, return_tensors="pt").to(device)
    prompt_length = input_ids.shape[-1]

    outputs = gpt2_model.generate(
        input_ids,
        max_new_tokens=50,
        do_sample=True,
        top_k=40,
        top_p=0.9,
        temperature=0.7,
        num_return_sequences=num_variants,
        repetition_penalty=1.2,
        pad_token_id=gpt2_tokenizer.eos_token_id
    )

    results = []
    for output in outputs:
        # Strip the prompt by token count, not character count: the decoded
        # prompt is not guaranteed to have the same length as the original
        # string, so a character slice can cut into the generated text.
        gen_text = gpt2_tokenizer.decode(
            output[prompt_length:], skip_special_tokens=True
        ).strip()
        results.append((gen_text, score_description(gen_text, prompt)))

    results.sort(key=lambda item: item[1], reverse=True)  # best score first
    return clean_descriptions(results)

# ============================
# Utility function: repetition detection
# ============================
def has_repetitions(text, max_repeat=3):
    """Report whether any word occurs more than ``max_repeat`` times in a row.

    Matching is case-insensitive, and repeats may be separated by any run of
    whitespace (spaces, tabs, newlines).

    Args:
        text: Text to inspect.
        max_repeat: Largest number of consecutive occurrences still accepted.

    Returns:
        True if some word appears at least ``max_repeat + 1`` times
        consecutively, otherwise False.
    """
    # One word followed by at least `max_repeat` further copies of itself.
    # The \b after each back-reference stops "the" from matching inside "there".
    pattern = r'\b(\w+)(?:\s+\1\b){' + str(max_repeat) + r',}'
    return re.search(pattern, text.lower()) is not None

# ============================
# Filter readable descriptions
# ============================
def clean_descriptions(descriptions):
    """Keep only descriptions that are long enough and free of word stutter.

    Args:
        descriptions: Iterable of ``(text, score)`` pairs.

    Returns:
        A list of the ``(text, score)`` pairs whose text has at least 8
        whitespace-separated words and for which ``has_repetitions`` is False,
        in their original order.
    """
    def _is_usable(text):
        # Length is checked first, so short texts never reach the regex check.
        return len(text.split()) >= 8 and not has_repetitions(text)

    return [(text, rating) for text, rating in descriptions if _is_usable(text)]

# ============================
# Description scoring
# ============================
def score_description(desc, prompt):
    """
    Score a generated description; higher is better.

    The score is the sum of three terms:
    - length: whitespace-separated word count, capped at 50 and scaled to [0, 1]
    - keywords: 0.5 for each entry of ``fashion_keywords`` found in the text
    - novelty: 1 minus the character-level similarity with the prompt, so
      text copied from the prompt scores lower

    Args:
        desc: Generated description text.
        prompt: Prompt the description was generated from.

    Returns:
        The combined score as a float.
    """
    lowered = desc.lower()
    words = lowered.split()
    length_score = min(len(words), 50) / 50

    # Match keywords on punctuation-free tokens so "elegant," or "chic."
    # still count; plain whitespace tokens keep punctuation attached.
    tokens = set(re.findall(r"\w+", lowered))
    keyword_bonus = sum(keyword in tokens for keyword in fashion_keywords)

    similarity = difflib.SequenceMatcher(None, lowered, prompt.lower()).ratio()
    novelty = 1 - similarity  # ratio() is within [0, 1], so this is never negative

    return length_score + 0.5 * keyword_bonus + novelty

Device set to use cpu


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]