# Objectif du Notebook :

- Récupérer les données nécessaires et traiter le dataframe
- Importer le modèle BERT et l'exporter au format Pickle
- Encoder la colonne description du dataframe à l'aide du modèle importé

## Données nécessaires :

- captions.json : https://drive.google.com/file/d/1d1TRm8UMcQhZCb6HpPo8l3OPEin4Ztk2/view?usp=sharing

### Sources :

- https://github.com/yumingj/DeepFashion-MultiModal
- https://huggingface.co/docs/transformers/model_doc/bert

------------

### Install
-----------

In [None]:
! pip install scikit-learn
! pip install sentence-transformers
! pip install protobuf

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.0/86.0 kB 1.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.9/7.9 MB 54.9 MB/s eta 0:00:00
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 54.7 MB/s eta 0:00:00
Collecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 311.1/311.1 kB

### Import
--------------

In [None]:
import pandas as pd
import json
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import pickle

### Model
------------

In [None]:
# Load the large uncased BERT checkpoint through sentence-transformers.
# NOTE(review): "bert-large-uncased" is a plain BERT checkpoint rather than a
# model trained for sentence embeddings; the library presumably wraps it with a
# default pooling layer -- confirm this is the intended embedding model.
model = SentenceTransformer(model_name_or_path="bert-large-uncased")

Downloading (…)10431/.gitattributes:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading (…)a37c210431/README.md:   0%|          | 0.00/8.96k [00:00<?, ?B/s]

Downloading (…)7c210431/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)10431/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)a37c210431/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)-word-masking.tar.gz:   0%|          | 0.00/1.25G [00:00<?, ?B/s]



### Pickle
----------------

In [None]:
# Serialise the loaded model to 'modele.pickle' in the working directory so it
# can be reloaded later without going through the download step again.
# NOTE: a pickle file must only ever be loaded from a trusted source, because
# unpickling can execute arbitrary code.
with open('modele.pickle', 'wb') as pickle_file:
    pickle.dump(model, pickle_file)

### Data
------------------------

In [None]:
# Load "captions.json" from the DeepFashion-MultiModal repository:
# https://drive.google.com/file/d/1d1TRm8UMcQhZCb6HpPo8l3OPEin4Ztk2/view?usp=sharing
# The encoding is passed explicitly so that parsing does not depend on the
# platform's default text encoding.
with open('/drive/My Drive/data_clothes/captions.json', 'r', encoding='utf-8') as json_file:
    # `data` is the parsed JSON object; the next cell turns its key/value
    # pairs into the 'Image' and 'Description' columns.
    data = json.load(json_file)

In [None]:
# Build a two-column DataFrame from the captions mapping: each key becomes the
# 'Image' value and the matching value becomes its 'Description'.
df = pd.DataFrame({
    'Image': list(data.keys()),
    'Description': list(data.values()),
})

In [None]:
# Derive the 'sexe' column from the first three characters of the image name.
# NOTE(review): this yields 'MEN' and the truncated 'WOM' (for 'WOMEN-...');
# kept as is because downstream consumers may rely on these exact values.
df['sexe'] = df['Image'].str[:3]

In [None]:
# Display the DataFrame to check the 'Image', 'Description' and 'sexe' columns.
df

Unnamed: 0,Image,Description,sexe
0,MEN-Denim-id_00000080-01_7_additional.jpg,The lower clothing is of long length. The fabr...,MEN
1,MEN-Denim-id_00000089-01_7_additional.jpg,"His tank top has sleeves cut off, cotton fabri...",MEN
2,MEN-Denim-id_00000089-02_7_additional.jpg,"His sweater has long sleeves, cotton fabric an...",MEN
3,MEN-Denim-id_00000089-03_7_additional.jpg,"His shirt has short sleeves, cotton fabric and...",MEN
4,MEN-Denim-id_00000089-04_7_additional.jpg,"The sweater the person wears has long sleeves,...",MEN
...,...,...,...
42539,WOMEN-Tees_Tanks-id_00007979-04_4_full.jpg,The lady wears a tank tank shirt with pure col...,WOM
42540,WOMEN-Tees_Tanks-id_00007979-04_7_additional.jpg,The person wears a sleeveless tank shirt with ...,WOM
42541,WOMEN-Tees_Tanks-id_00007981-03_1_front.jpg,This woman wears a sleeveless tank top with ot...,WOM
42542,WOMEN-Tees_Tanks-id_00007981-03_3_back.jpg,The tank top the lady wears has sleeves cut of...,WOM


In [None]:
# Shuffle all rows (frac=1.0 samples the whole frame without replacement) and
# relabel them 0..n-1 so the old index order is discarded.
df = df.sample(frac=1.0, ignore_index=True)

In [None]:
# Encode every image description with the BERT model.
# The whole list is passed to `model.encode` in one call so the library can
# batch the sentences, instead of running one forward pass per row through
# `Series.apply` (which is what made this cell take more than an hour).
descriptions = df['Description'].tolist()
embeddings = model.encode(descriptions, show_progress_bar=True)
# Store one embedding vector per row, as in the per-row version.
# NOTE(review): batched inference may differ from per-sentence inference at
# floating-point rounding level -- confirm this is acceptable downstream.
df['encoding_label'] = list(embeddings)

In [None]:
# Display the DataFrame again to inspect the new 'encoding_label' column.
df


Unnamed: 0,Image,Description,sexe,encoding_label
0,WOMEN-Blouses_Shirts-id_00007222-02_3_back.jpg,The female wears a long-sleeve shirt with flor...,WOM,"[0.024525413, -0.41323054, -0.12550968, -0.055..."
1,WOMEN-Dresses-id_00000811-04_4_full.jpg,This person is wearing a short-sleeve shirt wi...,WOM,"[-0.15086243, -0.3766451, -0.092323795, -0.074..."
2,MEN-Tees_Tanks-id_00002249-13_1_front.jpg,"The upper clothing has long sleeves, cotton fa...",MEN,"[0.0729953, -0.47449693, -0.19418849, 0.024725..."
3,WOMEN-Tees_Tanks-id_00001071-10_7_additional.jpg,This lady is wearing a tank tank top with soli...,WOM,"[0.034215357, -0.6766614, -0.036036164, -0.108..."
4,MEN-Tees_Tanks-id_00001021-06_1_front.jpg,This person wears a short-sleeve shirt with co...,MEN,"[0.16191122, -0.1928788, -0.038948882, -0.2593..."
...,...,...,...,...
995,WOMEN-Tees_Tanks-id_00001043-01_1_front.jpg,This woman is wearing a long-sleeve sweater wi...,WOM,"[-0.19599353, -0.35992762, -0.096293814, -0.12..."
996,WOMEN-Sweaters-id_00007927-04_7_additional.jpg,"The sweater the person wears has long sleeves,...",WOM,"[-0.15251428, -0.28481779, 0.11188296, -0.0721..."
997,WOMEN-Graphic_Tees-id_00002835-01_1_front.jpg,This female is wearing a short-sleeve T-shirt ...,WOM,"[0.22766016, -0.37010336, -0.13829505, -0.1590..."
998,WOMEN-Rompers_Jumpsuits-id_00006408-03_3_back.jpg,The female is wearing a sleeveless tank top wi...,WOM,"[0.0015822111, -0.48375535, -0.1858173, 0.0397..."


In [None]:
# Export the DataFrame, embeddings included, as JSON so that the Flask
# application can reuse it later.
df.to_json(path_or_buf='/drive/My Drive/data_clothes/df_clotes_embeded.json')