# 1. Import data 
## 1.1. Import libraries

In [None]:
!unzip downscaled_images_raw.zip
!unzip downscaled_images_structure.zip
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.image import ImageDataGenerator
from keras import applications




## 1.2. Import CSV file


In [52]:
# Load the dataset into a Pandas dataframe
df = pd.read_csv('dataset.csv')

# Make 2 copy of current dataframe for each approach
df_cb = df.copy()
df_dl = df.copy()

mini_df = df[(df['article_id'] > 290000000) & (df['article_id'] < 520000000)]

# 1.3. General Data description

In [53]:
# Check for null values
df.isnull().sum()

article_id                        0
product_code                      0
prod_name                         0
product_type_no                   0
product_type_name                 0
product_group_name                0
graphical_appearance_no           0
graphical_appearance_name         0
colour_group_code                 0
colour_group_name                 0
perceived_colour_value_id         0
perceived_colour_value_name       0
perceived_colour_master_id        0
perceived_colour_master_name      0
department_no                     0
department_name                   0
index_code                        0
index_name                        0
index_group_no                    0
index_group_name                  0
section_no                        0
section_name                      0
garment_group_no                  0
garment_group_name                0
detail_desc                     416
dtype: int64

In [54]:
# Check for dataframe columns type
mini_df.dtypes

article_id                       int64
product_code                     int64
prod_name                       object
product_type_no                  int64
product_type_name               object
product_group_name              object
graphical_appearance_no          int64
graphical_appearance_name       object
colour_group_code                int64
colour_group_name               object
perceived_colour_value_id        int64
perceived_colour_value_name     object
perceived_colour_master_id       int64
perceived_colour_master_name    object
department_no                    int64
department_name                 object
index_code                      object
index_name                      object
index_group_no                   int64
index_group_name                object
section_no                       int64
section_name                    object
garment_group_no                 int64
garment_group_name              object
detail_desc                     object
dtype: object

In [55]:
mini_df["index_group_name"].replace({"Baby/Children": "Children"}, inplace=True)
mini_df.value_counts("index_group_name")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_df["index_group_name"].replace({"Baby/Children": "Children"}, inplace=True)


index_group_name
Children      2281
Ladieswear    2128
Menswear      1379
Divided       1217
Sport          400
dtype: int64

In [56]:
df_cb.describe()

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_group_no,section_no,garment_group_no
count,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0,105542.0
mean,698424600.0,698424.563378,234.861875,1009515.0,32.233822,3.206183,7.807972,4532.777833,3.171534,42.664219,1010.43829
std,128462400.0,128462.384432,75.049308,22413.59,28.086154,1.563839,5.376727,2712.692011,4.353234,23.260105,6.731023
min,108775000.0,108775.0,-1.0,-1.0,-1.0,-1.0,-1.0,1201.0,1.0,2.0,1001.0
25%,616992500.0,616992.5,252.0,1010008.0,9.0,2.0,4.0,1676.0,1.0,20.0,1005.0
50%,702213000.0,702213.0,259.0,1010016.0,14.0,4.0,5.0,4222.0,2.0,46.0,1009.0
75%,796703000.0,796703.0,272.0,1010016.0,52.0,4.0,11.0,7389.0,4.0,61.0,1017.0
max,959461000.0,959461.0,762.0,1010029.0,93.0,7.0,20.0,9989.0,26.0,97.0,1025.0


This table contains all h&m articles with details such as a type of product, a color, a product group and other features.
Article data description:

- __article_id__ : A unique identifier of every article.
- __product_code__, __prod_name__ : A unique identifier of every product and its name (not the same).
- __product_type__, __product_type_name__ : The group of product_code and its name
- __graphical_appearance_no__, __graphical_appearance_name__ : The group of graphics and its name
- __colour_group_code__, __colour_group_name__ : The group of color and its name
- __perceived_colour_value_id__, __perceived_colour_value_name__, __perceived_colour_master_id__, __perceived_colour_master_name__ : The added color info
- __department_no__, __department_name__: : A unique identifier of every dep and its name
- __index_code__, __index_name__: : A unique identifier of every index and its name
- __index_group_no__, __index_group_name__: : A group of indeces and its name
- __section_no__, __section_name__: : A unique identifier of every section and its name
- __garment_group_no__, __garment_group_name__: : A unique identifier of every garment and its name
- __detail_desc__: : Details

# 2. Data Cleaning

## Resize and structure images into correct folder

In [61]:
# image downscale
import os
import cv2

# Set the input directory
input_dir = 'downscaled_images_raw/'

# Set the output directory
output_dir = 'downscaled_images_structure/'

# Get a list of the images in the input directory
image_folders = os.listdir(input_dir)
print

# Iterate over the images
for folder in image_folders:
    # Load the image
    image_filenames = os.listdir(input_dir+folder)
    for filename in image_filenames:
        image = cv2.imread(input_dir + folder + "/" + filename)
        
        folder_name = mini_df[df['article_id'] == int(
            filename[0:10])]["index_group_name"].values[0]
        # Downscale the image using linear interpolation
        downscaled_image = cv2.resize(
            image, (125, 125), interpolation=cv2.INTER_LINEAR)

        # Save the downscaled image to a file
        cv2.imwrite(output_dir+folder+"/" + filename, downscaled_image)
    


# 3. Recommender systems

## Approach 1: Content-based

### Feature Engineeering

In [58]:
# Define the ResNet50 model


import datetime


img_width, img_height = 25, 25

# top_model_weights_path = 'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
train_data_dir = "downscaled_images/"

nb_train_samples = 1253
epochs = 50
batch_size = 1


def extract_features():
    Itemcodes = []
    model = applications.ResNet50(
        include_top=False, weights='imagenet')
    datagen = ImageDataGenerator(rescale=1. / 255)
    generator = datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)
    for i in generator.filenames:
        Itemcodes.append(i[(i.find("/")+1):i.find(".")])
    extracted_features = model.predict(
        generator, nb_train_samples // batch_size)

    np.save(open('./ResNet_features.npy', 'wb'), extracted_features)
    np.save(open('./ResNet_feature_product_ids.npy', 'wb'),
            np.array(Itemcodes))


a = datetime.now()
# extract_features()


**Approach 2: Deep Learning**

### Encoding categorical value

### Embedding

In [59]:
article_ids = df["article_id"].tolist()

tokenizer = Tokenizer(num_words=vocab_size)
sequences = tokenizer.texts_to_sequences(prod_names)

# Create an embedding matrix with random weights
embedding_matrix = np.random.rand(vocab_size, embedding_dim)

# Define the model
model = Sequential()

# Add an embedding layer with the specified weights
prod_name_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix])(sequences)

NameError: name 'vocab_size' is not defined