# Loading libraries and data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import cv2
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from os import listdir
from os.path import isfile, join

from termcolor import colored
from IPython.display import HTML

import warnings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('float_format', '{:f}'.format)
warnings.filterwarnings('ignore')

In [None]:
# Read the three competition CSV files from the Kaggle input directory:
# article metadata, customer metadata and the training transactions.
articles = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
customers = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/customers.csv")
transactions = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [None]:
#images_dir = '../input/h-and-m-personalized-fashion-recommendations/images'
#cat_images = [f for f in listdir(images_dir)]

# Basic Information on datasets

In [None]:
# Report the (rows, columns) shape of every loaded table, highlighted in yellow.
for table_label, table in (
    ("ARTICLES", articles),
    ("CUSTOMERS", customers),
    ("TRANSACTIONS", transactions),
):
    print(f"Number of observations in {table_label}: {colored(table.shape, 'yellow')}")

In [None]:
# This code was borrowed from https://www.kaggle.com/ishandutta/v7-shopee-indepth-eda-one-stop-for-all-your-needs
def getImagePaths(path):
    """
    Collect the full path of every file found underneath a directory.

    parameters: path(string) - Root directory, walked recursively
    returns: image_names(list of string) - One joined path per file, in os.walk order
    """
    return [
        os.path.join(folder, file_name)
        for folder, _subfolders, file_names in os.walk(path)
        for file_name in file_names
    ]

def display_multiple_img(images_paths, rows, cols):
    """
    Function to Display Images from Dataset.

    At most rows * cols images are shown: paths beyond the size of the grid
    are ignored, and files that OpenCV cannot read are skipped, leaving their
    slot in the grid blank.

    parameters: images_paths(list of string) - Paths of Images to be displayed
                rows(int) - No. of Rows in Output
                cols(int) - No. of Columns in Output
    """
    # squeeze=False always yields a 2-D array of axes, so a 1x1 grid works too.
    figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 8), squeeze=False)
    axes = ax.ravel()
    for axis in axes:
        # Hide ticks and frames everywhere so unused slots stay blank.
        axis.set_axis_off()
    # zip stops at the shorter sequence: surplus images are never read from disk.
    for axis, image_path in zip(axes, images_paths):
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing/undecodable file by returning None.
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        axis.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.tight_layout()
    plt.show()

def plot_distribution(x, data, title):
    """
    Show an 800x500 Plotly histogram.

    parameters: x - Passed to px.histogram as `x` (presumably a column name of `data`)
                data - Passed to px.histogram as the data source
                title(string) - Title of the chart
    """
    histogram_options = dict(x=x, width=800, height=500, title=title)
    px.histogram(data, **histogram_options).show()
def disply_multiple_img_ids(idx, rows, cols, images_root=None):
    """
    Display the images belonging to the given article ids in a grid.

    At most rows * cols images are shown; ids whose image file is missing or
    unreadable are skipped and their slot in the grid stays blank.

    parameters: idx(iterable) - Article ids (int or string) to display
                rows(int) - No. of Rows in Output
                cols(int) - No. of Columns in Output
                images_root(string) - Directory holding the image sub-folders.
                    When omitted, the notebook-level `images_dir` is used if it
                    is defined, otherwise the Kaggle dataset image folder.
    """
    if images_root is None:
        # Fall back gracefully: `images_dir` may not exist on a fresh kernel.
        images_root = globals().get(
            'images_dir',
            '../input/h-and-m-personalized-fashion-recommendations/images',
        )
    # squeeze=False always yields a 2-D array of axes, so a 1x1 grid works too.
    figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 8), squeeze=False)
    axes = ax.ravel()
    for axis in axes:
        axis.set_axis_off()
    for axis, im_id in zip(axes, idx):
        # Pad the id to 10 digits; the sub-folder is its first three digits.
        # For 9-digit integer ids this is the same path as '0' + id.
        padded_id = str(im_id).zfill(10)
        image_path = f'{images_root}/{padded_id[:3]}/{padded_id}.jpg'

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing/undecodable file by returning None.
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        axis.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.tight_layout()
    plt.show()


In [None]:
#images_path = getImagePaths(images_dir)

In [None]:
#print(f"There are {colored(len(images_path), 'yellow')} number of images in dataset")

In [None]:
#display_multiple_img(images_path[50:100], 5, 5)

# QUICK LOOK INTO DATA

## A. ARTICLES

**<span style="color:#023e8a;"> This table contains all h&m articles with details such as a type of product, a color, a product group and other features.</span>**  
**<span style="color:#023e8a;"> Article data description: </span>**
  
- 105542 rows and 25 columns  
- No nulls apart from detail_desc  
- 11 int and 14 obj types are present  
> `article_id` **<span style="color:#023e8a;">: A unique identifier of every article.</span>**  
>  - The primary column  
  
> `product_code`, `prod_name` **<span style="color:#023e8a;">: A unique identifier of every product and its name (not the same).</span>**  
>  - 47224 unique product_code  
>  - product_code and article id are highly correlated  
>  - 45875 unique prod_name. **Different from product_code**. Which one to use?  
>  - No specific dominant levels in prod_name  
  
> `product_type`, `product_type_name` **<span style="color:#023e8a;">: The group of product_code and its name</span>**  
>  - 132 unique product types, but 131 unique product names  
>  - dominant levels are present in both. first 8 form ~80% of total data  
  
> `product_group_name`: 19 unique values. Highly dominant levels are present.  
  
> `graphical_appearance_no`, `graphical_appearance_name` **<span style="color:#023e8a;">: The group of graphics and its name</span>**  
>  - both have 30 unique values. 1-1 mapping  
>  - highly dominant levels present  
  
> `colour_group_code`, `colour_group_name` **<span style="color:#023e8a;">: The group of color and its name</span>**  
>  - both 50 unique values. 1-1 mapping  
>  - mildly dominant levels  
  
> `perceived_colour_value_id`, `perceived_colour_value_name`, `perceived_colour_master_id`, `perceived_colour_master_name` **<span style="color:#023e8a;">: The added color info</span>**    
>  - only 8 levels in both
  
> `department_no`, `department_name` **<span style="color:#023e8a;">: A unique identifier of every department and its name</span>**  
>  - 299 unique department_no, 250 unique department_name. **Not matching**  
>  - no dominant levels.   
  
> `index_code`, `index_name` **<span style="color:#023e8a;">: A unique identifier of every index and its name</span>**  
>  - 10 levels in both 1-1 mapping.  
>  - obviously dominant  
  
> `index_group_no`, `index_group_name` **<span style="color:#023e8a;">: A group of indices and its name</span>**  
>  - 5 levels in both  
>  - obviously dominant  
  
> `section_no`, `section_name` **<span style="color:#023e8a;">: A unique identifier of every section and its name</span>**  
>  - 57 in section_no, 56 in section_name. **Not a one-to-one mapping**  
>  - Non dominant  
  
> `garment_group_no`, `garment_group_name` **<span style="color:#023e8a;">: A unique identifier of every garment group and its name</span>**  
>  - 21 in both levels. 1-1 mapping  
>  - some dominant levels  
  
> `detail_desc` **<span style="color:#023e8a;">: Details</span>**  
>  - All unique descriptions. many are nulls. Not sure how helpful it'll be

In [None]:
articles.shape

In [None]:
articles.info()

In [None]:
articles.describe()

In [None]:
articles.isnull().sum()

detail_desc is the only column with nulls

In [None]:
articles.nunique()

In [None]:
# Pearson correlation is only defined for numeric columns. Selecting them
# explicitly keeps this cell working on pandas >= 2.0, where DataFrame.corr()
# raises on object columns instead of dropping them silently.
corr = articles.select_dtypes(include='number').corr()
plt.figure(figsize=(16,6))
# Trailing semicolon hides the Axes repr under the figure.
sns.heatmap(corr,xticklabels=corr.columns,yticklabels=corr.columns,cmap='Greens',annot=True);

- `index_group_no` and `section_no` are correlated with `department_no`
- `garment_group_no` is correlated with `section_no`
- the rest seem independent

## Subset only columns of interest

In [None]:
# Keep the article id plus the name/description columns only.
# .copy() makes articles_sub an independent frame, so adding columns to it
# later never writes into a temporary slice of `articles`
# (the cause of SettingWithCopyWarning).
articles_sub = articles[['article_id','prod_name','product_type_name','product_group_name','graphical_appearance_name','colour_group_name'
                         ,'perceived_colour_value_name','perceived_colour_master_name','department_name','index_name','index_group_name'
                         ,'section_name','garment_group_name','detail_desc']].copy()
articles_sub.shape

In [None]:
conda install -c conda-forge dython

In [None]:
from dython.nominal import theils_u

In [None]:
# Pairwise Theil's U between all columns of articles_sub.
# The first column ('columns') names the row; the cell at [row y, column x]
# holds theils_u(articles_sub[x], articles_sub[y]).
feature_names = list(articles_sub.columns)
corr = pd.DataFrame(columns=['columns'] + feature_names)
corr['columns'] = articles_sub.columns

for col_pos in range(1, corr.shape[1]):
    x = corr.columns[col_pos]
    for row_pos in range(corr.shape[0]):
        y = corr.iloc[row_pos, 0]
        corr.iloc[row_pos, col_pos] = theils_u(articles_sub[x], articles_sub[y])
        


In [None]:
corr

In [None]:
articles_sub.dtypes

In [None]:
articles_sub.columns

## Create non-overlapping information from all columns

In [None]:
import re

# Try the clean-up pattern (runs of non-alphanumerics -> '*') on one sample
# product name, so that `s` is defined when the notebook runs top to bottom.
# NOTE(review): any sample string would do here - confirm the intended input.
s = str(articles_sub['prod_name'].iloc[0])
s = re.sub('[^0-9a-zA-Z]+', '*', s)

# DataFrame.replace returns a new frame (articles_sub itself is unchanged);
# preview a few rows instead of rendering the whole table.
articles_sub.replace(to_replace ='[nN]ew', value = 'New_', regex = True).head()

In [None]:
cols = ['prod_name', 'product_type_name', 'product_group_name',
       'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name', 'perceived_colour_master_name',
       'department_name', 'index_name', 'index_group_name', 'section_name',
       'garment_group_name', 'detail_desc']

# Join the listed columns of every row into one '_'-separated string.
# str() renders a missing value as the text 'nan'. Iterating plain tuples
# avoids building a Series per row, which DataFrame.apply(axis=1) does.
combined_text = [
    '_'.join(map(str, row_values))
    for row_values in articles_sub[cols].itertuples(index=False, name=None)
]

# assign() returns a new frame: the cell can be re-run safely and never writes
# into a slice of another DataFrame.
articles_sub = articles_sub.assign(combined=combined_text)

In [None]:
articles_sub.head()

### Understanding each variable

In [None]:
def distribution_plot(data, variable, top_n=8):
    """
    Bar-plot the percentage share of the most frequent levels of one column.

    The `top_n` most frequent levels each get their own bar. When more than
    one level is left over, the remainder is pooled into a single 'Others'
    bar. Percentages are relative to all rows of `data`; missing values are
    not counted as a level, so the bars can sum to less than 100.

    parameters: data(DataFrame) - Table containing the column
                variable(string) - Name of the column to summarise
                top_n(int) - No. of individual levels shown before pooling
    """
    # value_counts() is already ordered from most to least frequent.
    shares = data[variable].value_counts() / len(data.index) * 100
    res = pd.DataFrame({
        # Labels are stored as text so the 'Others' row never mixes types.
        'LEVELS': shares.index.astype(str),
        'PERCENTAGE': shares.to_numpy(),
    })

    # Pool only when that merges at least two levels; a single trailing level
    # keeps its own name instead of being relabelled 'Others'.
    if len(res) > top_n + 1:
        others = pd.DataFrame({
            'LEVELS': ['Others'],
            'PERCENTAGE': [res['PERCENTAGE'].iloc[top_n:].sum()],
        })
        res = pd.concat([res.iloc[:top_n], others], ignore_index=True)

    figure, axis = plt.subplots(1, 1, figsize=(10, 5))
    res.plot.bar(x='LEVELS', ax=axis)
    axis.set_title(variable)
    plt.show()

In [None]:
# One distribution chart per column of the articles table.
for column_name in articles.columns:
    distribution_plot(articles, column_name)

# Customers Dataset

**<span style="color:#023e8a;"> Customers data description: </span>**

> `customer_id` **<span style="color:#023e8a;">: A unique identifier of every customer</span>**  
> `FN` **<span style="color:#023e8a;">: 1 or missing </span>**  
> `Active` **<span style="color:#023e8a;">: 1 or missing</span>**  
> `club_member_status` **<span style="color:#023e8a;">: Status in club</span>**  
> `fashion_news_frequency` **<span style="color:#023e8a;">: How often H&M may send news to customer</span>**  
> `age` **<span style="color:#023e8a;">: The current age</span>**  
> `postal_code` **<span style="color:#023e8a;">: Postal code of customer</span>**  

In [None]:
customers.columns

In [None]:
customers.head(5)

In [None]:
customers.shape

In [None]:
customers.info()

In [None]:
customers.describe()

In [None]:
customers.isnull().sum()

In [None]:
customers.nunique()

In [None]:
# Pearson correlation is only defined for numeric columns. Selecting them
# explicitly keeps this cell working on pandas >= 2.0, where DataFrame.corr()
# raises on object columns instead of dropping them silently.
corr = customers.select_dtypes(include='number').corr()
plt.figure(figsize=(16,6))
# Trailing semicolon hides the Axes repr under the figure.
sns.heatmap(corr,xticklabels=corr.columns,yticklabels=corr.columns,cmap='Greens',annot=True);

In [None]:
def distribution_plot(data, variable, top_n=8):
    """
    Bar-plot the percentage share of the most frequent levels of one column.

    The `top_n` most frequent levels each get their own bar. When more than
    one level is left over, the remainder is pooled into a single 'Others'
    bar. Percentages are relative to all rows of `data`; missing values are
    not counted as a level, so the bars can sum to less than 100.

    parameters: data(DataFrame) - Table containing the column
                variable(string) - Name of the column to summarise
                top_n(int) - No. of individual levels shown before pooling
    """
    # value_counts() is already ordered from most to least frequent.
    shares = data[variable].value_counts() / len(data.index) * 100
    res = pd.DataFrame({
        # Labels are stored as text so the 'Others' row never mixes types.
        'LEVELS': shares.index.astype(str),
        'PERCENTAGE': shares.to_numpy(),
    })

    # Pool only when that merges at least two levels; a single trailing level
    # keeps its own name instead of being relabelled 'Others'.
    if len(res) > top_n + 1:
        others = pd.DataFrame({
            'LEVELS': ['Others'],
            'PERCENTAGE': [res['PERCENTAGE'].iloc[top_n:].sum()],
        })
        res = pd.concat([res.iloc[:top_n], others], ignore_index=True)

    figure, axis = plt.subplots(1, 1, figsize=(10, 5))
    res.plot.bar(x='LEVELS', ax=axis)
    axis.set_title(variable)
    plt.show()

In [None]:
# customers.columns[0] is the single string 'customer_id'; looping over it
# walks its characters ('c', 'u', ...) and fails with KeyError. Loop over the
# column names instead, skipping that first identifier column: it is unique
# per customer, so its distribution carries no information.
for column_name in customers.columns[1:]:
    distribution_plot(customers, column_name)

In [None]:
customers.columns[0]

In [None]:
# display.max_rows is set to None in this notebook, so preview a few rows
# rather than rendering the complete column.
customers[['customer_id']].head()