## Imports

In [2]:
import pandas as pd

## Loading Data

Read JSON files containing features and labels for the training data and a test set.

In [3]:
# Root folder of the datathon download. Every dataset path below is built from
# it, so relocating the data only requires changing this one value.
DATA_DIR = './datathon_phase_2_data'

# lines=True makes pandas parse each line of the file as a separate JSON record.
train_features = pd.read_json(f'{DATA_DIR}/training_data/train.features', lines=True)
train_labels = pd.read_json(f'{DATA_DIR}/training_data/train.labels', lines=True)
test_features_set1 = pd.read_json(f'{DATA_DIR}/test_data/phase_2_test_set1.features', lines=True)

## Checking Number of Classes in Each Label
This section counts the number of unique values for each label in the training data.

In [5]:
# Number of distinct supergroup classes (missing values are not counted).
train_labels['supergroup'].nunique()

32

In [6]:
# Number of distinct group classes (missing values are not counted).
train_labels['group'].nunique()

228

In [7]:
# Number of distinct module classes (missing values are not counted).
train_labels['module'].nunique()

449

In [8]:
# Number of distinct brand classes (missing values are not counted).
train_labels['brand'].nunique()

5679

| Labels | No. of Classes |
| -------- | ------- |
| supergroup | 32 |
| group | 228 |
| module | 449 |
| brand | 5679 |

## Checking Label Relationships

This section checks if each sub-label is associated with a single parent label.

In [9]:
# Distinct group labels in order of first appearance (kept so the name stays
# defined for any other cell that uses it).
groups = train_labels['group'].unique()

# A single grouped pass collects the distinct supergroups seen with each group,
# instead of re-filtering the whole frame once per group. sort=False keeps the
# groups in order of first appearance; rows whose group is missing are left out.
supergroups_by_group = train_labels.groupby('group', sort=False)['supergroup'].unique()

# Report only the groups that are not tied to exactly one supergroup.
for group, classes in supergroups_by_group.items():
    n_classes = len(classes)
    if n_classes != 1:
        print(group, "has", n_classes, "supergroups which are", classes)

In [10]:
# Distinct module labels in order of first appearance (kept so the name stays
# defined for any other cell that uses it).
modules = train_labels['module'].unique()

# A single grouped pass collects the distinct groups seen with each module,
# instead of re-filtering the whole frame once per module. sort=False keeps the
# modules in order of first appearance; rows whose module is missing are left out.
groups_by_module = train_labels.groupby('module', sort=False)['group'].unique()

# Report only the modules that are not tied to exactly one group.
for module, classes in groups_by_module.items():
    n_classes = len(classes)
    if n_classes != 1:
        print(module, "has", n_classes, "groups which are", classes)

In [11]:
# Distinct brand labels in order of first appearance (kept so the name stays
# defined for any other cell that uses it).
brands = train_labels['brand'].unique()

# A single grouped pass collects the distinct modules seen with each brand,
# instead of re-filtering the whole frame once per brand (thousands of full
# scans). sort=False keeps the brands in order of first appearance; rows whose
# brand is missing are left out.
modules_by_brand = train_labels.groupby('brand', sort=False)['module'].unique()

# Report only the brands that are not tied to exactly one module.
for brand, classes in modules_by_brand.items():
    n_classes = len(classes)
    if n_classes != 1:
        print(brand, "has", n_classes, "modules which are", classes)

receipt all has 16 modules which are ['automotive' 'clothing & personal accessories'
 'computing & telecommunications' 'cooking appliances & cooking ware'
 'garden & flora' 'home appliances' 'home do it yourself'
 'home entertainment' 'home furnishings & decor' 'homecare merchandise'
 'kitchen & tableware' 'optical' 'photo' 'sport & leisure'
 'stationery & printed material & services' 'toys']
huggies has 3 modules which are ['baby care' 'baby diapers' 'skin cleansing & toning']
nourify has 94 modules which are ['baby care' 'baby feeding accessories' 'baby soothing hardware'
 'baby diapers' 'breast feeding accessories'
 'flavoured drinks carbonated non cola' 'juices'
 'mineral water non carbonated' 'chocolate single variety'
 'chocolate novelties' 'sugar candy' 'biscuits sweet ambient'
 'cosmetic accessories' 'cosmetic combination packs & gift sets'
 'cosmetic removal' 'eye cosmetics' 'face cosmetics' 'fragrances cologne'
 'nail care' 'nail polish & decoration'
 'fruit & nut combination

In [12]:
# Distinct brand labels in order of first appearance (kept so the name stays
# defined for any other cell that uses it).
brands = train_labels['brand'].unique()

# A single grouped pass collects the distinct groups seen with each brand,
# instead of re-filtering the whole frame once per brand (thousands of full
# scans). sort=False keeps the brands in order of first appearance; rows whose
# brand is missing are left out.
groups_by_brand = train_labels.groupby('brand', sort=False)['group'].unique()

# Report only the brands that are not tied to exactly one group.
for brand, classes in groups_by_brand.items():
    n_classes = len(classes)
    if n_classes != 1:
        print(brand, "has", n_classes, "groups which are", classes)

receipt all has 16 groups which are ['automotive detail unknown total'
 'clothing & personal accessories detail unknown total'
 'computing & telecommunications detail unknown total'
 'cooking appliances & cooking ware detail unknown total'
 'garden & flora detail unknown total'
 'home appliances detail unknown total'
 'home do it yourself detail unknown total'
 'home entertainment detail unknown total'
 'home furnishings & decor detail unknown total'
 'homecare merchandise detail unknown total'
 'kitchen & tableware detail unknown total' 'optical detail unknown total'
 'photo detail unknown total' 'sport & leisure detail unknown total'
 'stationery & printed material & services detail unknown total'
 'toys detail unknown total']
huggies has 3 groups which are ['baby care detail unknown total' 'baby personal hygiene'
 'skin cleansing & toning']
nourify has 58 groups which are ['baby care detail unknown total' 'baby feeding accessories'
 'baby hardware' 'baby personal hygiene' 'breast fe

In [13]:
# Distinct brand labels in order of first appearance (kept so the name stays
# defined for any other cell that uses it).
brands = train_labels['brand'].unique()

# A single grouped pass collects the distinct supergroups seen with each brand,
# instead of re-filtering the whole frame once per brand (thousands of full
# scans). sort=False keeps the brands in order of first appearance; rows whose
# brand is missing are left out.
supergroups_by_brand = train_labels.groupby('brand', sort=False)['supergroup'].unique()

# Report only the brands that are not tied to exactly one supergroup.
for brand, classes in supergroups_by_brand.items():
    n_classes = len(classes)
    if n_classes != 1:
        print(brand, "has", n_classes, "supergroups which are", classes)

receipt all has 16 supergroups which are ['automotive' 'clothing & personal accessories'
 'computing & telecommunications' 'cooking appliances & cooking ware'
 'garden & flora' 'home appliances' 'home do it yourself'
 'home entertainment' 'home furnishings & decor' 'homecare merchandise'
 'kitchen & tableware' 'optical' 'photo' 'sport & leisure'
 'stationery & printed material & services' 'toys']
huggies has 2 supergroups which are ['baby care' 'personal care']
nourify has 9 supergroups which are ['baby care' 'beverages non alcoholic' 'biscuits & confectionery & snacks'
 'cosmetics & fragrances' 'food ambient' 'food perishable' 'healthcare'
 'homecare' 'personal care']
pure baby has 2 supergroups which are ['baby care' 'personal care']
verdemart has 14 supergroups which are ['baby care' 'beverages alcoholic' 'beverages non alcoholic'
 'biscuits & confectionery & snacks' 'cosmetics & fragrances'
 'food ambient' 'food frozen' 'food perishable' 'healthcare' 'homecare'
 'kitchen & tablewar

## Inference about Label Relationships

1. **Supergroup**
   - Relationship: One-to-Many
   - Related Label: Group

2. **Group**
   - Relationship: One-to-Many
   - Related Label: Module

3. **Module**
   - Relationship: Many-to-Many
   - Related Label: Brand

### Summary of Relationships
- A **Supergroup** can contain multiple **Groups**.
- Each **Group** can consist of multiple **Modules**.
- **Modules** can be associated with multiple **Brands**, and each **Brand** can relate to multiple **Modules** (and, consequently, to multiple **Groups** and **Supergroups** as well).

In [14]:
# Preview the feature table; the notebook shows only the first and last rows.
train_features

Unnamed: 0,indoml_id,description,retailer,price
0,0,1 adblue,organicorner,25.35
1,1,1 car mat set,greenharbor,4.99
2,2,1 cp rmx scrnwash,naturify,3.85
3,3,1 diesel,ecogro,4.41
4,4,1 unstoppable refrsher,greenharbor,3.00
...,...,...,...,...
561833,561833,zuru xshot excelxcess,noshify,16.99
561834,561834,zuru xshot micro,vitalveg,3.50
561835,561835,zuru xshot typhoon thunder,crispcorner,8.50
561836,561836,zzand,snackify,4.79


## Additional Analysis

### Number of Unique Retailers

In [15]:
# Number of distinct retailer values; dropna=False keeps a missing value counted
# as its own entry, exactly as len(unique()) would.
train_features['retailer'].nunique(dropna=False)

63

There are only `63` unique retailers in the training dataset.

### Number of Unique Label Combinations

In [16]:
# Drop the row identifier so that rows are compared on the remaining columns
# only, then collapse repeated combinations.
unique_rows = (
    train_labels
    .drop(columns='indoml_id')
    .drop_duplicates()
)

# Each surviving row is one distinct combination.
num_unique_rows = len(unique_rows)

print(f"Number of unique rows: {num_unique_rows}")

Number of unique rows: 12885


So, there are `12885` unique combinations of labels, which is around `2%` of the entire set of training samples.

### Number of Unique Feature-Label Combinations

In [17]:
# Attach the labels to their feature rows via the shared identifier.
train_features_labels = train_features.merge(train_labels, on='indoml_id')

# Ignore the identifier, retailer and price so that duplicates are judged on the
# remaining feature and label columns only.
unique_rows_wlabels = (
    train_features_labels
    .drop(columns=['indoml_id', 'retailer', 'price'])
    .drop_duplicates()
)

num_unique_rows = len(unique_rows_wlabels)

print(f"Number of unique rows: {num_unique_rows}")

Number of unique rows: 561838


There are `561838` unique combinations of train features and labels (ignoring `retailer` and `price`), which is the entire training size. This means that there are no repetitions in the training samples.