# Brands Table Data Exploration

In [1]:
import json
from datetime import datetime, timedelta

import pandas as pd

In [3]:
# Function to convert Id and timestamp structures to Python objects
def convert_objects(row):
    """Unwrap ``{"$oid": ...}`` and ``{"$date": ...}`` wrapper dicts in a record.

    A value that is a dict containing ``"$oid"`` is replaced by the wrapped id.
    A value that is a dict containing ``"$date"`` (milliseconds since the Unix
    epoch) is replaced by a naive ``datetime`` expressed in UTC. Every other
    value, including dicts without either key, is left untouched.

    Parameters
    ----------
    row : dict or pandas.Series
        Mapping of field name to value. It is updated in place.

    Returns
    -------
    The same ``row`` object, so the function can be used with
    ``DataFrame.apply(..., axis=1)``.
    """
    # Iterate over a snapshot so assigning into `row` cannot interfere with
    # the iteration itself.
    for key, value in list(row.items()):
        if not isinstance(value, dict):
            continue
        if "$oid" in value:
            row[key] = value["$oid"]
        elif "$date" in value:
            # Offset from the epoch instead of datetime.fromtimestamp(): that
            # call converts to the machine's local time zone, so the same file
            # produced different datetimes on different machines. Integer
            # millisecond arithmetic also avoids float rounding.
            row[key] = datetime(1970, 1, 1) + timedelta(milliseconds=value["$date"])
    return row

In [4]:
# Load the line-delimited brands file, then pass every row through
# convert_objects to unwrap the {"$oid": ...} / {"$date": ...} wrappers.
brands_df = (
    pd.read_json('brands.json', lines=True)
    .apply(convert_objects, axis=1)
)
brands_df.head()

Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,0.0,
1,601c5460be37ce2ead43755f,511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,0.0,STARBUCKS
2,601ac142be37ce2ead43755d,511111819905,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176
3,601ac142be37ce2ead43755a,511111519874,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051
4,601ac142be37ce2ead43755e,511111319917,Candy & Sweets,CANDY_AND_SWEETS,"{'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, ...",test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827


In [5]:
# Flatten the nested `cpg` dicts into two flat columns (cpg_ref, cpg_id) and
# attach them to the brands table in place of the original `cpg` column.
cpg = pd.json_normalize(brands_df['cpg'].tolist())

# The id is stored as {'$id': {'$oid': ...}}, which json_normalize flattens to a
# column named '$id.$oid'. Label the columns by name instead of by position:
# json_normalize places flattened nested keys after the top-level ones, so a
# positional `cpg.columns = [...]` silently mislabels the data if that order
# ever differs.
ref_columns = [col for col in cpg.columns if col != '$id.$oid']
assert '$id.$oid' in cpg.columns and len(ref_columns) == 1, (
    f"unexpected cpg columns: {list(cpg.columns)}"
)
cpg = cpg.rename(columns={'$id.$oid': 'cpg_id', ref_columns[0]: 'cpg_ref'})
cpg = cpg[['cpg_ref', 'cpg_id']]

# Reuse the brands index so the column-wise concat lines up row for row.
cpg.index = brands_df.index
brands_df = pd.concat([brands_df.drop(columns=['cpg']), cpg], axis=1)
brands_df.head()

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_ref,cpg_id
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,test brand @1612366101024,0.0,,Cogs,601ac114be37ce2ead437550
1,601c5460be37ce2ead43755f,511111519928,Beverages,BEVERAGES,Starbucks,0.0,STARBUCKS,Cogs,5332f5fbe4b03c9a25efd0ba
2,601ac142be37ce2ead43755d,511111819905,Baking,BAKING,test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176,Cogs,601ac142be37ce2ead437559
3,601ac142be37ce2ead43755a,511111519874,Baking,BAKING,test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051,Cogs,601ac142be37ce2ead437559
4,601ac142be37ce2ead43755e,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827,Cogs,5332fa12e4b03c9a25efd1e7


In [6]:
# Summarize the brands table: number of rows, plus the non-null count and
# dtype of every column.
brands_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   _id           1167 non-null   object 
 1   barcode       1167 non-null   int64  
 2   category      1012 non-null   object 
 3   categoryCode  517 non-null    object 
 4   name          1167 non-null   object 
 5   topBrand      555 non-null    float64
 6   brandCode     933 non-null    object 
 7   cpg_ref       1167 non-null   object 
 8   cpg_id        1167 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 82.2+ KB


In [7]:
# Count the missing (null) values in each column.
brands_df.isnull().sum()

_id               0
barcode           0
category        155
categoryCode    650
name              0
topBrand        612
brandCode       234
cpg_ref           0
cpg_id            0
dtype: int64

In [8]:
# Share of missing values in each column. The result is a fraction between
# 0 and 1; multiply by 100 to read it as a percentage.
brands_df.isnull().mean()

_id             0.000000
barcode         0.000000
category        0.132819
categoryCode    0.556984
name            0.000000
topBrand        0.524422
brandCode       0.200514
cpg_ref         0.000000
cpg_id          0.000000
dtype: float64

In [9]:
# Check for fully duplicated records: list every row that repeats an earlier
# row in all columns. An empty result means there are none.
brands_df[brands_df.duplicated()]

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_ref,cpg_id


In [10]:
# Fields that are expected to hold unique values: print the number of
# distinct values in each one, one per line, to compare with the row count.
for column in ("_id", "barcode", "name"):
    print(brands_df[column].nunique())

1167
1160
1156


In [11]:
# Find the barcodes that occur more than once, then show every record that
# carries one of them, ordered by barcode so the clashing rows sit together.
duplicated_barcodes = brands_df.loc[brands_df['barcode'].duplicated()]
(
    brands_df
    .loc[brands_df['barcode'].isin(duplicated_barcodes['barcode'])]
    .sort_values(by='barcode')
)

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_ref,cpg_id
467,5c409ab4cd244a3539b84162,511111004790,Baking,,alexa,1.0,ALEXA,Cogs,55b62995e4b0d8e685c14213
1071,5cdacd63166eb33eb7ce0fa8,511111004790,Condiments & Sauces,,Bitten Dressing,,BITTEN,Cogs,559c2234e4b06aca36af13c6
152,5c45f91b87ff3552f950f027,511111204923,Grocery,,Brand1,1.0,0987654321,Cogs,5c45f8b087ff3552f950f026
536,5d6027f46d5f3b23d1bc7906,511111204923,Snacks,,CHESTER'S,,CHESTERS,Cogs,5332f5fbe4b03c9a25efd0ba
20,5c4699f387ff3577e203ea29,511111305125,Baby,,Chris Image Test,,CHRISIMAGE,Cogs,55b62995e4b0d8e685c14213
651,5d642d65a3a018514994f42d,511111305125,Magazines,,Rachael Ray Everyday,,511111305125,Cogs,5d5d4fd16d5f3b23d1bc7905
129,5a7e0604e4b0aedb3b84afd3,511111504139,Beverages,,Chris Brand XYZ,,CHRISXYZ,Cogs,55b62995e4b0d8e685c14213
299,5a8c33f3e4b07f0a2dac8943,511111504139,Grocery,,Pace,0.0,PACE,Cogs,5a734034e4b0d58f376be874
9,5c408e8bcd244a1fdb47aee7,511111504788,Baking,,test,,TEST,Cogs,59ba6f1ce4b092b29c167346
412,5ccb2ece166eb31bbbadccbe,511111504788,Condiments & Sauces,,The Pioneer Woman,,PIONEER WOMAN,Cogs,559c2234e4b06aca36af13c6


In [12]:
# Find the brand names that occur more than once, then show every record that
# carries one of them, ordered by name so the clashing rows sit together.
duplicated_names = brands_df.loc[brands_df['name'].duplicated()]
(
    brands_df
    .loc[brands_df['name'].isin(duplicated_names['name'])]
    .sort_values(by='name')
)

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_ref,cpg_id
848,585a961fe4b03e62d1ce0e76,511111701781,Snacks,,Baken-Ets,1.0,BAKEN-ETS,Cogs,5332f5fbe4b03c9a25efd0ba
574,5d9d08d1a60b87376833e348,511111605546,Snacks,,Baken-Ets,,BAKEN ETS,Cogs,5332f5fbe4b03c9a25efd0ba
140,5a4d23dae4b0bcb2c74ea77e,511111000518,Beverages,,Caleb's Kola,0.0,CALEB'S KOLA,Cogs,5332f5fbe4b03c9a25efd0ba
740,5d601d74a3a018514994f422,511111004912,Snacks,,Caleb's Kola,,CALEBS KOLA,Cogs,53e10d6368abd3c7065097cc
1007,5d658ffa6d5f3b23d1bc7914,511111205227,,,Diabetic Living Magazine,,,Cogs,53e10d6368abd3c7065097cc
1006,5d66d597a3a018093ab34726,511111805298,Magazines,,Diabetic Living Magazine,,511111805298,Cogs,5d5d4fd16d5f3b23d1bc7905
1163,5dc1fca91dda2c0ad7da64ae,511111706328,Breakfast & Cereal,,Dippin Dots® Cereal,,DIPPIN DOTS CEREAL,Cogs,53e10d6368abd3c7065097cc
1081,5dc2d9d4a60b873d6b0666d2,511111206330,Breakfast & Cereal,,Dippin Dots® Cereal,,DIPPIN DOTS,Cogs,5332f5f3e4b03c9a25efd0ae
194,5d6415d5a3a018514994f429,511111605058,Magazines,,Health Magazine,,511111605058,Cogs,5d5d4fd16d5f3b23d1bc7905
596,5f298852be37ce7958c5952d,511111915287,Magazines,MAGAZINES,Health Magazine,,HEALTH,Cogs,5d66b9dcee7f2d201c7281cd


In [15]:
# Duplicated brandCodes: ignore the rows with a null brandCode, then keep the
# rows whose brandCode was already seen on an earlier row.
duplicated_brandcodes = (
    brands_df
    .loc[brands_df['brandCode'].notnull()]
    .loc[lambda frame: frame['brandCode'].duplicated()]
)
duplicated_brandcodes.head()

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_ref,cpg_id
153,58861c7d4e8d0d20bc42c4d6,511111601449,Snacks,,Jell-O Refrigerated Pudding & Gelatin,0.0,,Cogs,559c2234e4b06aca36af13c6
163,57ebc2ace4b0ac389136a346,511111801962,Deli,,P3,0.0,,Cogs,559c2234e4b06aca36af13c6
188,58b59989e4b0857c2ddb7255,511111400998,Beer Wine Spirits,,Redd's Wicked,0.0,,Cogs,5332f709e4b03c9a25efd0f1
234,58b5988ce4b0857c2ddb7252,511111301028,Beer Wine Spirits,,Henry's Hard Sparkling,0.0,,Cogs,5332f709e4b03c9a25efd0f1
236,57ebc125e4b0ac389136a33b,511111302063,Grocery,,Kraft Macaroni & Cheese,0.0,,Cogs,559c2234e4b06aca36af13c6


In [16]:
# Empty-string brandCodes are not real codes: drop them from the duplicate
# list, then show every brands record whose brandCode is still on that list.
duplicated_brandcodes = duplicated_brandcodes.loc[duplicated_brandcodes['brandCode'].ne('')]
brands_df.loc[brands_df['brandCode'].isin(duplicated_brandcodes['brandCode'])]

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg_ref,cpg_id
628,5bd2011f90fa074576779a17,511111704652,Baby,,Huggies,0.0,HUGGIES,Cogs,550b2565e4b001d5e9e4146f
1036,5db32879ee7f2d6de4248976,511111112938,Baby,BABY,GoodNites,1.0,GOODNITES,Cogs,55b62995e4b0d8e685c14213
1074,5c7d9cb395144c337a3cbfbb,511111707202,Baby,BABY,Huggies,1.0,HUGGIES,Cogs,5459429be4b0bfcb1e864082
1079,5bd200fc965c7d66d92731eb,511111204640,Baby,,Goodnites,0.0,GOODNITES,Cogs,550b2565e4b001d5e9e4146f


In [17]:
# Number of records whose brandCode is an empty string (as opposed to null).
int(brands_df['brandCode'].eq('').sum())

35

In [18]:
# Run the same empty-string check on the other columns: print, one per line,
# how many records hold '' in each of them.
for column in ('category', 'categoryCode', 'name', 'topBrand', 'cpg_ref'):
    print(len(brands_df[brands_df[column] == '']))

0
0
0
0
0


In [19]:
# Distribution of the topBrand flag. value_counts() leaves out missing values
# by default, so only rows where the flag is populated are counted.
brands_df['topBrand'].value_counts()

topBrand
0.0    524
1.0     31
Name: count, dtype: int64

In [20]:
# Number of brands per category, most frequent first. Rows with a null
# category are not counted (value_counts() default).
brands_df['category'].value_counts()

category
Baking                         369
Beer Wine Spirits               90
Snacks                          75
Candy & Sweets                  71
Beverages                       63
Magazines                       44
Health & Wellness               44
Breakfast & Cereal              40
Grocery                         39
Dairy                           33
Condiments & Sauces             27
Frozen                          24
Personal Care                   20
Baby                            18
Canned Goods & Soups            12
Beauty                           9
Cleaning & Home Improvement      6
Deli                             6
Beauty & Personal Care           6
Household                        5
Bread & Bakery                   5
Dairy & Refrigerated             5
Outdoor                          1
Name: count, dtype: int64

In [21]:
# Number of brands per categoryCode, most frequent first. Rows with a null
# categoryCode are not counted (value_counts() default).
brands_df['categoryCode'].value_counts()

categoryCode
BAKING                           359
CANDY_AND_SWEETS                  71
BEER_WINE_SPIRITS                 31
HEALTHY_AND_WELLNESS              14
GROCERY                           11
BABY                               7
CLEANING_AND_HOME_IMPROVEMENT      6
BREAD_AND_BAKERY                   5
DAIRY_AND_REFRIGERATED             5
PERSONAL_CARE                      4
BEVERAGES                          1
OUTDOOR                            1
MAGAZINES                          1
FROZEN                             1
Name: count, dtype: int64

### Data quality Issues: Brands Table

1. Null Values
    There's a large percentage of null values for the following fields
       
        category        13.3%
        categoryCode    55.7%
        topBrand        52.4%
        brandCode       20.1%    
        
    The large number of missing values can cause issues with analysis and marketing efforts.  
    categoryCode and brandCode are important to connect this table with other tables like receipt_items, which has a brandCode field to pull in brand information.  
    The majority of null values for categoryCode are from records that don't have category as 'Baking' or 'Candy & Sweets'.  
    topBrand is an important field for improving marketing efforts. Most of these null records would probably have an indicator of '0.0' if populated; however, we will miss out on marketing opportunities for any brands that should be flagged as a top brand.  
    
    Recommendations:
    
    Null fields for category and topBrand need to be corrected on data entry.  
    Null fields for categoryCode can be substituted using the categoryCode for other records with the same category; however, there will still be some categories with null values for the categoryCode. A new categoryCode will need to be made for these records.  
    Null fields in the brandCode can be substituted with a value using the name.  
    
    Note: brandCode and categoryCode issues should be corrected during the ETL process across all tables that contain these fields, as they are join keys and should be standardized across the tables.  
        

2. Duplicated records  

    There are no fully duplicated records in this data; however there are duplicates in the following fields: barcode, name, brandCode.  
    
    barcode: There are 7 barcodes that are duplicated with different brand information. This should be discussed with the stakeholder to determine whether it is a data issue or whether the shared barcode is correct.  

    name: There are 11 brand names that are duplicated with varying information in the other columns. Some have different topBrand indicators, brandCodes, category, or categoryCodes.  

    brandCode: There are 2 brandCodes that are duplicated (ignoring null and empty-string values). The main difference in the duplicated records is the topBrand indicator.

