# Data Transformation

## 1. Loading Data

In [1]:
import pandas as pd
import os

In [2]:
directory = 'data/'
file_pattern = 'hyvee_'
dataframes = []

# Loop through each file in the directory
for file in os.listdir(directory):
    if file.startswith(file_pattern) and file.endswith('.csv'):
        file_path = os.path.join(directory, file)
        df = pd.read_csv(file_path)
        dataframes.append(df)

In [3]:

# Combine all DataFrames into a single DataFrame
df = pd.concat(dataframes, ignore_index=True)

In [4]:
df.head()

Unnamed: 0,invoice_line_no,date,store,name,city,zipcode,county,category,category_name,vendor_no,vendor_name,itemno,im_desc,state_bottle_cost,state_bottle_retail,sale_bottles,sale_dollars
0,INV-16685400042,2019-01-02T00:00:00.000,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1031100.0,AMERICAN VODKAS,301.0,FIFTH GENERATION INC,38177,TITOS HANDMADE VODKA,12.67,19.01,36,684.36
1,INV-16685400057,2019-01-02T00:00:00.000,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1062200.0,WHITE RUM,434.0,LUXCO INC,46351,HAWKEYE LIGHT RUM,4.34,6.51,12,78.12
2,INV-16679100094,2019-01-02T00:00:00.000,2549,HY-VEE FOOD STORE / INDIANOLA,INDIANOLA,50125.0,WARREN,1081200.0,CREAM LIQUEURS,305.0,MHW LTD,73052,RUMCHATA,7.0,10.5,2,21.0
3,INV-16685400104,2019-01-02T00:00:00.000,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1022100.0,MIXTO TEQUILA,434.0,LUXCO INC,87937,JUAREZ TEQUILA SILVER,7.83,11.75,12,141.0
4,INV-16682800080,2019-01-02T00:00:00.000,2513,HY-VEE FOOD STORE #2 / IOWA CITY,IOWA CITY,52240.0,JOHNSON,1031100.0,AMERICAN VODKAS,297.0,LAIRD & COMPANY,35926,FIVE O'CLOCK VODKA PET,3.37,5.06,12,60.72


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3727001 entries, 0 to 3727000
Data columns (total 17 columns):
 #   Column               Dtype  
---  ------               -----  
 0   invoice_line_no      object 
 1   date                 object 
 2   store                int64  
 3   name                 object 
 4   city                 object 
 5   zipcode              float64
 6   county               object 
 7   category             float64
 8   category_name        object 
 9   vendor_no            float64
 10  vendor_name          object 
 11  itemno               int64  
 12  im_desc              object 
 13  state_bottle_cost    float64
 14  state_bottle_retail  float64
 15  sale_bottles         int64  
 16  sale_dollars         float64
dtypes: float64(6), int64(3), object(8)
memory usage: 483.4+ MB


# 2. Cleaning Data

### 2.1. Revoving Duplicate Rows

In [6]:
num_dup = df.duplicated().sum()
perc_dup = num_dup / df.shape[0] * 100 
print(f'Number of duplicate rows: {num_dup}; Percentage of duplicate rows: {perc_dup:.2f}%')

Number of duplicate rows: 5099; Percentage of duplicate rows: 0.14%


In [7]:
# Despite the potential for duplicates arising from reasons other than data entry errors, 
# I'm opting to remove them as they constitute a minimal proportion of the dataset (only 0.14%). 
# This decision is based on the assumption that the impact on data integrity will be negligible while simplifying further analysis.

df = df.drop_duplicates()
num_dup = df.duplicated().sum()
perc_dup = num_dup / df.shape[0] * 100 
print(f'Number of duplicate rows: {num_dup}; Percentage of duplicate rows: {perc_dup:.2f}%')

Number of duplicate rows: 0; Percentage of duplicate rows: 0.00%


### 2.2. Handelling Missing Values

In [8]:
def missing_values_table(df):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input is not a pandas DataFrame")

    num_na = df.isnull().sum()
    perc_na = (num_na / len(df)) * 100

    df_na = pd.DataFrame({
        'Missing Values': num_na,
        '% of Total Values': perc_na
    })

    df_na = df_na[df_na['Missing Values'] != 0]

    return df_na

missing_values_table(df)

Unnamed: 0,Missing Values,% of Total Values
city,2105,0.056557
zipcode,2105,0.056557
county,2105,0.056557
category,1317,0.035385
category_name,1317,0.035385
vendor_no,1,2.7e-05
vendor_name,1,2.7e-05


All of the missing data pertains to contextual information. 
Despite its relatively small proportion, these rows include crucial sales data, pivotal for our analysis. 
Therefore, we will retain these rows by substituting missing values with placeholders, ensuring that valuable sales insights are preserved.

In [9]:
placeholders = {
    'city'          : 'UNKNOWN',  
    'county'        : 'UNKNOWN',  
    'category_name' : 'UNKNOWN',
    'vendor_name'   : 'UNKNOWN',
    'zipcode'       : 99999,
    'category'      : 9999999,
    'vendor_no'     : 999
}

df = df.fillna(value=placeholders)
missing_values_table(df)

Unnamed: 0,Missing Values,% of Total Values


### 2.3. Converting Data Type

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3721902 entries, 0 to 3727000
Data columns (total 17 columns):
 #   Column               Dtype  
---  ------               -----  
 0   invoice_line_no      object 
 1   date                 object 
 2   store                int64  
 3   name                 object 
 4   city                 object 
 5   zipcode              float64
 6   county               object 
 7   category             float64
 8   category_name        object 
 9   vendor_no            float64
 10  vendor_name          object 
 11  itemno               int64  
 12  im_desc              object 
 13  state_bottle_cost    float64
 14  state_bottle_retail  float64
 15  sale_bottles         int64  
 16  sale_dollars         float64
dtypes: float64(6), int64(3), object(8)
memory usage: 511.1+ MB


#### 2.3.1. Datetime Columns

In [11]:
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,invoice_line_no,date,store,name,city,zipcode,county,category,category_name,vendor_no,vendor_name,itemno,im_desc,state_bottle_cost,state_bottle_retail,sale_bottles,sale_dollars
0,INV-16685400042,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1031100.0,AMERICAN VODKAS,301.0,FIFTH GENERATION INC,38177,TITOS HANDMADE VODKA,12.67,19.01,36,684.36
1,INV-16685400057,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1062200.0,WHITE RUM,434.0,LUXCO INC,46351,HAWKEYE LIGHT RUM,4.34,6.51,12,78.12
2,INV-16679100094,2019-01-02,2549,HY-VEE FOOD STORE / INDIANOLA,INDIANOLA,50125.0,WARREN,1081200.0,CREAM LIQUEURS,305.0,MHW LTD,73052,RUMCHATA,7.0,10.5,2,21.0
3,INV-16685400104,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1022100.0,MIXTO TEQUILA,434.0,LUXCO INC,87937,JUAREZ TEQUILA SILVER,7.83,11.75,12,141.0
4,INV-16682800080,2019-01-02,2513,HY-VEE FOOD STORE #2 / IOWA CITY,IOWA CITY,52240.0,JOHNSON,1031100.0,AMERICAN VODKAS,297.0,LAIRD & COMPANY,35926,FIVE O'CLOCK VODKA PET,3.37,5.06,12,60.72


#### 2.3.2. Integer Columns

##### 2.3.2.1. Invoice Line Number

A tentative step to expolore if Invoice Line Number column can be converted to an integer column.

In [12]:
# Check if all invoice line has the same prefix "INV-""
starts_with_INV = df['invoice_line_no'].str.startswith('INV-').all()

if starts_with_INV:
    print("All rows start with 'INV-'")
else:
    print("Not all rows start with 'INV-'")

Not all rows start with 'INV-'


In [13]:
# The answer is no. There is another prefix "RINV-"
df['invoice_line_no'].str[:4].unique()

array(['INV-', 'RINV'], dtype=object)

In [14]:
# Check if the numbers are unique
df['invoice_line_no'].str.extract('(\d+)').duplicated().sum()

0

In [15]:
# They are unique. Replacing prefixes to numbers.
# Replacement is conducted separately because one prefex is a subset of another
df['invoice_line_no'] = df['invoice_line_no'].replace({'RINV-': '2'}, regex=True)
df['invoice_line_no'] = df['invoice_line_no'].replace({'INV-': '1'}, regex=True)
df.head()

Unnamed: 0,invoice_line_no,date,store,name,city,zipcode,county,category,category_name,vendor_no,vendor_name,itemno,im_desc,state_bottle_cost,state_bottle_retail,sale_bottles,sale_dollars
0,116685400042,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1031100.0,AMERICAN VODKAS,301.0,FIFTH GENERATION INC,38177,TITOS HANDMADE VODKA,12.67,19.01,36,684.36
1,116685400057,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1062200.0,WHITE RUM,434.0,LUXCO INC,46351,HAWKEYE LIGHT RUM,4.34,6.51,12,78.12
2,116679100094,2019-01-02,2549,HY-VEE FOOD STORE / INDIANOLA,INDIANOLA,50125.0,WARREN,1081200.0,CREAM LIQUEURS,305.0,MHW LTD,73052,RUMCHATA,7.0,10.5,2,21.0
3,116685400104,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1022100.0,MIXTO TEQUILA,434.0,LUXCO INC,87937,JUAREZ TEQUILA SILVER,7.83,11.75,12,141.0
4,116682800080,2019-01-02,2513,HY-VEE FOOD STORE #2 / IOWA CITY,IOWA CITY,52240.0,JOHNSON,1031100.0,AMERICAN VODKAS,297.0,LAIRD & COMPANY,35926,FIVE O'CLOCK VODKA PET,3.37,5.06,12,60.72


In [16]:
df['invoice_line_no'].duplicated().sum()

0

In [17]:
df['invoice_line_no'] = df['invoice_line_no'].astype('int64')

##### 2.3.2.2. Other ID Columns

In [18]:
# Convert some float cols to int. 
# Define a function to check the existance of decimals first before implementation. 
def safe_to_int(df, columns):
    for col in columns:
        # Check if the column contains any non-integer values
        if df[col].apply(lambda x: x % 1).any():
            print(f"Column '{col}' contains decimal values. Conversion to int skipped.")
        else:
            # Convert to int
            df[col] = df[col].astype(int)
            print(f"Column '{col}' successfully converted to int.")

In [19]:
cols_to_int = ['zipcode', 'category','vendor_no' ]
safe_to_int(df, cols_to_int)

Column 'zipcode' successfully converted to int.
Column 'category' successfully converted to int.
Column 'vendor_no' successfully converted to int.


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3721902 entries, 0 to 3727000
Data columns (total 17 columns):
 #   Column               Dtype         
---  ------               -----         
 0   invoice_line_no      int64         
 1   date                 datetime64[ns]
 2   store                int64         
 3   name                 object        
 4   city                 object        
 5   zipcode              int32         
 6   county               object        
 7   category             int32         
 8   category_name        object        
 9   vendor_no            int32         
 10  vendor_name          object        
 11  itemno               int64         
 12  im_desc              object        
 13  state_bottle_cost    float64       
 14  state_bottle_retail  float64       
 15  sale_bottles         int64         
 16  sale_dollars         float64       
dtypes: datetime64[ns](1), float64(3), int32(3), int64(4), object(6)
memory usage: 468.5+ MB


### 2.4. Validating Data Accuracy

In [21]:
df.describe()

Unnamed: 0,invoice_line_no,date,store,zipcode,category,vendor_no,itemno,state_bottle_cost,state_bottle_retail,sale_bottles,sale_dollars
count,3721902.0,3721902,3721902.0,3721902.0,3721902.0,3721902.0,3721902.0,3721902.0,3721902.0,3721902.0,3721902.0
mean,138280600000.0,2021-06-01 10:46:23.478770944,2639.607,51303.14,1058558.0,288.5785,54000.78,11.94144,17.91327,11.7128,180.5445
min,116678000000.0,2019-01-02 00:00:00,2500.0,50009.0,1011000.0,35.0,159.0,0.33,0.5,-648.0,-9720.0
25%,126105300000.0,2020-03-25 00:00:00,2548.0,50315.0,1012200.0,205.0,27130.0,6.5,9.75,3.0,47.1
50%,136723100000.0,2021-05-18 00:00:00,2596.0,51104.0,1031200.0,260.0,41360.0,9.5,14.25,6.0,94.08
75%,150415300000.0,2022-08-17 00:00:00,2637.0,52402.0,1071100.0,420.0,68011.0,14.75,22.13,12.0,171.0
max,204991500000.0,2023-11-30 00:00:00,10285.0,99999.0,9999999.0,999.0,999940.0,2298.84,3448.26,13200.0,260832.0
std,13938060000.0,,450.219,1543.39,195507.1,151.8065,85861.22,11.61138,17.41529,41.27214,743.5074


In [22]:
# Check the percentage of rows where 'sale_bottles' is negative
len(df[df['sale_bottles']<0])/len(df) * 100

0.029017421737595455

**Note**: While negative sales volumes typically indicate refunds or returns, our analysis will omit these records to maintain data integrity. 

This decision is based on the focus of our study, which requires consistent and positive sales figures to accurately assess trends and patterns.


In [23]:
df=df[df['sale_bottles']>=0]
df.describe()

Unnamed: 0,invoice_line_no,date,store,zipcode,category,vendor_no,itemno,state_bottle_cost,state_bottle_retail,sale_bottles,sale_dollars
count,3720822.0,3720822,3720822.0,3720822.0,3720822.0,3720822.0,3720822.0,3720822.0,3720822.0,3720822.0,3720822.0
mean,138261300000.0,2021-06-01 05:54:06.523268096,2639.583,51303.22,1058553.0,288.5672,53991.46,11.94002,17.91114,11.71962,180.6559
min,116678000000.0,2019-01-02 00:00:00,2500.0,50009.0,1011000.0,35.0,159.0,0.33,0.5,1.0,1.3
25%,126103100000.0,2020-03-24 00:00:00,2548.0,50315.0,1012200.0,205.0,27130.0,6.5,9.75,3.0,47.22
50%,136711500000.0,2021-05-18 00:00:00,2596.0,51104.0,1031200.0,260.0,41360.0,9.5,14.25,6.0,94.08
75%,150408200000.0,2022-08-17 00:00:00,2637.0,52402.0,1071100.0,420.0,68011.0,14.75,22.13,12.0,171.0
max,204944400000.0,2023-11-30 00:00:00,10285.0,99999.0,9999999.0,999.0,999940.0,2298.84,3448.26,13200.0,260832.0
std,13894160000.0,,450.104,1543.522,195518.3,151.799,85816.66,11.55269,17.32725,41.2732,743.5482


In [24]:
# Check the accuracy of "sale_dollars"
df['revenue'] = df['state_bottle_retail'] * df['sale_bottles']
len(df[df['sale_dollars']!=df['revenue']])/len(df) * 100

13.12325609771174

Observation: Approximately 13% of entries in 'sale_bottles' exhibit discrepancies when compared to our computed 'revenue' column. The underlying cause of these discrepancies remains unidentified. 

However, for consistency in our analysis, we will continue using a standardized algorithm to compute revenue across all records. 

In [25]:
df.drop(columns='sale_dollars', axis=1, inplace=True)
df.head()

Unnamed: 0,invoice_line_no,date,store,name,city,zipcode,county,category,category_name,vendor_no,vendor_name,itemno,im_desc,state_bottle_cost,state_bottle_retail,sale_bottles,revenue
0,116685400042,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001,DUBUQUE,1031100,AMERICAN VODKAS,301,FIFTH GENERATION INC,38177,TITOS HANDMADE VODKA,12.67,19.01,36,684.36
1,116685400057,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001,DUBUQUE,1062200,WHITE RUM,434,LUXCO INC,46351,HAWKEYE LIGHT RUM,4.34,6.51,12,78.12
2,116679100094,2019-01-02,2549,HY-VEE FOOD STORE / INDIANOLA,INDIANOLA,50125,WARREN,1081200,CREAM LIQUEURS,305,MHW LTD,73052,RUMCHATA,7.0,10.5,2,21.0
3,116685400104,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001,DUBUQUE,1022100,MIXTO TEQUILA,434,LUXCO INC,87937,JUAREZ TEQUILA SILVER,7.83,11.75,12,141.0
4,116682800080,2019-01-02,2513,HY-VEE FOOD STORE #2 / IOWA CITY,IOWA CITY,52240,JOHNSON,1031100,AMERICAN VODKAS,297,LAIRD & COMPANY,35926,FIVE O'CLOCK VODKA PET,3.37,5.06,12,60.72


## 3. Feature Engineering

Creating a new column to indicate store categories based on the 'name' column.

In [26]:
df['store_format'] = df['name']
df[df['store_format'].str.contains('/', na=False)]['store_format'].unique()

array(['HY-VEE FOOD STORE / DUBUQUE', 'HY-VEE FOOD STORE / INDIANOLA',
       'HY-VEE FOOD STORE #2 / IOWA CITY', 'HY-VEE #3 / DUBUQUE',
       'HY-VEE FOOD STORE #5 / DES MOINES', 'HY-VEE #7 / CEDAR RAPIDS',
       'HY-VEE FOOD AND DRUG #6 / CEDAR RAPIDS', 'HY-VEE #2 / DUBUQUE',
       'HY-VEE / DRUGTOWN #1 / CEDAR RAPIDS',
       'HY-VEE FOOD STORE / IOWA CITY', 'HY-VEE FOOD STORE #2 / WATERLOO',
       'HY-VEE DRUGSTORE / IOWA CITY', 'HY-VEE / WAUKEE',
       'HY-VEE WINE AND SPIRITS / STORM LAKE',
       'HY-VEE FOOD STORE #2 / DES MOINES',
       'HY-VEE WINE AND SPIRITS / BOONE',
       'HY-VEE FOOD STORE #2 / STATE ANKENY',
       'HY-VEE FOOD STORE / CEDAR FALLS',
       'HY-VEE WINE AND SPIRITS / LEMARS',
       'HY-VEE FOOD STORE / URBANDALE',
       'HY-VEE FOOD STORE #3 / CEDAR RAPIDS',
       'HY-VEE FOOD STORE / JOHNSTON', 'HY-VEE FOOD STORE / SHELDON',
       'HY-VEE DRUGSTORE #4 / CEDAR RAPIDS',
       'HY-VEE FOOD STORE / SIOUX CENTER',
       'HY-VEE DRUGSTORE #5 / CE

The substring following the '/' in each entry of the 'store_format' column appears to denote location information. 

This detail is not relevant for identifying the store format itself. 

Consequently, we will focus on the data preceding the '/' for a more accurate characterization of store formats.

In [27]:
# Split each string in 'store_format' at the first '/' and keep only the first part
df['store_format'] = df['store_format'].str.split('/', n=1).str[0]
df['store_format'].unique()

array(['HY-VEE FOOD STORE ', 'HY-VEE FOOD STORE #2 ', 'HY-VEE #3 ',
       'HY-VEE FOOD STORE #5 ', 'HY-VEE #7 ', 'HY-VEE FOOD AND DRUG #6 ',
       'HY-VEE #2 ', 'HY-VEE ', 'HY-VEE DRUGSTORE ',
       'HY-VEE WINE AND SPIRITS ', 'HY-VEE FOOD STORE #3 ',
       'HY-VEE DRUGSTORE #4 ', 'HY-VEE DRUGSTORE #5 ',
       'HY-VEE FOOD STORE #1 ', 'HY-VEE WINE & SPIRITS #2 ', 'HY-VEE #5 ',
       'HY-VEE #4 ', 'HY-VEE FOOD STORE #4 ', 'HY-VEE DRUGSTORE #6 ',
       'HY-VEE FORT DODGE WINE AND SPIRITS', 'HY-VEE  #2 ',
       'HY-VEE FOOD STORE #1636 ', 'HY-VEE MAINSTREET ',
       'HY-VEE FOOD AND DRUG ', 'HY-VEE #3 FOOD & DRUGSTORE ',
       'HY-VEE DRUGSTORE #2 ', 'HY-VEE WINE & SPIRITS #1 ',
       'HY-VEE STORE ', 'HY-VEE #2 FOOD STORE ', 'HY-VEE  ',
       'HY-VEE WINE & SPIRITS ', 'HY-VEE #1044 ', 'HY-VEE # 6',
       'HY-VEE FAST & FRESH EXPRESS ', 'HY-VEE FAST & FRESH ',
       'HY-VEE FAST & FRESH', 'HY-VEE FULFILLMENT CENTER', 'HY-VEE GAS ',
       'HY-VEE C-STORE ', 'HY-VEE OTTUMWA#2

In [28]:
# Removing all numbers and symbols
df['store_format'] = df['store_format'].str.replace('[0-9#()]+', '', regex=True)
df['store_format'].unique()

array(['HY-VEE FOOD STORE ', 'HY-VEE FOOD STORE  ', 'HY-VEE  ',
       'HY-VEE FOOD AND DRUG  ', 'HY-VEE ', 'HY-VEE DRUGSTORE ',
       'HY-VEE WINE AND SPIRITS ', 'HY-VEE DRUGSTORE  ',
       'HY-VEE WINE & SPIRITS  ', 'HY-VEE FORT DODGE WINE AND SPIRITS',
       'HY-VEE   ', 'HY-VEE MAINSTREET ', 'HY-VEE FOOD AND DRUG ',
       'HY-VEE  FOOD & DRUGSTORE ', 'HY-VEE STORE ',
       'HY-VEE  FOOD STORE ', 'HY-VEE WINE & SPIRITS ',
       'HY-VEE FAST & FRESH EXPRESS ', 'HY-VEE FAST & FRESH ',
       'HY-VEE FAST & FRESH', 'HY-VEE FULFILLMENT CENTER', 'HY-VEE GAS ',
       'HY-VEE C-STORE ', 'HY-VEE OTTUMWA', 'HY-VEE -GARNER',
       'HY-VEE - FOREST CITY', 'HY-VEE FAST & FRESH EXPRESS- BETTENDORF',
       'HY-VEE FAST & FRESH - KNOXVILLE', 'HY-VEE C-STRORE - DOUGLAS',
       'HY-VEE FAST & FRESH - DES MOINES',
       'HY-VEE FAST & FRESH EXPRESS- CRESTON',
       'HY-VEE C-STORE - EAST HICKMAN', 'HY-VEE GAS  ',
       'HY-VEE FAST & FRESH EXPRESS -OSCELOA',
       'HY-VEE DOLLAR FRESH -

In [29]:
# Remove leading and trailing spaces
df['store_format'] = df['store_format'].str.strip()

# Replace consecutive spaces with a single space
df['store_format'] = df['store_format'].str.replace('\s+', ' ', regex=True)

df['store_format'].unique()

array(['HY-VEE FOOD STORE', 'HY-VEE', 'HY-VEE FOOD AND DRUG',
       'HY-VEE DRUGSTORE', 'HY-VEE WINE AND SPIRITS',
       'HY-VEE WINE & SPIRITS', 'HY-VEE FORT DODGE WINE AND SPIRITS',
       'HY-VEE MAINSTREET', 'HY-VEE FOOD & DRUGSTORE', 'HY-VEE STORE',
       'HY-VEE FAST & FRESH EXPRESS', 'HY-VEE FAST & FRESH',
       'HY-VEE FULFILLMENT CENTER', 'HY-VEE GAS', 'HY-VEE C-STORE',
       'HY-VEE OTTUMWA', 'HY-VEE -GARNER', 'HY-VEE - FOREST CITY',
       'HY-VEE FAST & FRESH EXPRESS- BETTENDORF',
       'HY-VEE FAST & FRESH - KNOXVILLE', 'HY-VEE C-STRORE - DOUGLAS',
       'HY-VEE FAST & FRESH - DES MOINES',
       'HY-VEE FAST & FRESH EXPRESS- CRESTON',
       'HY-VEE C-STORE - EAST HICKMAN',
       'HY-VEE FAST & FRESH EXPRESS -OSCELOA',
       'HY-VEE DOLLAR FRESH - TOLEDO', 'HY-VEE GAS - WDM',
       'HY-VEE GAS - PLEASANT HILL', 'HY-VEE DOLLAR FRESH - EMMETSBURG',
       'HY-VEE FAST AND FRESH', 'HY-VEE C-STORE - ANKENY',
       'HY-VEE FAST AND FRESH - URBANDALE',
       'HY-VEE

In [30]:
# Standardizing the names
replacements = {
    'FOOD STORE': 'HY-VEE',
    'FOOD AND DRUG': 'HY-VEE FOOD & DRUG',
    'DRUGSTORE': 'HY-VEE FOOD & DRUG',
    'FOOD & DRUGSTORE': 'HY-VEE FOOD & DRUG',
    'WINE AND SPIRITS': 'HY-VEE WINE & SPIRITS',
    'WINE & SPIRITS': 'HY-VEE WINE & SPIRITS',
    'FAST & FRESH EXPRESS': 'HY-VEE FAST & FRESH',
    'FAST & FRESH': 'HY-VEE FAST & FRESH',
    'FAST AND FRESH': 'HY-VEE FAST & FRESH',
    'GAS': 'HY-VEE GAS',
    'GASE': 'HY-VEE GAS',
    'C-STORE': 'HY-VEE C-STORE',
    'C-STRORE': 'HY-VEE C-STORE',
    'C STORE': 'HY-VEE C-STORE',
    'DOLLAR FRESH': 'HY-VEE DOLLAR FRESH'
}

for key, value in replacements.items():
    df.loc[df['store_format'].str.contains(key, na=False, case=False), 'store_format'] = value

df['store_format'].unique()

array(['HY-VEE FOOD & DRUG', 'HY-VEE', 'HY-VEE WINE & SPIRITS',
       'HY-VEE MAINSTREET', 'HY-VEE STORE', 'HY-VEE FAST & FRESH',
       'HY-VEE FULFILLMENT CENTER', 'HY-VEE GAS', 'HY-VEE C-STORE',
       'HY-VEE OTTUMWA', 'HY-VEE -GARNER', 'HY-VEE - FOREST CITY',
       'HY-VEE DOLLAR FRESH', 'HY-VEE WDM HEALTH MARKET'], dtype=object)

Upcoming Step: To clarify the store formats associated with ambiguous store names, I will utilize online resources. 

This approach involves cross-referencing the addresses of these stores with external data sources to accurately determine their specific format types. 

In [31]:
df[df['store_format']=='HY-VEE STORE']['store'].unique()

array([2612], dtype=int64)

In [32]:
df[df['store_format']=='HY-VEE FULFILLMENT CENTER']['store'].unique()

array([2680], dtype=int64)

In [33]:
df[df['store_format']=='HY-VEE OTTUMWA']['store'].unique()

array([2596], dtype=int64)

In [34]:
df[df['store_format']=='HY-VEE C-STORE']['store'].unique()

array([2681, 2689, 2688, 2696, 2699, 2721, 6200, 6193, 6216], dtype=int64)

In [35]:
df[df['store_format']=='HY-VEE -GARNER']['store'].unique()

array([2685], dtype=int64)

In [36]:
df[df['store_format']=='HY-VEE - FOREST CITY']['store'].unique()

array([2684], dtype=int64)

In [37]:
df[df['store_format']=='HY-VEE WDM HEALTH MARKET']['store'].unique()

array([2682], dtype=int64)

In [38]:
df[df['store_format']=='HY-VEE MAINSTREET']['store'].unique()

array([2667], dtype=int64)

In [39]:
# Following are the results

store_format_mapping = {
    2612: 'HY-VEE',
    2681: 'HY-VEE',
    2680: 'OTHER', # Training and Education Center
    2596: 'HY-VEE GAS',
    2685: 'HY-VEE FOOD & DRUG',
    2684: 'HY-VEE',
    2682: 'HY-VEE WINE & SPIRITS',
    2667: 'HY-VEE FOOD & DRUG',
    2689: 'HY-VEE FAST & FRESH',
    2688: 'OTHER', # CAR WASH
    2696: 'HY-VEE FAST & FRESH',
    2699: 'HY-VEE FAST & FRESH',
    2721: 'HY-VEE FAST & FRESH',
    6200: 'HY-VEE FAST & FRESH',
    6193: 'HY-VEE FAST & FRESH',
    6216: 'HY-VEE GAS'
}

for store_id, format_name in store_format_mapping.items():
    df.loc[df['store'] == store_id, 'store_format'] = format_name

df['store_format'].unique()

array(['HY-VEE FOOD & DRUG', 'HY-VEE', 'HY-VEE WINE & SPIRITS',
       'HY-VEE GAS', 'HY-VEE FAST & FRESH', 'OTHER',
       'HY-VEE DOLLAR FRESH'], dtype=object)

In [40]:
len(df[df['store_format'].isna()])/len(df)

0.0

In [None]:
format_replacements = {
    'HY-VEE': 'Grocery Store',
    'HY-VEE GAS': 'Gas Station',
    'HY-VEE FOOD & DRUG': 'Pharmacy',
    'HY-VEE WINE & SPIRITS': 'Wine & Spirits',
    'HY-VEE FAST & FRESH': 'Fast & Fresh',
    'HY-VEE DOLLAR FRESH': 'Dollar Fresh Market',
    'OTHER': 'Other'
}

df['store_format'] = df['store_format'].replace(format_replacements)

In [None]:
df.to_csv('data/clean_hyvee.csv', index=False)