# Feature Engineering

In [1]:
import pandas as pd
import json
from datetime import datetime, timedelta
import pymysql

In [2]:
# MySQL connection settings.
# The path uses a forward slash: the previous 'config\mysql_config.json'
# literal depended on the invalid '\m' escape sequence (a SyntaxWarning on
# recent Python versions) and only resolved on Windows.
with open('config/mysql_config.json') as f:
    mysql_config = json.load(f)

# Credentials reused by the pymysql.connect() calls in the cells below.
host = mysql_config['hostname']
user = mysql_config['username']
password = mysql_config['password']

## 1. Store Format

Creating a new column to indicate store formats based on the 'store_name' column.

In [3]:
# Loading data from MySQL
sql_query = "SELECT store_id, store_name FROM stores"

# pd.read_sql drives the connection itself, so no explicit cursor is needed.
# NOTE(review): pandas emits a UserWarning for raw DBAPI connections other
# than sqlite3; the query still runs.
conn_int = pymysql.connect(host=host, user=user, password=password, db='INT_HYVEE')
try:
    df = pd.read_sql(sql_query, conn_int)
finally:
    # Always release the connection, even when the query fails.
    conn_int.close()

  df = pd.read_sql(sql_query, conn_int)


In [61]:
# Display the store_id / store_name frame loaded from the `stores` table.
df

Unnamed: 0,store_id,store_name


In [62]:
# Seed the working column with the raw store name; the following cells clean it in place.
df['store_format'] = df['store_name']

# List the distinct names that contain a '/' separator (missing names count as no match).
df.loc[df['store_format'].str.contains('/', na=False), 'store_format'].unique()

array([], dtype=object)

The substring following the '/' in each entry of the 'store_format' column appears to denote location information. 

This detail is not relevant for identifying the store format itself. 

Consequently, we will focus on the data preceding the '/' for a more accurate characterization of store formats.

In [63]:
# Replace the hyphens inside 'HY-VEE' and 'C-STORE' with spaces so those tokens
# survive the split, then keep only the text before the first '/' or '-'.
df['store_format'] = (
    df['store_format']
    .str.replace('HY-VEE', 'HY VEE', case=False)
    .str.replace('C-STORE', 'C STORE', case=False)
    .str.split('/|-', n=1)
    .str[0]
    .str.strip()
)
df['store_format'].unique()

array([], dtype=object)

In [64]:
# Drop every run of digits, '#', '(' and ')' (store numbers and ids embedded in the name).
df['store_format'] = df['store_format'].str.replace(
    pat='[0-9#()]+',
    repl='',
    regex=True,
)
df['store_format'].unique()

array([], dtype=object)

In [65]:
# Remove leading and trailing spaces, then replace consecutive whitespace with
# a single space.
# The pattern is a raw string: '\s' inside a plain literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
df['store_format'] = (
    df['store_format']
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

df['store_format'].unique()

array([], dtype=object)

In [66]:
# Substring -> standardized store-format label.
# Keys are matched case-insensitively against the cleaned store name in the
# cells below; entry order is kept as-is because it is written to the JSON file.
replacements = {
    'FOOD STORE': 'Grocery Store',
    'FOOD AND DRUG': 'Pharmacy',
    'DRUGSTORE': 'Pharmacy',
    'FOOD & DRUGSTORE': 'Pharmacy',
    'WINE AND SPIRITS': 'Liquor Store',
    'WINE & SPIRITS': 'Liquor Store',
    'FAST & FRESH EXPRESS': 'Convenience Store',
    'FAST & FRESH': 'Convenience Store',
    'FAST AND FRESH': 'Convenience Store',
    'C STORE': 'Convenience Store',
}

# Persist the mapping so the streamlined pipeline cell can reload it from disk.
with open('dicts/store_format_map.json', 'w') as map_file:
    json.dump(replacements, map_file)

In [67]:
# For each known substring, overwrite every matching row with its standard label.
# Matching is case-insensitive and rows with missing values never match.
for pattern, label in replacements.items():
    matches = df['store_format'].str.contains(pattern, na=False, case=False)
    df.loc[matches, 'store_format'] = label

df['store_format'].unique()

array([], dtype=object)

In [68]:
def _finalise_store_format(label):
    """Map a bare 'HY VEE' to 'Grocery Store' and any non-standard label to 'Other'."""
    if label == 'HY VEE':
        return 'Grocery Store'
    if label in replacements.values():
        return label
    return 'Other'


# After this step every row holds one of the standard labels or 'Other'.
df['store_format'] = df['store_format'].apply(_finalise_store_format)
df['store_format'].unique()

array([], dtype=object)

In [69]:
# Preview the first rows, now including the derived store_format column.
df.head()

Unnamed: 0,store_id,store_name,store_format


In [4]:
# Streamline the process for building pipeline
with open('dicts/store_format_map.json', 'r') as f:
    store_format_map = json.load(f)

# Convert keys to lowercase for case-insensitive matching
replacements_lower = {k.lower(): v for k, v in store_format_map.items()}

# Lower-cased standard labels, computed once instead of once per row.
known_formats_lower = {v.lower() for v in store_format_map.values()}


def apply_replacements(x):
    """Return the standardized store format for one cleaned store name.

    Parameters
    ----------
    x : str or missing value
        Store name after brand/location/number clean-up.

    Returns
    -------
    str
        The label of the first mapping key contained in ``x``
        (case-insensitive); 'Grocery Store' for a bare 'HY VEE'; ``x`` itself
        when it already equals a standard label; otherwise 'Other'.
        Non-string input (e.g. NaN from a NULL store_name) yields 'Other'
        instead of raising AttributeError on ``.lower()``.
    """
    if not isinstance(x, str):
        return 'Other'

    x_lower = x.lower()
    for key, value in replacements_lower.items():
        if key in x_lower:
            return value
    if x == 'HY VEE':
        return 'Grocery Store'
    return x if x_lower in known_formats_lower else 'Other'


# Clean the raw name: replace the hyphens in 'HY-VEE' and 'C-STORE' with spaces,
# keep the text before the first '/' or '-', drop digits and '#()' symbols, then
# tidy whitespace. Regex patterns are raw strings to avoid invalid-escape warnings.
df['store_format'] = (
    df['store_name']
    .str.replace(r'HY-VEE', 'HY VEE', case=False, regex=True)
    .str.replace(r'C-STORE', 'C STORE', case=False, regex=True)
    .str.split(r'/|-', n=1).str[0]
    .str.replace(r'[0-9#()]+', '', regex=True)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

df['store_format'] = df['store_format'].apply(apply_replacements)

In [5]:
# Display the stores frame with the final store_format column.
df

Unnamed: 0,store_id,store_name,store_format
0,2500,HY-VEE FOOD STORE #1 (1013) / AMES,Grocery Store
1,2501,HY-VEE #2 (1018) / AMES,Grocery Store
2,2502,HY-VEE WINE AND SPIRITS (1022) / ANKENY,Liquor Store
3,2505,HY-VEE WINE AND SPIRITS (1038) / BOONE,Liquor Store
4,2506,HY-VEE #2 (1044) / BURLINGTON,Grocery Store
...,...,...,...
190,10025,HY-VEE FAST AND FRESH #4 (7583) / URBANDALE,Convenience Store
191,10034,HY-VEE FAST AND FRESH #5 (7584) / URBANDALE,Convenience Store
192,10167,HY-VEE / ELDRIDGE,Grocery Store
193,10189,HY-VEE / KNOXVILLE,Grocery Store


In [9]:
# Load to db
# Start from None so the cleanup below never touches a stale or undefined
# connection when pymysql.connect() itself fails.
conn_int = None
try:
    conn_int = pymysql.connect(host=host, user=user, password=password, db='INT_HYVEE')

    with conn_int.cursor() as cursor_int:
        # Check if the column exists
        cursor_int.execute("""
            SELECT column_name 
            FROM information_schema.columns 
            WHERE table_schema = 'INT_HYVEE' 
            AND table_name = 'stores' 
            AND column_name = 'store_format';
        """)
        if cursor_int.fetchone():
            # If the column exists, drop it
            cursor_int.execute("ALTER TABLE stores DROP COLUMN store_format;")

        cursor_int.execute("ALTER TABLE stores ADD COLUMN store_format VARCHAR(255);")

        sql_update = "UPDATE stores SET store_format = %s WHERE store_id = %s;"

        # One (store_format, store_id) tuple per row, sent in a single call
        # instead of a Python-level iterrows() loop.
        update_params = list(
            df[['store_format', 'store_id']].itertuples(index=False, name=None)
        )
        cursor_int.executemany(sql_update, update_params)

    conn_int.commit()
except pymysql.Error as e:
    # Undo any uncommitted UPDATEs.
    # NOTE(review): MySQL commits ALTER TABLE implicitly, so the schema change
    # is presumably not rolled back - confirm against the server in use.
    if conn_int is not None and conn_int.open:
        conn_int.rollback()
    print(f"Database error occurred: {e}")
finally:
    if conn_int is not None and conn_int.open:
        conn_int.close()


## 2. Liquor Type

Creating a new column to indicate liquor types based on the 'category' and 'category_code' columns.

In [10]:
# Load the category code / category name pairs from the items table.
sql_query = "SELECT category_code, category FROM items"

# pd.read_sql drives the connection itself, so no explicit cursor is needed.
conn_int = pymysql.connect(host=host, user=user, password=password, db='INT_HYVEE')
try:
    df = pd.read_sql(sql_query, conn_int)
finally:
    # Always release the connection, even when the query fails.
    conn_int.close()

  df = pd.read_sql(sql_query, conn_int)


In [73]:
# Distinct (category_code, category) pairs, ordered by ascending code.
(
    df
    .drop_duplicates()
    .sort_values('category_code')
)

Unnamed: 0,category_code,category


In [74]:
# Category-code prefix -> liquor type.
# Keys are three-character strings; the streamlined cell looks them up using
# the first three characters of each category code.
liquor_type_map = {
    '101': 'Whiskey',
    '102': 'Tequila',
    '103': 'Vodka',
    '104': 'Gin',
    '105': 'Brandy',
    '106': 'Rum',
    '107': 'Cocktails',
    '108': 'Liqueur',
    '109': 'Neutral Grain/Distilled Spirits',
}

# Persist the mapping so the pipeline cell can reload it from disk.
with open('dicts/liquor_type_map.json', 'w') as map_file:
    json.dump(liquor_type_map, map_file)

In [22]:
# Exploratory step: seed liquor_type with the raw category name, then display the frame.
# NOTE(review): the recorded SettingWithCopyWarning suggests `df` was already a
# drop_duplicates() result when this ran (out-of-order execution) - confirm.
df['liquor_type'] = df['category']
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['liquor_type'] = df['category']


Unnamed: 0,category_code,category,liquor_type
0,1081200,CREAM LIQUEURS,CREAM LIQUEURS
1,1701100,TEMPORARY & SPECIALTY PACKAGES,TEMPORARY & SPECIALTY PACKAGES
2,1012300,SINGLE MALT SCOTCH,SINGLE MALT SCOTCH
5,1012200,SCOTCH WHISKIES,SCOTCH WHISKIES
9,1062400,SPICED RUM,SPICED RUM
11,1071000,COCKTAILS/RTD,COCKTAILS/RTD
25,1052100,IMPORTED BRANDIES,IMPORTED BRANDIES
96,1011100,BLENDED WHISKIES,BLENDED WHISKIES
146,1012100,CANADIAN WHISKIES,CANADIAN WHISKIES
255,1011600,STRAIGHT RYE WHISKIES,STRAIGHT RYE WHISKIES


In [24]:
# Streamlining the process
with open('dicts/liquor_type_map.json', 'r') as f:
    liquor_type_map = json.load(f)


def match_liquor_type(number):
    """Return the liquor type for a category code.

    The first three characters of ``str(number)`` are looked up in
    ``liquor_type_map``; codes with an unknown prefix map to 'Other'.
    """
    number_str = str(number)[:3]
    return liquor_type_map.get(number_str, 'Other')


# The earlier `df['liquor_type'] = df['category']` assignment was dead code: it
# was overwritten below before ever being read, so it has been dropped.
# .copy() makes the de-duplicated frame independent of the original, so adding
# a column does not raise SettingWithCopyWarning.
df = df.drop_duplicates().copy()
df['liquor_type'] = df['category_code'].apply(match_liquor_type)

In [25]:
# Display the de-duplicated categories with their derived liquor_type.
df

Unnamed: 0,category_code,category,liquor_type
0,1081200,CREAM LIQUEURS,Liqueur
1,1701100,TEMPORARY & SPECIALTY PACKAGES,Other
2,1012300,SINGLE MALT SCOTCH,Whiskey
5,1012200,SCOTCH WHISKIES,Whiskey
9,1062400,SPICED RUM,Rum
11,1071000,COCKTAILS/RTD,Cocktails
25,1052100,IMPORTED BRANDIES,Brandy
96,1011100,BLENDED WHISKIES,Whiskey
146,1012100,CANADIAN WHISKIES,Whiskey
255,1011600,STRAIGHT RYE WHISKIES,Whiskey


In [26]:
# Load to db
# Start from None so the cleanup below never touches a stale or undefined
# connection when pymysql.connect() itself fails.
conn_int = None
try:
    conn_int = pymysql.connect(host=host, user=user, password=password, db='INT_HYVEE')
    with conn_int.cursor() as cursor_int:
        # Check if the 'liquor_type' column exists
        cursor_int.execute("""
            SELECT column_name 
            FROM information_schema.columns 
            WHERE table_schema = 'INT_HYVEE' 
            AND table_name = 'items' 
            AND column_name = 'liquor_type';
        """)
        # If the column exists, drop it
        if cursor_int.fetchone():
            cursor_int.execute("ALTER TABLE items DROP COLUMN liquor_type;")

        # Add the column
        cursor_int.execute("ALTER TABLE items ADD COLUMN liquor_type VARCHAR(255);")

        # Update the col
        sql_update = "UPDATE items SET liquor_type = %s WHERE category_code = %s;"

        # liquor_type depends only on category_code, so de-duplicating the
        # pairs avoids issuing the same UPDATE once per category name.
        update_params = list(
            df[['liquor_type', 'category_code']]
            .drop_duplicates()
            .itertuples(index=False, name=None)
        )
        cursor_int.executemany(sql_update, update_params)

    conn_int.commit()
except pymysql.Error as e:
    # Undo any uncommitted UPDATEs.
    # NOTE(review): MySQL commits ALTER TABLE implicitly, so the schema change
    # is presumably not rolled back - confirm against the server in use.
    if conn_int is not None and conn_int.open:
        conn_int.rollback()
    print(f"Database error occurred: {e}")
finally:
    if conn_int is not None and conn_int.open:
        conn_int.close()


## 3. Numerical Metrics

In [29]:
import pymysql
import json
import sqlexe

# Forward slash keeps the path portable and avoids the invalid '\m' escape
# sequence of the previous 'config\mysql_config.json' literal.
with open('config/mysql_config.json') as f:
    mysql_config = json.load(f)

# Connection settings handed to the project-local sqlexe helper.
connection_params = {
    'host': mysql_config['hostname'],
    'user': mysql_config['username'],
    'password': mysql_config['password'],
}

# NOTE(review): presumably executes the statements in metrics.sql against the
# configured server - confirm against sqlexe.execute_sql_file.
sqlexe.execute_sql_file('metrics.sql', connection_params)