# Feature Engineering

In [58]:
import pandas as pd
import json
from datetime import datetime, timedelta
import pymysql

In [79]:
# MySQL connection settings.
# The JSON config must provide the 'hostname', 'username' and 'password' keys.
# The path uses a forward slash: it resolves on Windows, macOS and Linux alike,
# and avoids the invalid '\m' string escape that the backslash form contains.
with open('config/mysql_config.json') as f:
    mysql_config = json.load(f)

host = mysql_config['hostname']
user = mysql_config['username']
password = mysql_config['password']

## 1. Store Format

Creating a new column to indicate store formats based on the 'store_name' column.

In [80]:
# Loading data from MySQL
conn_int = pymysql.connect(host=host, user=user, password=password, db='INT_HYVEE')
sql_query = "SELECT store_id, store_name FROM stores"

try:
    # pandas drives the connection itself, so no explicit cursor is needed
    df = pd.read_sql(sql_query, conn_int)
finally:
    # Release the connection even when the query fails
    conn_int.close()

  df = pd.read_sql(sql_query, conn_int)


In [61]:
# Preview the first rows instead of rendering the whole table
df.head()

Unnamed: 0,store_id,store_name


In [62]:
# Start from the raw store name, then list the distinct names that contain a '/'
df['store_format'] = df['store_name']
has_slash = df['store_format'].str.contains('/', na=False)
df.loc[has_slash, 'store_format'].unique()

array([], dtype=object)

The substring following the '/' in each entry of the 'store_format' column appears to denote location information. 

This detail is not relevant for identifying the store format itself. 

Consequently, we will focus on the data preceding the '/' for a more accurate characterization of store formats.

In [63]:
# Turn the hyphens of 'HY-VEE' and 'C-STORE' into spaces so the split below
# leaves them intact, then keep only the text before the first '/' or '-'
df['store_format'] = (
    df['store_format']
    .str.replace('HY-VEE', 'HY VEE', case=False)
    .str.replace('C-STORE', 'C STORE', case=False)
    .str.split('/|-', n=1)
    .str[0]
    .str.strip()
)
df['store_format'].unique()

array([], dtype=object)

In [64]:
# Drop every run of digits and of the symbols '#', '(' and ')'
digits_and_symbols = '[0-9#()]+'
df['store_format'] = df['store_format'].str.replace(digits_and_symbols, '', regex=True)
df['store_format'].unique()

array([], dtype=object)

In [65]:
# Remove leading and trailing spaces
df['store_format'] = df['store_format'].str.strip()

# Collapse each run of whitespace into a single space.
# The pattern is a raw string so that '\s' reaches the regex engine untouched
# instead of being parsed as an (invalid) Python string escape.
df['store_format'] = df['store_format'].str.replace(r'\s+', ' ', regex=True)

df['store_format'].unique()

array([], dtype=object)

In [66]:
# Keyword found in a cleaned store name -> standardized store format.
# The mapping is also saved as JSON so it can be reloaded later.
replacements = {
    'FOOD STORE': 'Grocery Store',
    'FOOD AND DRUG': 'Pharmacy',
    'DRUGSTORE': 'Pharmacy',
    'FOOD & DRUGSTORE': 'Pharmacy',
    'WINE AND SPIRITS': 'Liquor Store',
    'WINE & SPIRITS': 'Liquor Store',
    'FAST & FRESH EXPRESS': 'Convenience Store',
    'FAST & FRESH': 'Convenience Store',
    'FAST AND FRESH': 'Convenience Store',
    'C STORE': 'Convenience Store',
}

with open('dicts/store_format_map.json', 'w') as map_file:
    json.dump(replacements, map_file)

In [67]:
# Any name containing a keyword (case-insensitive) takes that keyword's standardized format
for keyword, standard_name in replacements.items():
    matches = df['store_format'].str.contains(keyword, na=False, case=False)
    df.loc[matches, 'store_format'] = standard_name

df['store_format'].unique()

array([], dtype=object)

In [68]:
# Plain 'HY VEE' counts as a grocery store; anything that is still not one of
# the standardized formats is labelled 'Other'
def _finalize_store_format(name):
    """Return the final label for a name left over after keyword matching."""
    if name == 'HY VEE':
        return 'Grocery Store'
    if name in replacements.values():
        return name
    return 'Other'

df['store_format'] = df['store_format'].apply(_finalize_store_format)
df['store_format'].unique()

array([], dtype=object)

In [69]:
# Preview the first rows, now including the derived 'store_format' column
df.head()

Unnamed: 0,store_id,store_name,store_format


In [70]:
# Streamline the process for building pipeline
df['store_format'] = df['store_name']

df['store_format'] = (
    # Un-hyphenate the brand terms so the split on '-' below leaves them intact
    df['store_format'].str.replace('HY-VEE', 'HY VEE', case=False, regex=True)
                      .str.replace('C-STORE', 'C STORE', case=False, regex=True)
                      # Keep only the text before the first '/' or '-'
                      .str.split('/|-', n=1).str[0]
                      # Drop digits and the symbols '#', '(' and ')'
                      .str.replace('[0-9#()]+', '', regex=True)
                      .str.strip()
                      # Raw string: '\s' must reach the regex engine, not be
                      # parsed as an (invalid) Python string escape
                      .str.replace(r'\s+', ' ', regex=True)
)

# Reload the keyword -> store format mapping saved as JSON
with open('dicts/store_format_map.json', 'r') as f:
    store_format_map = json.load(f)

# Convert keys to lowercase for case-insensitive matching
replacements_lower = {k.lower(): v for k, v in store_format_map.items()}

# Use a function to replace based on the lowercase match
def apply_replacements(x, replacements=None, standard_names=None):
    """Map a cleaned store name to its standardized store format.

    Parameters
    ----------
    x : str or missing value
        Cleaned store name. Anything that is not a string (None, NaN, ...)
        is labelled 'Other'.
    replacements : dict, optional
        Lowercase keyword -> standardized format, checked in insertion order;
        the first keyword contained in the lowercased name wins.
        Defaults to the module-level ``replacements_lower``.
    standard_names : iterable of str, optional
        Standardized formats that are returned unchanged when ``x`` already
        equals one of them (case-insensitively).
        Defaults to the values of the module-level ``store_format_map``.

    Returns
    -------
    str
        The matched standardized format, 'Grocery Store' for a plain
        'HY VEE', ``x`` itself when it already is a standardized format,
        and 'Other' otherwise.
    """
    if replacements is None:
        replacements = replacements_lower
    if standard_names is None:
        standard_names = store_format_map.values()

    # Missing names have no .lower(); treat them like any unmatched name
    if not isinstance(x, str):
        return 'Other'

    x_lower = x.lower()
    for key, value in replacements.items():
        if key in x_lower:
            return value
    if x == 'HY VEE':
        return 'Grocery Store'
    if any(x_lower == name.lower() for name in standard_names):
        return x
    return 'Other'

# Classify every cleaned name; the result overwrites the 'store_format' column
df['store_format'] = df['store_format'].apply(apply_replacements)

In [None]:
# Load to db

In [71]:
# Preview the first rows instead of rendering the whole table
df.head()

Unnamed: 0,store_id,store_name,store_format


## 2. Liquor Type

Creating a new column to indicate liquor types based on the 'category' and 'category_code' columns.

In [72]:
# Load the category code and category name of every item from MySQL
conn_int = pymysql.connect(host=host, user=user, password=password, db='INT_HYVEE')
sql_query = "SELECT category_code, category FROM items"

try:
    # pandas drives the connection itself, so no explicit cursor is needed
    df = pd.read_sql(sql_query, conn_int)
finally:
    # Release the connection even when the query fails
    conn_int.close()

  df = pd.read_sql(sql_query, conn_int)


In [73]:
# Distinct (category_code, category) rows in ascending code order
(
    df
    .drop_duplicates()
    .sort_values('category_code')
)

Unnamed: 0,category_code,category


In [74]:
# Three-character category-code prefix -> liquor type.
# The mapping is also saved as JSON so it can be reloaded later.
liquor_type_map = {
    '101': 'Whiskey',
    '102': 'Tequila',
    '103': 'Vodka',
    '104': 'Gin',
    '105': 'Brandy',
    '106': 'Rum',
    '107': 'Cocktails',
    '108': 'Liqueur',
    '109': 'Neutral Grain/Distilled Spirits',
}

with open('dicts/liquor_type_map.json', 'w') as map_file:
    json.dump(liquor_type_map, map_file)

In [75]:
# Seed the new column with the raw category text
df['liquor_type'] = df['category']
# Preview the first rows instead of rendering the whole table
df.head()

Unnamed: 0,category_code,category,liquor_type


In [76]:
def match_liquor_type(number):
    """Return the liquor type for a category code.

    The first three characters of ``str(number)`` are looked up in
    ``liquor_type_map``; a prefix without an entry yields 'Other'.
    """
    prefix = str(number)[:3]
    if prefix in liquor_type_map:
        return liquor_type_map[prefix]
    return 'Other'

# Derive the liquor type of every row from its category code
df['liquor_type'] = df['category_code'].apply(match_liquor_type)

In [77]:
# Preview the first rows instead of rendering the whole table
df.head()

Unnamed: 0,category_code,category,liquor_type


In [None]:
# Load to db

## 3. Numerical Metrics

In [78]:
def execute_sql_file(filename, connection_params):
    """Run every statement of a SQL script against MySQL, one at a time.

    The script text is split on ';' and each non-blank piece is executed and
    committed separately. A failing statement is reported with ``print`` and
    the remaining statements still run.

    Note: the split is purely textual, so a ';' inside a string literal or a
    comment would cut a statement in two.

    Parameters
    ----------
    filename : str
        Path of the SQL script file.
    connection_params : dict
        Keyword arguments forwarded to ``pymysql.connect``.

    Returns
    -------
    None
    """
    # Read the script before connecting, so an unreadable file cannot leave
    # a database connection open
    with open(filename, 'r') as sql_file:
        sql_script = sql_file.read()

    # Skip any empty commands resulting from the split
    sql_commands = [command for command in sql_script.split(';') if command.strip()]

    db_connection = pymysql.connect(**connection_params)
    try:
        cursor = db_connection.cursor()
        try:
            for command in sql_commands:
                try:
                    cursor.execute(command)
                    db_connection.commit()
                except Exception as e:
                    # Best effort: report the failure and continue with the next statement
                    print(f"Error executing command: {command}\n{e}")
        finally:
            cursor.close()
    finally:
        # Always release the connection, whatever happened above
        db_connection.close()

# Keyword arguments for pymysql.connect, taken from the loaded MySQL config
connection_params = dict(
    host=mysql_config['hostname'],
    user=mysql_config['username'],
    password=mysql_config['password'],
)

# Run the metric definitions script statement by statement
execute_sql_file('Metrics.sql', connection_params)

Error executing command: 

-- Total volume of liquor ordered in gallons
ALTER TABLE transactions
ADD COLUMN sale_gallons DECIMAL(10, 2)
(1060, "Duplicate column name 'sale_gallons'")
Error executing command: 

-- Total volume of liquor ordered in liters
ALTER TABLE transactions
ADD COLUMN sale_liters DECIMAL(10, 2)
(1060, "Duplicate column name 'sale_liters'")
Error executing command: 

-- Gross profit per bottle
ALTER TABLE transactions
ADD COLUMN bottle_profit DECIMAL(10, 2)
(1060, "Duplicate column name 'bottle_profit'")
Error executing command: 

-- Sales revenue
ALTER TABLE transactions
ADD COLUMN revenue DECIMAL(10, 2)
(1060, "Duplicate column name 'revenue'")
Error executing command: 

-- Total Gross Profit
ALTER TABLE transactions
ADD COLUMN profit DECIMAL(10, 2)
(1060, "Duplicate column name 'profit'")
Error executing command: 

-- Profit margin
ALTER TABLE transactions
ADD COLUMN margin DECIMAL(10, 2)
(1060, "Duplicate column name 'margin'")
Error executing command: 

-- Gros