# Feature Engineering

## 1. Extracting Data

In [1]:
import pandas as pd
import json
from datetime import datetime, timedelta
import pymysql

In [2]:
# MySQL connection settings
# A forward-slash path is used so the config file resolves on both Windows
# and POSIX systems and contains no backslash escape sequences.
with open('config/mysql_config.json') as f:
    mysql_config = json.load(f)

# Credentials are read from the JSON config rather than hard-coded here.
host = mysql_config['hostname']
user = mysql_config['username']
password = mysql_config['password']

# Connection and cursor on the INT_HYVEE database.
conn_int = pymysql.connect(host=host, user=user, password=password, db='INT_HYVEE')
cursor_int = conn_int.cursor()

In [3]:
# Loading data from MySQL
sql_query = "SELECT invoice_line_no, name, category_name FROM sales"

# Release the cursor and the connection even when the query raises, so a
# failed load does not leave a database session open.
try:
    df = pd.read_sql(sql_query, conn_int)
finally:
    cursor_int.close()
    conn_int.close()

  df = pd.read_sql(sql_query, conn_int)


In [4]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,invoice_line_no,name,category_name
0,INV-33169200001,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES
1,INV-33169200002,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES
2,INV-33169200003,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES
3,INV-33169200004,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES
4,INV-33169200005,HY-VEE #3 / BDI / DES MOINES,IRISH WHISKIES


## 2. Feature Engineering

### 2.1. Store Format

Creating a new column to indicate store formats based on the 'name' column.

In [5]:
# Seed 'store_format' with the raw store name, then list the distinct names
# that contain a '/' separator (missing names count as non-matching).
df['store_format'] = df['name']
has_slash = df['store_format'].str.contains('/', na=False)
df.loc[has_slash, 'store_format'].unique()

array(['HY-VEE #3 / BDI / DES MOINES',
       'HY-VEE FOOD STORE #3 / DES MOINES',
       'HY-VEE FOOD STORE #5 / DES MOINES',
       'HY-VEE FOOD STORE / FLEUR / DSM', 'HY-VEE #7 / CEDAR RAPIDS',
       'HY-VEE FOOD STORE #3 / CEDAR RAPIDS',
       'HY-VEE FOOD STORE #4 / CEDAR RAPIDS',
       'HY-VEE / DRUGTOWN #1 / CEDAR RAPIDS', 'HY-VEE #4 / WDM',
       'HY-VEE WINE AND SPIRITS / WDM', 'HY-VEE DRUGSTORE #2 / WDM',
       'HY-VEE  /  JEFFERSON', 'HY-VEE FOOD STORE / CARROLL',
       'HY-VEE FOOD STORE / CHEROKEE',
       'HY-VEE WINE AND SPIRITS / STORM LAKE',
       'HY-VEE WINE AND SPIRITS / IOWA CITY',
       'HY-VEE FOOD STORE / CORALVILLE',
       'HY-VEE #3 FOOD & DRUGSTORE / DAVENPORT',
       'HY-VEE DRUGSTORE / DAVENPORT',
       'HY-VEE FOOD STORE #2 / STATE ANKENY',
       'HY-VEE FOOD STORE / IOWA FALLS', 'HY-VEE FOOD STORE / ELDORA',
       'HY-VEE FOOD AND DRUG / CLINTON',
       'HY-VEE FOOD STORE #1 / CEDAR RAPIDS',
       'HY-VEE FOOD STORE / DUBUQUE', 'HY-VEE C-ST

The substring following the '/' in each entry of the 'store_format' column appears to denote location information. 

This detail is not relevant for identifying the store format itself. 

Consequently, we will focus on the data preceding the '/' for a more accurate characterization of store formats.

In [6]:
# Cut every 'store_format' value at its first '/' and keep the left-hand piece.
name_pieces = df['store_format'].str.split('/', n=1)
df['store_format'] = name_pieces.str.get(0)
df['store_format'].unique()

array(['HY-VEE #3 ', 'HY-VEE FOOD STORE #3 ', 'HY-VEE FOOD STORE #5 ',
       'HY-VEE FOOD STORE ', 'HY-VEE #7 ', 'HY-VEE FOOD STORE #4 ',
       'HY-VEE ', 'HY-VEE FAST & FRESH - DES MOINES', 'HY-VEE #4 ',
       'HY-VEE WINE AND SPIRITS ', 'HY-VEE DRUGSTORE #2 ',
       'HY-VEE C-STORE - DOUGLAS', 'HY-VEE  ',
       'HY-VEE #3 FOOD & DRUGSTORE ', 'HY-VEE DRUGSTORE ',
       'HY-VEE FOOD STORE #2 ', 'HY-VEE HAMPTON DOLLAR FRESH',
       'HY-VEE FOOD AND DRUG ', 'HY-VEE FOOD STORE #1 ',
       'HY-VEE C-STORE #3 ', 'HY-VEE DYERSVILLE DOLLAR FRESH',
       'HY-VEE WINE & SPIRITS ', 'HY-VEE GAS #3 ',
       'HY-VEE FORT DODGE WINE AND SPIRITS', 'HY-VEE #5 ',
       'HY-VEE C-STORE - EAST HICKMAN', 'HY-VEE OELWEIN DOLLAR FRESH',
       'HY-VEE #2 ', 'HY-VEE MAINSTREET ', 'HY-VEE # 6', 'HY-VEE  #2 ',
       'HY-VEE DRUGSTORE #5 ', 'HY-VEE DRUGSTORE #4 ',
       'HY-VEE C-STORE #2 - ANKENY',
       'HY-VEE FAST AND FRESH - WINDSOR HEIGHTS', 'HY-VEE -GARNER',
       'HY-VEE - FOREST CITY', '

In [7]:
# Drop every run of digits, '#', '(' and ')' from the store-format text.
without_symbols = df['store_format'].str.replace('[0-9#()]+', '', regex=True)
df['store_format'] = without_symbols
df['store_format'].unique()

array(['HY-VEE  ', 'HY-VEE FOOD STORE  ', 'HY-VEE FOOD STORE ', 'HY-VEE ',
       'HY-VEE FAST & FRESH - DES MOINES', 'HY-VEE WINE AND SPIRITS ',
       'HY-VEE DRUGSTORE  ', 'HY-VEE C-STORE - DOUGLAS',
       'HY-VEE  FOOD & DRUGSTORE ', 'HY-VEE DRUGSTORE ',
       'HY-VEE HAMPTON DOLLAR FRESH', 'HY-VEE FOOD AND DRUG ',
       'HY-VEE C-STORE  ', 'HY-VEE DYERSVILLE DOLLAR FRESH',
       'HY-VEE WINE & SPIRITS ', 'HY-VEE GAS  ',
       'HY-VEE FORT DODGE WINE AND SPIRITS',
       'HY-VEE C-STORE - EAST HICKMAN', 'HY-VEE OELWEIN DOLLAR FRESH',
       'HY-VEE MAINSTREET ', 'HY-VEE   ', 'HY-VEE C-STORE  - ANKENY',
       'HY-VEE FAST AND FRESH - WINDSOR HEIGHTS', 'HY-VEE -GARNER',
       'HY-VEE - FOREST CITY', 'HY-VEE DOLLAR FRESH - EMMETSBURG',
       'HY-VEE OTTUMWA', 'HY-VEE WINE & SPIRITS  ',
       'HY-VEE FAST AND FRESH ', 'HY-VEE FOOD AND DRUG  ',
       'HY-VEE FAST & FRESH', 'HY-VEE  FOOD STORE ', 'HY-VEE STORE ',
       'HY-VEE FAST & FRESH EXPRESS ', 'HY-VEE WAUKON DOLLAR FRES

In [8]:
# Remove leading and trailing spaces
df['store_format'] = df['store_format'].str.strip()

# Replace consecutive whitespace with a single space.
# The pattern is a raw string so that the backslash in \s reaches the regex
# engine untouched instead of being parsed as a string escape sequence.
df['store_format'] = df['store_format'].str.replace(r'\s+', ' ', regex=True)

df['store_format'].unique()

array(['HY-VEE', 'HY-VEE FOOD STORE', 'HY-VEE FAST & FRESH - DES MOINES',
       'HY-VEE WINE AND SPIRITS', 'HY-VEE DRUGSTORE',
       'HY-VEE C-STORE - DOUGLAS', 'HY-VEE FOOD & DRUGSTORE',
       'HY-VEE HAMPTON DOLLAR FRESH', 'HY-VEE FOOD AND DRUG',
       'HY-VEE C-STORE', 'HY-VEE DYERSVILLE DOLLAR FRESH',
       'HY-VEE WINE & SPIRITS', 'HY-VEE GAS',
       'HY-VEE FORT DODGE WINE AND SPIRITS',
       'HY-VEE C-STORE - EAST HICKMAN', 'HY-VEE OELWEIN DOLLAR FRESH',
       'HY-VEE MAINSTREET', 'HY-VEE C-STORE - ANKENY',
       'HY-VEE FAST AND FRESH - WINDSOR HEIGHTS', 'HY-VEE -GARNER',
       'HY-VEE - FOREST CITY', 'HY-VEE DOLLAR FRESH - EMMETSBURG',
       'HY-VEE OTTUMWA', 'HY-VEE FAST AND FRESH', 'HY-VEE FAST & FRESH',
       'HY-VEE STORE', 'HY-VEE FAST & FRESH EXPRESS',
       'HY-VEE WAUKON DOLLAR FRESH', 'HY-VEE FAST & FRESH - KNOXVILLE',
       'HY-VEE GAS - PLEASANT HILL', 'HY-VEE FAST AND FRESH - URBANDALE',
       'HY-VEE GAS - WDM', 'HY-VEE FAST AND FRESH - DUBUQUE',
  

In [9]:
# Standardizing the names
# Each key is searched for, case-insensitively, inside 'store_format'; every
# matching row is overwritten with the canonical label on the right. Keys are
# applied in dictionary order, so a later key may relabel rows that an earlier
# key has already rewritten.
replacements = {
    'FOOD STORE': 'HY-VEE',
    'FOOD AND DRUG': 'HY-VEE FOOD & DRUG',
    'DRUGSTORE': 'HY-VEE FOOD & DRUG',
    'FOOD & DRUGSTORE': 'HY-VEE FOOD & DRUG',
    'WINE AND SPIRITS': 'HY-VEE WINE & SPIRITS',
    'WINE & SPIRITS': 'HY-VEE WINE & SPIRITS',
    'FAST & FRESH EXPRESS': 'HY-VEE FAST & FRESH',
    'FAST & FRESH': 'HY-VEE FAST & FRESH',
    'FAST AND FRESH': 'HY-VEE FAST & FRESH',
    'GAS': 'HY-VEE GAS',
    'GASE': 'HY-VEE GAS',
    'C-STORE': 'HY-VEE C-STORE',
    'C-STRORE': 'HY-VEE C-STORE',
    'C STORE': 'HY-VEE C-STORE',
    'DOLLAR FRESH': 'HY-VEE DOLLAR FRESH'
}

for pattern, canonical_name in replacements.items():
    matches_pattern = df['store_format'].str.contains(pattern, na=False, case=False)
    df.loc[matches_pattern, 'store_format'] = canonical_name

df['store_format'].unique()

array(['HY-VEE', 'HY-VEE FAST & FRESH', 'HY-VEE WINE & SPIRITS',
       'HY-VEE FOOD & DRUG', 'HY-VEE C-STORE', 'HY-VEE DOLLAR FRESH',
       'HY-VEE GAS', 'HY-VEE MAINSTREET', 'HY-VEE -GARNER',
       'HY-VEE - FOREST CITY', 'HY-VEE OTTUMWA', 'HY-VEE STORE',
       'HY-VEE WDM HEALTH MARKET'], dtype=object)

In [11]:
# Lookup from standardized chain name to store-format category.
format_replacements = {
    'HY-VEE': 'Grocery Store',
    'HY-VEE FOOD & DRUG': 'Pharmacy',
    'HY-VEE WINE & SPIRITS': 'Spirits Store',
    'HY-VEE C-STORE': 'Convenience Store',
    'HY-VEE FAST & FRESH': 'Convenience Store',
    'HY-VEE DOLLAR FRESH': 'Dollar Store',
}

# Translate every name through the lookup; anything without an entry is
# labelled 'Other'.
standardized_names = df['store_format']
df['store_format'] = standardized_names.map(
    lambda chain_name: format_replacements.get(chain_name, 'Other')
)

In [12]:
# Distinct store-format categories after the mapping.
df.loc[:, 'store_format'].unique()

array(['Grocery Store', 'Convenience Store', 'Spirits Store', 'Pharmacy',
       'Dollar Store', 'Other'], dtype=object)

In [14]:
# Preview the first five rows, now including 'store_format'.
df.head(n=5)

Unnamed: 0,invoice_line_no,name,category_name,store_format
0,INV-33169200001,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES,Grocery Store
1,INV-33169200002,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES,Grocery Store
2,INV-33169200003,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES,Grocery Store
3,INV-33169200004,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES,Grocery Store
4,INV-33169200005,HY-VEE #3 / BDI / DES MOINES,IRISH WHISKIES,Grocery Store


### 2.2. Liquor Type

Creating a new column to indicate liquor types based on the 'category_name' column.

In [15]:
# Seed 'liquor_type' with the raw category name and list its distinct values.
df['liquor_type'] = df.loc[:, 'category_name']
df.loc[:, 'liquor_type'].unique()

array(['CANADIAN WHISKIES', 'IRISH WHISKIES', 'STRAIGHT RYE WHISKIES',
       'STRAIGHT BOURBON WHISKIES', 'BLENDED WHISKIES',
       'IMPORTED DRY GINS', 'AMERICAN DRY GINS',
       'IMPORTED FLAVORED VODKA', 'IMPORTED VODKAS', 'AMERICAN VODKAS',
       'AMERICAN FLAVORED VODKA', 'FLAVORED RUM', 'WHITE RUM',
       'SPICED RUM', 'AMERICAN BRANDIES', 'SCOTCH WHISKIES',
       'IMPORTED CORDIALS & LIQUEURS', 'CREAM LIQUEURS',
       'IMPORTED SCHNAPPS', 'AMERICAN CORDIALS & LIQUEURS',
       'IMPORTED DISTILLED SPIRITS SPECIALTY',
       'AMERICAN DISTILLED SPIRITS SPECIALTY', 'AMERICAN SCHNAPPS',
       'TENNESSEE WHISKIES', 'TRIPLE SEC', '100% AGAVE TEQUILA',
       'MIXTO TEQUILA', 'MEZCAL', 'WHISKEY LIQUEUR',
       'SPECIAL ORDER ITEMS', 'IMPORTED BRANDIES',
       'TEMPORARY & SPECIALTY PACKAGES', 'COCKTAILS/RTD', 'AGED DARK RUM',
       'NEUTRAL GRAIN SPIRITS', 'GOLD RUM', 'COFFEE LIQUEURS',
       'NEUTRAL GRAIN SPIRITS FLAVORED', 'BOTTLED IN BOND BOURBON',
       'CORN WHISKIES

In [16]:
# Extract the last word
# The vectorised string accessor yields a missing value for a missing or
# blank category name, where calling x.split()[-1] on such a value would
# raise instead.
df['liquor_type'] = df['liquor_type'].str.split().str[-1]
df['liquor_type'].unique()

array(['WHISKIES', 'GINS', 'VODKA', 'VODKAS', 'RUM', 'BRANDIES',
       'LIQUEURS', 'SCHNAPPS', 'SPECIALTY', 'SEC', 'TEQUILA', 'MEZCAL',
       'LIQUEUR', 'ITEMS', 'PACKAGES', 'COCKTAILS/RTD', 'SPIRITS',
       'FLAVORED', 'BOURBON', 'SCOTCH', 'GIN'], dtype=object)

In [17]:
# Lookup from the last word of the category name to a liquor type.
type_replacements = {
    'WHISKIES': 'Whisky',
    'GINS': 'Gin',
    'VODKA': 'Vodka',
    'VODKAS': 'Vodka',
    'RUM': 'Rum',
    'BRANDIES': 'Brandy',
    'LIQUEURS': 'Liqueur',
    'SCHNAPPS': 'Liqueur',
    'SEC': 'Liqueur',
    'TEQUILA': 'Tequila',
    'LIQUEUR': 'Liqueur',
    'BOURBON': 'Whisky',
    'SCOTCH': 'Whisky',
    'GIN': 'Gin',
}

# Translate every keyword through the lookup; anything without an entry is
# labelled 'Other'.
category_keywords = df['liquor_type']
df['liquor_type'] = category_keywords.map(
    lambda keyword: type_replacements.get(keyword, 'Other')
)

In [18]:
# Distinct liquor types after the mapping.
df.loc[:, 'liquor_type'].unique()

array(['Whisky', 'Gin', 'Vodka', 'Rum', 'Brandy', 'Liqueur', 'Other',
       'Tequila'], dtype=object)

In [19]:
# Preview the first five rows, now including 'liquor_type'.
df.head(n=5)

Unnamed: 0,invoice_line_no,name,category_name,store_format,liquor_type
0,INV-33169200001,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES,Grocery Store,Whisky
1,INV-33169200002,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES,Grocery Store,Whisky
2,INV-33169200003,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES,Grocery Store,Whisky
3,INV-33169200004,HY-VEE #3 / BDI / DES MOINES,CANADIAN WHISKIES,Grocery Store,Whisky
4,INV-33169200005,HY-VEE #3 / BDI / DES MOINES,IRISH WHISKIES,Grocery Store,Whisky


## 3. Loading Data

In [32]:
# Open a new connection and cursor on the INT_HYVEE database.
connection_settings = {'host': host, 'user': user, 'password': password, 'db': 'INT_HYVEE'}
conn_int = pymysql.connect(**connection_settings)
cursor_int = conn_int.cursor()

In [21]:
# Adding new columns in MySQL database
# Reuse the current connection while it is still open; reconnecting
# unconditionally would rebind `conn_int` and abandon an open session
# without closing it.
if not conn_int.open:
    conn_int = pymysql.connect(host=host, user=user, password=password, db='INT_HYVEE')
cursor_int = conn_int.cursor()

add_store_format = "ALTER TABLE sales ADD COLUMN store_format VARCHAR(255);"
add_liquor_type = "ALTER TABLE sales ADD COLUMN liquor_type VARCHAR(255);"

# Run each ALTER on its own so that a failure on the first column (for
# example because it already exists from an earlier run) does not prevent
# the second column from being added.
# NOTE(review): recent PyMySQL releases appear to report such server errors as
# OperationalError rather than InternalError, so both are handled - confirm
# against the installed version.
columns_added = True
for alter_statement in (add_store_format, add_liquor_type):
    try:
        cursor_int.execute(alter_statement)
    except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
        columns_added = False
        print(f"An error occurred: {e}")

conn_int.commit()
if columns_added:
    print("Columns added successfully.")

Columns added successfully.


In [33]:
# Creating a Temporary Table
def create_temp_table(cursor):
    """Create the `temp_sales_updates` temporary table if it does not exist.

    The table has the columns `invoice_line_no`, `store_format` and
    `liquor_type` (all VARCHAR(255)) and an index on `invoice_line_no`.

    Args:
        cursor: Object exposing `execute(query)`; the statement is run on it.
    """
    cursor.execute("""
    CREATE TEMPORARY TABLE IF NOT EXISTS temp_sales_updates (
        invoice_line_no VARCHAR(255),
        store_format VARCHAR(255),
        liquor_type VARCHAR(255),
        INDEX (invoice_line_no)
    );
    """)

# Create the staging table through the current cursor.
create_temp_table(cursor=cursor_int)

In [34]:
# Inserting Data into Temporary Table in Batches
def batch_insert_to_temp_table(cursor, batch_data):
    """Insert one batch of rows into `temp_sales_updates`.

    Args:
        cursor: Object exposing `executemany(query, rows)`.
        batch_data: Sequence of (invoice_line_no, store_format, liquor_type)
            values, one entry per row; passed to `executemany` unchanged.
    """
    statement = """
    INSERT INTO temp_sales_updates (invoice_line_no, store_format, liquor_type) 
    VALUES (%s, %s, %s);
    """
    cursor.executemany(statement, batch_data)

batch_size = 1000  # Adjust the batch size according to your requirements
update_columns = ['invoice_line_no', 'store_format', 'liquor_type']

# Walk the frame in slices of `batch_size` rows. itertuples(index=False,
# name=None) yields one plain tuple per row, avoiding the per-row Series
# that iterrows() constructs.
for start in range(0, len(df), batch_size):
    chunk = df.iloc[start:start + batch_size]
    batch_data = list(chunk[update_columns].itertuples(index=False, name=None))
    batch_insert_to_temp_table(cursor_int, batch_data)
    # Commit after every batch.
    conn_int.commit()


In [35]:
# Indexing `invoice_line_no` for optimized performance
def create_index(cursor):
    """Create the `idx_invoice_line_no` index on `sales (invoice_line_no)`.

    Any exception raised while executing the statement is caught and
    reported with a printed message rather than propagated; a success
    message is printed otherwise.

    Args:
        cursor: Object exposing `execute(query)`.
    """
    statement = """
    CREATE INDEX idx_invoice_line_no ON sales (invoice_line_no); 
    """
    try:
        cursor.execute(statement)
    except Exception as exc:
        print(f"An error occurred: {exc}")
    else:
        print("Index created successfully.")

# Attempt the index build, then commit on the same connection.
create_index(cursor=cursor_int)
conn_int.commit()


An error occurred: (1061, "Duplicate key name 'idx_invoice_line_no'")


In [36]:
# Updating Main Table Using Temporary Table
def update_main_table_from_temp(cursor):
    """Copy `store_format` and `liquor_type` from the temp table into `sales`.

    Rows of `sales` are joined to `temp_sales_updates` on `invoice_line_no`
    and both columns are set from the joined row.

    Args:
        cursor: Object exposing `execute(query)`.
    """
    cursor.execute("""
    UPDATE sales s
    JOIN temp_sales_updates t ON s.invoice_line_no = t.invoice_line_no
    SET s.store_format = t.store_format, s.liquor_type = t.liquor_type;
    """)

# Apply the update and commit it. The cursor and connection are released in
# `finally` so a failing UPDATE or commit does not leave the session open.
try:
    update_main_table_from_temp(cursor_int)
    conn_int.commit()
finally:
    cursor_int.close()
    conn_int.close()