In [1]:
import pandas as pd
from sodapy import Socrata
import json
# from datetime import datetime, timedelta
import datetime
import pymysql

In [2]:
# Socrata API
# Credentials are read from a JSON file so they are not hard-coded in the notebook.
# The path uses a forward slash: a backslash before 's' is an invalid string
# escape in Python and only resolves as a path separator on Windows, whereas
# '/' is accepted by open() on every platform.
with open('config/socrata_config.json') as f:
    socrata_config = json.load(f)

AppToken = socrata_config['app_token']
UserName = socrata_config['user_name']
Password = socrata_config["password"]

# Authenticated client for the data.iowa.gov portal; requests time out after 30 s
client = Socrata("data.iowa.gov",
                 AppToken,
                 username = UserName,
                 password = Password,
                 timeout=30)


In [3]:
# MySQL connection settings
# Read from a JSON file so credentials are not hard-coded in the notebook.
# The path uses a forward slash: a backslash before 'm' is an invalid string
# escape in Python and only resolves as a path separator on Windows, whereas
# '/' is accepted by open() on every platform.
with open('config/mysql_config.json') as f:
    mysql_config = json.load(f)

host = mysql_config['hostname']
user = mysql_config['username']
password = mysql_config['password']

In [4]:
# Placeholder and data type conversion dictionaries
# placeholders       -> passed to DataFrame.fillna() inside transform_data
# num_col_dtype_map  -> column name mapped to a type tag; transform_data casts
#                       'int' columns to int and every other listed column to float
with open('dicts/placeholders.json', 'r') as placeholders_file:
    placeholders = json.load(placeholders_file)

with open('dicts/num_col_dtype_map.json', 'r') as dtype_map_file:
    num_col_dtype_map = json.load(dtype_map_file)

In [5]:
# Function for extracting data via Socrata API
def extract_data(client, year, month, batch_size):
    """Yield one calendar month of matching rows from dataset ``m3tr-qhgy``, page by page.

    The query keeps rows whose ``name`` matches the Hy-Vee / Wall to Wall filter
    and whose ``date`` lies in the half-open interval
    [first day of ``month``, first day of the following month).

    Args:
        client: Object exposing ``get(dataset_id, **soql_params)``
            (the notebook passes a sodapy ``Socrata`` client).
        year: Calendar year as an int.
        month: Calendar month as an int in 1..12.
        batch_size: Maximum number of rows requested per call.

    Yields:
        Each non-empty page returned by ``client.get``. Iteration stops at the
        first empty page.

    Note:
        The selected columns come from the module-level ``col_selected``, which
        must be defined before this generator is consumed.
    """
    # Zero-pad the month so the timestamp literals are well-formed ISO 8601
    # (e.g. 2024-01-01T00:00:00.000 rather than 2024-1-01T00:00:00.000).
    start_date = f"{year}-{month:02d}-01T00:00:00.000"
    if month == 12:
        # December rolls over into January of the next year
        end_date = f"{year + 1}-01-01T00:00:00.000"
    else:
        end_date = f"{year}-{month + 1:02d}-01T00:00:00.000"

    offset = 0
    while True:
        results = client.get("m3tr-qhgy",
                             select=col_selected,
                             where=f"(LOWER(name) LIKE '%hy-vee%' OR name LIKE '%WALL TO WALL WINE AND SPIRITS%') AND date >= '{start_date}' AND date < '{end_date}'",
                             # Offset paging is only stable with an explicit ordering;
                             # without it rows can repeat or be skipped between pages.
                             order=":id",
                             limit=batch_size,
                             offset=offset)
        if not results:
            break
        yield results
        # Advance by what was actually received, not by the requested page size
        offset += len(results)

In [6]:
# Function for transforming data 
def transform_data(df):
    """Return a cleaned copy of ``df``; the frame passed in is left untouched.

    Steps, in order:
      1. Fill missing values using the module-level ``placeholders`` mapping.
      2. Rewrite ``date`` as a ``YYYY-MM-DD`` string.
      3. Cast every column listed in the module-level ``num_col_dtype_map``:
         ``'int'`` entries go through float and then int, all others to float.
      4. Keep only rows where ``state_bottle_cost``, ``state_bottle_retail`` and
         ``sale_bottles`` are all strictly positive.

    Args:
        df: DataFrame containing ``date`` plus every column named in
            ``num_col_dtype_map`` and the three columns used by the filter.

    Returns:
        A new DataFrame holding the surviving rows (original index labels kept).
    """
    # fillna without inplace returns a new frame, so none of the assignments
    # below leak back into the caller's DataFrame.
    cleaned = df.fillna(placeholders)
    cleaned['date'] = pd.to_datetime(cleaned['date']).dt.strftime('%Y-%m-%d')

    for col, col_type in num_col_dtype_map.items():
        # Values are parsed as float first so strings such as '6.0' still cast to int
        as_float = cleaned[col].astype(float)
        cleaned[col] = as_float.astype(int) if col_type == 'int' else as_float

    is_valid = (
        (cleaned['state_bottle_cost'] > 0)
        & (cleaned['state_bottle_retail'] > 0)
        & (cleaned['sale_bottles'] > 0)
    )
    return cleaned[is_valid]

In [7]:
# Function for loading data to a MySQL database
def load_data(conn, cursor, df, batch_size, sql_insert_query):
    """Insert the rows of ``df`` one at a time, committing after every batch.

    Rows rejected as duplicates are reported with ``print`` and skipped; any
    other error rolls back the uncommitted part of the current batch and is
    re-raised.

    Args:
        conn: Connection object providing ``commit()`` and ``rollback()``.
        cursor: Cursor object providing ``execute(query, params)``.
        df: DataFrame whose column order matches the placeholders in
            ``sql_insert_query`` (the index is not inserted).
        batch_size: Number of rows per commit; must be positive.
        sql_insert_query: Parameterized INSERT statement.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        pymysql.err.IntegrityError: For integrity violations other than a
            duplicate key.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently load nothing
        raise ValueError("batch_size must be a positive integer")

    duplicate_entry_errno = 1062  # MySQL ER_DUP_ENTRY

    data_tuples = list(df.itertuples(index=False, name=None))
    for start in range(0, len(data_tuples), batch_size):
        try:
            for row in data_tuples[start:start + batch_size]:
                try:
                    cursor.execute(sql_insert_query, row)
                except pymysql.err.IntegrityError as e:
                    # Prefer the numeric error code; the message text is kept as a
                    # fallback so the previous matching rule still applies.
                    is_duplicate = (
                        (bool(e.args) and e.args[0] == duplicate_entry_errno)
                        or 'Duplicate entry' in str(e)
                    )
                    if not is_duplicate:
                        raise
                    # Log the error and skip the duplicated row
                    print(f"Duplicate entry skipped: {row}")
        except Exception:
            # Discard the rows of this batch that were executed but not committed
            conn.rollback()
            raise
        conn.commit()

In [8]:
# Parameterized INSERT for the `sales` table. load_data executes it once per
# row, filling the 18 %s placeholders positionally from the row tuple, so the
# column order listed here must match the column order of the DataFrame being
# loaded (the same names, in the same order, as col_selected).
load_sql = """
INSERT INTO sales (
    invoice_line_no, date, store, name, address, city, zipcode, county, category, category_name, vendor_no, vendor_name, itemno, im_desc, bottle_volume_ml, state_bottle_cost, state_bottle_retail, sale_bottles
) 
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

In [9]:
# Comma-separated column list passed as the `select` argument of the Socrata
# query in extract_data. It lists the same columns, in the same order, as the
# INSERT statement in load_sql.
col_selected = 'invoice_line_no, date, store, name, address, city, zipcode, county, category, category_name, vendor_no, vendor_name, itemno, im_desc, bottle_volume_ml, state_bottle_cost, state_bottle_retail, sale_bottles'


In [11]:
# Start ETL procedure
#
# Extracts whole calendar months from Socrata, transforms each page and inserts
# it into the `sales` table of both STG_HYVEE and INT_HYVEE.
# Only months that have fully elapsed are loaded: the current month is the
# exclusive upper bound in both branches below. An incremental run resumes from
# the month after MAX(date), so loading a partial month would cause the rest of
# that month to be skipped on the next run.

today = datetime.datetime.now()

batch_size = 8000

# Column order required by the positional placeholders in load_sql. A frame
# built from dict records takes its column order (and presence) from the
# payload, so it is pinned explicitly before loading.
insert_columns = [name.strip() for name in col_selected.split(',')]

# Pre-declare the handles so the finally block can close whatever was opened,
# even when a connection attempt or the MAX(date) query fails.
conn_stg = cursor_stg = conn_int = cursor_int = None

try:
    # Establish connections to both STG_HYVEE and INT_HYVEE databases
    conn_stg = pymysql.connect(host=host, user=user, password=password, db='STG_HYVEE')
    cursor_stg = conn_stg.cursor()

    conn_int = pymysql.connect(host=host, user=user, password=password, db='INT_HYVEE')
    cursor_int = conn_int.cursor()

    cursor_stg.execute("SELECT MAX(date) FROM sales")
    latest_date_result = cursor_stg.fetchone()

    # If records exist in the database, load the data following the most recent entry
    if latest_date_result and latest_date_result[0]:
        latest_date = latest_date_result[0]
        if latest_date.month == 12:
            start_year = latest_date.year + 1
            start_month = 1
        else:
            start_year = latest_date.year
            start_month = latest_date.month + 1

    # If the database contains no records, then load data starting from 3 fiscal years ago
    else:
        # The fiscal year starts in July. Before July the current fiscal year
        # began in the previous calendar year, so going back three fiscal years
        # means four calendar years.
        if today.month < 7:
            start_year = today.year - 4
        else:
            start_year = today.year - 3
        start_month = 7

    end_year = today.year
    end_month = today.month  # exclusive: the month still in progress is not loaded

    for year in range(start_year, end_year + 1):
        first_month = start_month if year == start_year else 1
        stop_month = end_month if year == end_year else 13  # range() end is exclusive
        for month in range(first_month, stop_month):
            print(f'----- Batch: {year}-{month} -----')
            for results in extract_data(client, year, month, batch_size):
                df = pd.DataFrame.from_records(results).reindex(columns=insert_columns)
                df_transformed = transform_data(df)
                load_data(conn_stg, cursor_stg, df_transformed, batch_size, load_sql)
                load_data(conn_int, cursor_int, df_transformed, batch_size, load_sql)
finally:
    # Close each cursor before its connection; skip handles that were never opened
    for handle in (cursor_stg, conn_stg, cursor_int, conn_int):
        if handle is not None:
            handle.close()

----- Batch: 2024-1 -----
Duplicate entry skipped: ('INV-66363200016', '2024-01-17', 2512, 'HY-VEE WINE AND SPIRITS #1 (1281) / IOWA CITY', '1720 WATERFRONT DR', 'IOWA CITY', 52240, 'JOHNSON', 1081300, 'AMERICAN CORDIALS & LIQUEURS', 305, 'MHW LTD', 65047, 'DORDA SEA SALT CARAMEL LIQUEUR', 750, 11.0, 16.5, 6)
Duplicate entry skipped: ('INV-66363200016', '2024-01-17', 2512, 'HY-VEE WINE AND SPIRITS #1 (1281) / IOWA CITY', '1720 WATERFRONT DR', 'IOWA CITY', 52240, 'JOHNSON', 1081300, 'AMERICAN CORDIALS & LIQUEURS', 305, 'MHW LTD', 65047, 'DORDA SEA SALT CARAMEL LIQUEUR', 750, 11.0, 16.5, 6)
Duplicate entry skipped: ('INV-66785600084', '2024-01-31', 2647, 'HY-VEE #7 / CEDAR RAPIDS', '5050 EDGEWOOD RD', 'CEDAR RAPIDS', 52411, 'LINN', 1082100, 'IMPORTED CORDIALS & LIQUEURS', 305, 'MHW LTD', 65111, "GALLIANO L'AUTENTICO", 375, 10.75, 16.13, 3)
Duplicate entry skipped: ('INV-66785600084', '2024-01-31', 2647, 'HY-VEE #7 / CEDAR RAPIDS', '5050 EDGEWOOD RD', 'CEDAR RAPIDS', 52411, 'LINN', 108210

In [None]:
# Testing date related loop
# Dry run of the month iteration: prints each year-month that would be
# processed, using a fixed "today" and no database or API access.

# today = datetime.datetime.now()
today = datetime.datetime.strptime('2024-02-20', '%Y-%m-%d')

batch_size = 8000

latest_date_result = None

if not latest_date_result:
    # If the database contains no records, then load data starting from 3 fiscal years ago
    # today.month < [the month fiscal year starts]
    start_year = today.year - (4 if today.month < 7 else 3)
    start_month = 7
    end_month = today.month - 1
else:
    # If records exist in the database, load the data following the most recent entry
    latest_date = datetime.datetime.strptime(latest_date_result, '%Y-%m-%d')
    start_year, start_month = (
        (latest_date.year + 1, 1)
        if latest_date.month == 12
        else (latest_date.year, latest_date.month + 1)
    )
    end_month = today.month

end_year = today.year

for year in range(start_year, end_year + 1):
    # First year begins at start_month; the last year stops after end_month
    first_month = 1 if year != start_year else start_month
    stop_month = 13 if year != end_year else end_month + 1
    for month in range(first_month, stop_month):
        print(f'{year}-{month}')