In [2]:
# !pip install --upgrade pip
# !pip install pandas
# !pip install sodapy

# Data Profiling

Analyzing a subset of data to understand its characteristics, guiding the design of an efficient data pipeline.

## Data Extraction

In [3]:
import pandas as pd
from sodapy import Socrata
import json
from datetime import datetime, timedelta

In [4]:
with open('config/socrata_config.json') as f:
  config = json.load(f)

AppToken = config['app_token']
UserName = config['user_name']
Password = config["password"]

In [5]:
client = Socrata("data.iowa.gov",
                 AppToken,
                 username = UserName,
                 password = Password,
                 timeout=30)

In [6]:
col_selected = 'invoice_line_no, date, store, name, city, zipcode, county, category, category_name, vendor_no, vendor_name, itemno, im_desc, state_bottle_cost, state_bottle_retail, sale_bottles'
col_list = ['invoice_line_no', 'date', 'store', 'name', 'city', 'zipcode', 'county', 'category', 'category_name', 'itemno', 'im_desc', 'state_bottle_cost', 'state_bottle_retail', 'sale_bottles']
len(col_list)

14

In [7]:
start_date = "2023-06-01"
end_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]  

total_rows = []  
limit = 5000  
offset = 0  
more_data = True  

while more_data:
    results = client.get("m3tr-qhgy",
                         select=col_selected, 
                         where=f"LOWER(name) LIKE '%hy-vee%' AND date >= '{start_date}' AND date < '{end_date}'", 
                         limit=limit, 
                         offset=offset)
    total_rows.extend(results)  

    if len(results) < limit:
        more_data = False
    else:
        offset += limit 

df = pd.DataFrame.from_records(total_rows)
print(f"Rows from {start_date} to {end_date}: {df.shape[0]}")

Rows from 2023-06-01 to 2024-01-03T20:05:43.711: 425024


In [8]:
df = df.sample(frac=0.1)
df.head()

Unnamed: 0,invoice_line_no,date,store,name,city,zipcode,county,category,category_name,vendor_no,vendor_name,itemno,im_desc,state_bottle_cost,state_bottle_retail,sale_bottles
248698,INV-62992000002,2023-10-05T00:00:00.000,2621,HY-VEE FOOD STORE #3 / SIOUX CITY,SIOUX CITY,51105,WOODBURY,1012300,SINGLE MALT SCOTCH,363,PATERNO IMPORTS LTD / TERLATO WINES INTERNATIONAL,994652,HA BUNNAHABHAIN 18YR SINGLE MALT SCOTCH,187.5,281.25,1
375718,INV-65117600035,2023-12-11T00:00:00.000,2614,HY-VEE #3 FOOD & DRUGSTORE / DAVENPORT,DAVENPORT,52807,SCOTT,1011200,STRAIGHT BOURBON WHISKIES,65,JIM BEAM BRANDS,19096,JIM BEAM PET,11.0,16.5,12
134500,INV-60994300050,2023-08-04T00:00:00.000,2543,HY-VEE FOOD STORE #1 (1504) / OTTUMWA,OTTUMWA,52501,WAPELLO,1071000,COCKTAILS/RTD,395,PROXIMO,58876,JOSE CUERVO GOLDEN MARGARITA,6.42,9.63,12
385694,INV-65265200081,2023-12-14T00:00:00.000,2633,HY-VEE #3 / BDI / DES MOINES,DES MOINES,50320,POLK,1031100,AMERICAN VODKAS,777,"BLUE OX SPIRITS, INC.",36874,BLUE OX VODKA,4.76,7.14,120
362321,INV-64910900017,2023-12-05T00:00:00.000,2509,HY-VEE / DRUGTOWN #1 (7020) / CEDAR RAPIDS,CEDAR RAPIDS,52404,LINN,1011200,STRAIGHT BOURBON WHISKIES,421,SAZERAC COMPANY INC,21598,TEN HIGH,9.24,13.86,12


## Data Transformation

### 1.1. Removing Duplicate Rows

In [9]:
num_dup = df.duplicated().sum()
perc_dup = num_dup / df.shape[0] * 100 
print(f'Number of duplicate rows: {num_dup}; Percentage of duplicate rows: {perc_dup:.2f}%')

Number of duplicate rows: 2; Percentage of duplicate rows: 0.00%


In [10]:
# Despite the potential for duplicates arising from reasons other than data entry errors, 
# I'm opting to remove them as they constitute a minimal proportion of the dataset (only 0.02%). 
# This decision is based on the assumption that the impact on data integrity will be negligible while simplifying further analysis.

df = df.drop_duplicates()
num_dup = df.duplicated().sum()
perc_dup = num_dup / df.shape[0] * 100 
print(f'Number of duplicate rows: {num_dup}; Percentage of duplicate rows: {perc_dup:.2f}%')

Number of duplicate rows: 0; Percentage of duplicate rows: 0.00%


### 1.2. Handelling Missing Values

In [11]:
def missing_values_table(df):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input is not a pandas DataFrame")

    num_na = df.isnull().sum()
    perc_na = (num_na / len(df)) * 100

    df_na = pd.DataFrame({
        'Missing Values': num_na,
        '% of Total Values': perc_na
    })

    df_na = df_na[df_na['Missing Values'] != 0]

    return df_na

missing_values_table(df)

Unnamed: 0,Missing Values,% of Total Values


Although the current sample shows no missing values, they may be underrepresented in the full dataset. 

These rows, containing key sales data, are vital for our analysis. 
Hence, we will retain them by imputing missing values with placeholders to preserve critical insights. 

However, records lacking essential data such as price, sales volume, dates, and index will be excluded, 
as their absence renders them unsuitable for meaningful analysis.

In [12]:
df.columns

Index(['invoice_line_no', 'date', 'store', 'name', 'city', 'zipcode', 'county',
       'category', 'category_name', 'vendor_no', 'vendor_name', 'itemno',
       'im_desc', 'state_bottle_cost', 'state_bottle_retail', 'sale_bottles'],
      dtype='object')

In [13]:
dimension_cols = ['store', 'name', 'city', 'zipcode', 'county', 'category', 'category_name', 'vendor_no', 'vendor_name', 'itemno', 'im_desc']

In [14]:
def generate_placeholder_dict(df, cols):
    placeholder_dict = {}
    for col in cols:
        try:
            int_col = df[col].dropna().astype(int)
            mode_digit_count = int_col.astype(str).apply(len).mode()[0]
            placeholder_dict[col] = int('9' * mode_digit_count)
        except ValueError:
            placeholder_dict[col] = 'UNKNOWN'
    return placeholder_dict

dimension_cols = ['store', 'name', 'city', 'zipcode', 'county', 'category', 'category_name', 'vendor_no', 'vendor_name', 'itemno', 'im_desc']
placeholders = generate_placeholder_dict(df, dimension_cols)

placeholders

{'store': 9999,
 'name': 'UNKNOWN',
 'city': 'UNKNOWN',
 'zipcode': 99999,
 'county': 'UNKNOWN',
 'category': 9999999,
 'category_name': 'UNKNOWN',
 'vendor_no': 999,
 'vendor_name': 'UNKNOWN',
 'itemno': 99999,
 'im_desc': 'UNKNOWN'}

In [15]:
# Save dictionary to a JSON file
with open('dicts/placeholders.json', 'w') as file:
    json.dump(placeholders, file)

In [16]:
df = df.fillna(value=placeholders)
missing_values_table(df)

Unnamed: 0,Missing Values,% of Total Values


### 2.3. Checking Data Type

Verify if specific columns, such as indexes, are convertible to integers and if the date column can be formatted as datetime for compatibility with corresponding MySQL server data types

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42500 entries, 248698 to 221050
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   invoice_line_no      42500 non-null  object
 1   date                 42500 non-null  object
 2   store                42500 non-null  object
 3   name                 42500 non-null  object
 4   city                 42500 non-null  object
 5   zipcode              42500 non-null  object
 6   county               42500 non-null  object
 7   category             42500 non-null  object
 8   category_name        42500 non-null  object
 9   vendor_no            42500 non-null  object
 10  vendor_name          42500 non-null  object
 11  itemno               42500 non-null  object
 12  im_desc              42500 non-null  object
 13  state_bottle_cost    42500 non-null  object
 14  state_bottle_retail  42500 non-null  object
 15  sale_bottles         42500 non-null  object
dtypes: 

#### 2.3.1. Datetime Columns

In [18]:
try:
    pd.to_datetime(df['date'])
    print("The 'date' column is in a suitable format for SQL date type.")
except (ValueError, TypeError):
    print("The 'date' column is NOT in a suitable format for SQL date type.")

The 'date' column is in a suitable format for SQL date type.


#### 2.3.2. Integer Columns

In [19]:
num_col_dtype_map = {}

for col in df.columns:
    if df[col].dtype == 'object' and col != 'date':
        converted_col = pd.to_numeric(df[col], errors='coerce')
        if not converted_col.isna().any():
            num_col_dtype_map[col] = 'int'
            print(f"The column '{col}' can be fully converted to integers.")
        else:
            print(f"The column '{col}' cannot be fully converted to integers.")

print(num_col_dtype_map)

The column 'invoice_line_no' cannot be fully converted to integers.
The column 'store' can be fully converted to integers.
The column 'name' cannot be fully converted to integers.
The column 'city' cannot be fully converted to integers.
The column 'zipcode' can be fully converted to integers.
The column 'county' cannot be fully converted to integers.
The column 'category' can be fully converted to integers.
The column 'category_name' cannot be fully converted to integers.
The column 'vendor_no' can be fully converted to integers.
The column 'vendor_name' cannot be fully converted to integers.
The column 'itemno' can be fully converted to integers.
The column 'im_desc' cannot be fully converted to integers.
The column 'state_bottle_cost' can be fully converted to integers.
The column 'state_bottle_retail' can be fully converted to integers.
The column 'sale_bottles' can be fully converted to integers.
{'store': 'int', 'zipcode': 'int', 'category': 'int', 'vendor_no': 'int', 'itemno': 'i

In [20]:
# Save dictionary to a JSON file
with open('dicts/num_col_dtype_map.json', 'w') as file:
    json.dump(num_col_dtype_map, file)

### 2.4. Data Validation

In [21]:
def convert_to_numeric(df, col):
    # Convert column to float first
    df[col] = df[col].astype(float)
    # If the float value is equal to its integer conversion, change to int
    df[col] = df[col].apply(lambda x: int(x) if x == int(x) else x)

columns_to_convert = ['state_bottle_cost', 'state_bottle_retail', 'sale_bottles']
for col in columns_to_convert:
    convert_to_numeric(df, col)

In [22]:
df.describe()

Unnamed: 0,state_bottle_cost,state_bottle_retail,sale_bottles
count,42500.0,42500.0,42500.0
mean,13.670751,20.508021,11.857271
std,12.200608,18.301018,35.332968
min,0.95,1.43,-96.0
25%,7.35,11.03,3.0
50%,10.99,16.49,6.0
75%,16.0,24.0,12.0
max,300.0,450.0,1320.0


In [23]:
# Check the percentage of rows where 'sale_bottles' is negative
len(df[df['sale_bottles']<0])/len(df) * 100

0.16470588235294117

**Note**: While negative sales data typically indicate refunds or returns, our analysis will omit these records to maintain data integrity. 

This decision is based on the focus of our study, which requires consistent and positive sales figures to accurately assess trends and patterns.


In [24]:
df=df[df['sale_bottles']>=0]
df.describe()

Unnamed: 0,state_bottle_cost,state_bottle_retail,sale_bottles
count,42430.0,42430.0,42430.0
mean,13.671032,20.508442,11.896913
std,12.198947,18.298527,35.342474
min,0.95,1.43,1.0
25%,7.35,11.03,3.0
50%,10.99,16.49,6.0
75%,16.0,24.0,12.0
max,300.0,450.0,1320.0
