In [1]:
# !pip install --upgrade pip
# !pip install pandas
# !pip install sodapy

# Data Profiling

Analyzing a subset of data to understand its characteristics, guiding the design of an efficient data pipeline.

## Data Extraction

In [2]:
import pandas as pd
from sodapy import Socrata
import json
from datetime import datetime, timedelta

In [3]:
with open('config.json') as f:
  config = json.load(f)

AppToken = config['app_token']
UserName = config['user_name']
Password = config["password"]

In [4]:
client = Socrata("data.iowa.gov",
                 AppToken,
                 username = UserName,
                 password = Password,
                 timeout=30)

In [5]:
col_selected = 'invoice_line_no, date, store, name, city, zipcode, county, category, category_name, vendor_no, vendor_name, itemno, im_desc, state_bottle_cost, state_bottle_retail, sale_bottles'
col_list = ['invoice_line_no', 'date', 'store', 'name', 'city', 'zipcode', 'county', 'category', 'category_name', 'itemno', 'im_desc', 'state_bottle_cost', 'state_bottle_retail', 'sale_bottles']
len(col_list)

14

In [6]:
start_date = "2023-06-01"
end_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]  

total_rows = []  
limit = 5000  
offset = 0  
more_data = True  

while more_data:
    results = client.get("m3tr-qhgy",
                         select=col_selected, 
                         where=f"LOWER(name) LIKE '%hy-vee%' AND date >= '{start_date}' AND date < '{end_date}'", 
                         limit=limit, 
                         offset=offset)
    total_rows.extend(results)  

    if len(results) < limit:
        more_data = False
    else:
        offset += limit 

df = pd.DataFrame.from_records(total_rows)
print(f"Rows from {start_date} to {end_date}: {df.shape[0]}")

Rows from 2023-06-01 to 2024-01-03T19:24:32.622: 425024


In [7]:
df = df.sample(frac=0.1)
df.head()

Unnamed: 0,invoice_line_no,date,store,name,city,zipcode,county,category,category_name,vendor_no,vendor_name,itemno,im_desc,state_bottle_cost,state_bottle_retail,sale_bottles
266818,INV-63325400057,2023-10-16T00:00:00.000,2552,HY-VEE FOOD STORE #3 (1056) / CEDAR RAPIDS,CEDAR RAPIDS,52404,LINN,1031100,AMERICAN VODKAS,633,TENN SOUTH DISTILLERY / BIG MACHINE DISTILLERY,38135,BIG MACHINE VODKA,19.0,28.5,6
287567,INV-63666300036,2023-10-26T00:00:00.000,2549,HY-VEE FOOD STORE (1271) / INDIANOLA,INDIANOLA,50125,WARREN,1032200,IMPORTED FLAVORED VODKA,65,JIM BEAM BRANDS,35626,PINNACLE WHIPPED,6.5,9.75,12
328051,INV-64355900069,2023-11-16T00:00:00.000,2571,HY-VEE FOOD STORE #2 / WATERLOO,WATERLOO,50703,BLACK HAWK,1031100,AMERICAN VODKAS,434,LUXCO INC,36308,HAWKEYE VODKA,7.5,11.25,6
393142,INV-65386500130,2023-12-18T00:00:00.000,2648,HY-VEE #4 / WDM,WEST DES MOINES,50265,POLK,1081200,CREAM LIQUEURS,482,"SOVEREIGN BRANDS, LLC",68080,BUMBU CREME,18.5,27.75,6
202278,INV-62129100100,2023-09-11T00:00:00.000,2616,HY-VEE FOOD AND DRUG / CLINTON,CLINTON,52732,CLINTON,1082100,IMPORTED CORDIALS & LIQUEURS,368,PARK STREET IMPORTS,65426,LICOR 43,14.5,21.75,12


## Data Transformation

### 1.1. Removing Duplicate Rows

In [8]:
num_dup = df.duplicated().sum()
perc_dup = num_dup / df.shape[0] * 100 
print(f'Number of duplicate rows: {num_dup}; Percentage of duplicate rows: {perc_dup:.2f}%')

Number of duplicate rows: 6; Percentage of duplicate rows: 0.01%


In [9]:
# Despite the potential for duplicates arising from reasons other than data entry errors, 
# I'm opting to remove them as they constitute a minimal proportion of the dataset (only 0.02%). 
# This decision is based on the assumption that the impact on data integrity will be negligible while simplifying further analysis.

df = df.drop_duplicates()
num_dup = df.duplicated().sum()
perc_dup = num_dup / df.shape[0] * 100 
print(f'Number of duplicate rows: {num_dup}; Percentage of duplicate rows: {perc_dup:.2f}%')

Number of duplicate rows: 0; Percentage of duplicate rows: 0.00%


### 1.2. Handelling Missing Values

In [10]:
def missing_values_table(df):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input is not a pandas DataFrame")

    num_na = df.isnull().sum()
    perc_na = (num_na / len(df)) * 100

    df_na = pd.DataFrame({
        'Missing Values': num_na,
        '% of Total Values': perc_na
    })

    df_na = df_na[df_na['Missing Values'] != 0]

    return df_na

missing_values_table(df)

Unnamed: 0,Missing Values,% of Total Values


Although the current sample shows no missing values, they may be underrepresented in the full dataset. 

These rows, containing key sales data, are vital for our analysis. 
Hence, we will retain them by imputing missing values with placeholders to preserve critical insights. 

However, records lacking essential data such as price, sales volume, dates, and index will be excluded, 
as their absence renders them unsuitable for meaningful analysis.

In [11]:
df.columns

Index(['invoice_line_no', 'date', 'store', 'name', 'city', 'zipcode', 'county',
       'category', 'category_name', 'vendor_no', 'vendor_name', 'itemno',
       'im_desc', 'state_bottle_cost', 'state_bottle_retail', 'sale_bottles'],
      dtype='object')

In [12]:
dimension_cols = ['store', 'name', 'city', 'zipcode', 'county', 'category', 'category_name', 'vendor_no', 'vendor_name', 'itemno', 'im_desc']

In [13]:
def generate_placeholder_dict(df, cols):
    placeholder_dict = {}
    for col in cols:
        try:
            int_col = df[col].dropna().astype(int)
            mode_digit_count = int_col.astype(str).apply(len).mode()[0]
            placeholder_dict[col] = int('9' * mode_digit_count)
        except ValueError:
            placeholder_dict[col] = 'UNKNOWN'
    return placeholder_dict

dimension_cols = ['store', 'name', 'city', 'zipcode', 'county', 'category', 'category_name', 'vendor_no', 'vendor_name', 'itemno', 'im_desc']
placeholders = generate_placeholder_dict(df, dimension_cols)

placeholders

{'store': 9999,
 'name': 'UNKNOWN',
 'city': 'UNKNOWN',
 'zipcode': 99999,
 'county': 'UNKNOWN',
 'category': 9999999,
 'category_name': 'UNKNOWN',
 'vendor_no': 999,
 'vendor_name': 'UNKNOWN',
 'itemno': 99999,
 'im_desc': 'UNKNOWN'}

In [14]:
# Save dictionary to a JSON file
with open('dicts/placeholders.json', 'w') as file:
    json.dump(placeholders, file)

In [15]:
df = df.fillna(value=placeholders)
missing_values_table(df)

Unnamed: 0,Missing Values,% of Total Values


### 2.3. Checking Data Type

Verify if specific columns, such as indexes, are convertible to integers and if the date column can be formatted as datetime for compatibility with corresponding MySQL server data types

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42496 entries, 266818 to 308172
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   invoice_line_no      42496 non-null  object
 1   date                 42496 non-null  object
 2   store                42496 non-null  object
 3   name                 42496 non-null  object
 4   city                 42496 non-null  object
 5   zipcode              42496 non-null  object
 6   county               42496 non-null  object
 7   category             42496 non-null  object
 8   category_name        42496 non-null  object
 9   vendor_no            42496 non-null  object
 10  vendor_name          42496 non-null  object
 11  itemno               42496 non-null  object
 12  im_desc              42496 non-null  object
 13  state_bottle_cost    42496 non-null  object
 14  state_bottle_retail  42496 non-null  object
 15  sale_bottles         42496 non-null  object
dtypes: 

#### 2.3.1. Datetime Columns

In [17]:
try:
    pd.to_datetime(df['date'])
    print("The 'date' column is in a suitable format for SQL date type.")
except (ValueError, TypeError):
    print("The 'date' column is NOT in a suitable format for SQL date type.")

The 'date' column is in a suitable format for SQL date type.


#### 2.3.2. Integer Columns

In [23]:
num_col_dtype_map = {}

for col in df.columns:
    if df[col].dtype == 'object' and col != 'date':
        converted_col = pd.to_numeric(df[col], errors='coerce')
        if not converted_col.isna().any():
            num_col_dtype_map[col] = 'int'
            print(f"The column '{col}' can be fully converted to integers.")
        else:
            print(f"The column '{col}' cannot be fully converted to integers.")

print(num_col_dtype_map)

The column 'invoice_line_no' cannot be fully converted to integers.
The column 'store' can be fully converted to integers.
The column 'name' cannot be fully converted to integers.
The column 'city' cannot be fully converted to integers.
The column 'zipcode' can be fully converted to integers.
The column 'county' cannot be fully converted to integers.
The column 'category' can be fully converted to integers.
The column 'category_name' cannot be fully converted to integers.
The column 'vendor_no' can be fully converted to integers.
The column 'vendor_name' cannot be fully converted to integers.
The column 'itemno' can be fully converted to integers.
The column 'im_desc' cannot be fully converted to integers.
{'store': 'int', 'zipcode': 'int', 'category': 'int', 'vendor_no': 'int', 'itemno': 'int'}


In [24]:
# Save dictionary to a JSON file
with open('dicts/num_col_dtype_map.json', 'w') as file:
    json.dump(placeholders, file)

### 2.4. Data Validation

In [19]:
def convert_to_numeric(df, col):
    # Convert column to float first
    df[col] = df[col].astype(float)
    # If the float value is equal to its integer conversion, change to int
    df[col] = df[col].apply(lambda x: int(x) if x == int(x) else x)

columns_to_convert = ['state_bottle_cost', 'state_bottle_retail', 'sale_bottles']
for col in columns_to_convert:
    convert_to_numeric(df, col)

In [20]:
df.describe()

Unnamed: 0,state_bottle_cost,state_bottle_retail,sale_bottles
count,42496.0,42496.0,42496.0
mean,13.735685,20.606807,11.876883
std,18.69077,28.037032,43.054261
min,0.95,1.43,-48.0
25%,7.17,10.76,3.0
50%,10.5,15.75,6.0
75%,16.0,24.0,12.0
max,2298.84,3448.26,5280.0


In [21]:
# Check the percentage of rows where 'sale_bottles' is negative
len(df[df['sale_bottles']<0])/len(df) * 100

0.16001506024096385

**Note**: While negative sales data typically indicate refunds or returns, our analysis will omit these records to maintain data integrity. 

This decision is based on the focus of our study, which requires consistent and positive sales figures to accurately assess trends and patterns.


In [22]:
df=df[df['sale_bottles']>=0]
df.describe()

Unnamed: 0,state_bottle_cost,state_bottle_retail,sale_bottles
count,42428.0,42428.0,42428.0
mean,13.731429,20.600426,11.908362
std,18.701011,28.052396,43.080331
min,0.95,1.43,1.0
25%,7.17,10.76,3.0
50%,10.5,15.75,6.0
75%,16.0,24.0,12.0
max,2298.84,3448.26,5280.0
