In [49]:
# !pip install --upgrade pip
# !pip install pandas
# !pip install sodapy

# Data Profiling

Analyzing a subset of data to understand its characteristics, guiding the design of an efficient data pipeline.

## Data Extraction

In [50]:
import pandas as pd
from sodapy import Socrata
import json
from datetime import datetime, timedelta

In [51]:
with open('config/socrata_config.json') as f:
  config = json.load(f)

AppToken = config['app_token']
UserName = config['user_name']
Password = config["password"]

In [52]:
client = Socrata("data.iowa.gov",
                 AppToken,
                 username = UserName,
                 password = Password,
                 timeout=30)

In [53]:
col_selected = 'invoice_line_no, date, store, name, city, zipcode, county, category, category_name, vendor_no, vendor_name, itemno, im_desc, state_bottle_cost, state_bottle_retail, sale_bottles'
col_list = ['invoice_line_no', 'date', 'store', 'name', 'city', 'zipcode', 'county', 'category', 'category_name', 'itemno', 'im_desc', 'state_bottle_cost', 'state_bottle_retail', 'sale_bottles']
len(col_list)

14

In [54]:
start_date = "2023-06-01"
end_date = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]  

total_rows = []  
limit = 5000  
offset = 0  
more_data = True  

while more_data:
    results = client.get("m3tr-qhgy",
                         select=col_selected, 
                         where=f"LOWER(name) LIKE '%hy-vee%' AND date >= '{start_date}' AND date < '{end_date}'", 
                         limit=limit, 
                         offset=offset)
    total_rows.extend(results)  

    if len(results) < limit:
        more_data = False
    else:
        offset += limit 

df = pd.DataFrame.from_records(total_rows)
print(f"Rows from {start_date} to {end_date}: {df.shape[0]}")

Rows from 2023-06-01 to 2024-01-03T21:03:10.774: 425024


In [55]:
# df = df.sample(frac=0.1)
df.head()

Unnamed: 0,invoice_line_no,date,store,name,city,zipcode,county,category,category_name,vendor_no,vendor_name,itemno,im_desc,state_bottle_cost,state_bottle_retail,sale_bottles
0,INV-58975000103,2023-06-01T00:00:00.000,2522,HY-VEE WINE AND SPIRITS (1628) / SPIRIT LAKE,SPIRIT LAKE,51360,DICKINSON,1031200,AMERICAN FLAVORED VODKA,115,CONSTELLATION BRANDS INC,36576,SVEDKA BLUE RASPBERRY,8.0,12.0,2
1,INV-58957400083,2023-06-01T00:00:00.000,2545,HY-VEE DRUGSTORE (7042) / IOWA CITY,IOWA CITY,52245,JOHNSON,1012200,SCOTCH WHISKIES,266,EDRINGTON GROUP USA LLC,4936,FAMOUS GROUSE SCOTCH,12.44,18.66,12
2,INV-58958300072,2023-06-01T00:00:00.000,2517,HY-VEE FOOD STORE #1 (1449) / NEWTON,NEWTON,50208,JASPER,1082100,IMPORTED CORDIALS & LIQUEURS,619,CAMPARI AMERICA,64996,FRANGELICO LIQUEUR,16.5,24.75,2
3,INV-58973600034,2023-06-01T00:00:00.000,2565,HY-VEE FOOD STORE (1636) / SPENCER,SPENCER,51301,CLAY,1011300,SINGLE BARREL BOURBON WHISKIES,594,"AMERICAN HERITAGE DISTILLERS, LLC / CENTURY FA...",26269,CENTURY FARMS PRAIRIE STATES BOURBON,18.6,27.9,6
4,INV-58957700022,2023-06-01T00:00:00.000,2513,HY-VEE FOOD STORE #2 (1285) / IOWA CITY,IOWA CITY,52240,JOHNSON,1041100,AMERICAN DRY GINS,434,LUXCO INC,31658,PARAMOUNT GIN,7.75,11.63,6


## Data Transformation

### 1.1. Removing Duplicate Rows

In [56]:
num_dup = df.duplicated().sum()
perc_dup = num_dup / df.shape[0] * 100 
print(f'Number of duplicate rows: {num_dup}; Percentage of duplicate rows: {perc_dup:.2f}%')

Number of duplicate rows: 1245; Percentage of duplicate rows: 0.29%


In [57]:
# Despite the potential for duplicates arising from reasons other than data entry errors, 
# I'm opting to remove them as they constitute a minimal proportion of the dataset (only 0.02%). 
# This decision is based on the assumption that the impact on data integrity will be negligible while simplifying further analysis.

df = df.drop_duplicates()
num_dup = df.duplicated().sum()
perc_dup = num_dup / df.shape[0] * 100 
print(f'Number of duplicate rows: {num_dup}; Percentage of duplicate rows: {perc_dup:.2f}%')

Number of duplicate rows: 0; Percentage of duplicate rows: 0.00%


In [58]:
# Check if invoice_line_no unique
number_of_duplicates = df.duplicated(subset='invoice_line_no', keep=False).sum()
print(f"Number of duplicates in 'invoice_line_no': {number_of_duplicates}")


Number of duplicates in 'invoice_line_no': 0


### 1.2. Handelling Missing Values

In [59]:
def missing_values_table(df):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input is not a pandas DataFrame")

    num_na = df.isnull().sum()
    perc_na = (num_na / len(df)) * 100

    df_na = pd.DataFrame({
        'Missing Values': num_na,
        '% of Total Values': perc_na
    })

    df_na = df_na[df_na['Missing Values'] != 0]

    return df_na

missing_values_table(df)

Unnamed: 0,Missing Values,% of Total Values


Although the current sample shows no missing values, they may be underrepresented in the full dataset. 

These rows, containing key sales data, are vital for our analysis. 
Hence, we will retain them by imputing missing values with placeholders to preserve critical insights. 

However, records lacking essential data such as price, sales volume, dates, and index will be excluded, 
as their absence renders them unsuitable for meaningful analysis.

In [60]:
df.columns

Index(['invoice_line_no', 'date', 'store', 'name', 'city', 'zipcode', 'county',
       'category', 'category_name', 'vendor_no', 'vendor_name', 'itemno',
       'im_desc', 'state_bottle_cost', 'state_bottle_retail', 'sale_bottles'],
      dtype='object')

In [61]:
dimension_cols = ['store', 'name', 'city', 'zipcode', 'county', 'category', 'category_name', 'vendor_no', 'vendor_name', 'itemno', 'im_desc']

In [62]:
def generate_placeholder_dict(df, cols):
    placeholder_dict = {}
    for col in cols:
        try:
            int_col = df[col].dropna().astype(int)
            mode_digit_count = int_col.astype(str).apply(len).mode()[0]
            placeholder_dict[col] = int('9' * mode_digit_count)
        except ValueError:
            placeholder_dict[col] = 'UNKNOWN'
    return placeholder_dict

dimension_cols = ['store', 'name', 'city', 'zipcode', 'county', 'category', 'category_name', 'vendor_no', 'vendor_name', 'itemno', 'im_desc']
placeholders = generate_placeholder_dict(df, dimension_cols)

placeholders

{'store': 9999,
 'name': 'UNKNOWN',
 'city': 'UNKNOWN',
 'zipcode': 99999,
 'county': 'UNKNOWN',
 'category': 9999999,
 'category_name': 'UNKNOWN',
 'vendor_no': 999,
 'vendor_name': 'UNKNOWN',
 'itemno': 99999,
 'im_desc': 'UNKNOWN'}

In [63]:
# Save dictionary to a JSON file
with open('dicts/placeholders.json', 'w') as file:
    json.dump(placeholders, file)

In [64]:
df = df.fillna(value=placeholders)
missing_values_table(df)

Unnamed: 0,Missing Values,% of Total Values


### 2.3. Checking Data Type

Verify if specific columns, such as indexes, are convertible to integers and if the date column can be formatted as datetime for compatibility with corresponding MySQL server data types

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 423779 entries, 0 to 425023
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   invoice_line_no      423779 non-null  object
 1   date                 423779 non-null  object
 2   store                423779 non-null  object
 3   name                 423779 non-null  object
 4   city                 423779 non-null  object
 5   zipcode              423779 non-null  object
 6   county               423779 non-null  object
 7   category             423779 non-null  object
 8   category_name        423779 non-null  object
 9   vendor_no            423779 non-null  object
 10  vendor_name          423779 non-null  object
 11  itemno               423779 non-null  object
 12  im_desc              423779 non-null  object
 13  state_bottle_cost    423779 non-null  object
 14  state_bottle_retail  423779 non-null  object
 15  sale_bottles         423779 non-null  o

#### 2.3.1. Datetime Columns

In [66]:
try:
    pd.to_datetime(df['date'])
    print("The 'date' column is in a suitable format for SQL date type.")
except (ValueError, TypeError):
    print("The 'date' column is NOT in a suitable format for SQL date type.")

The 'date' column is in a suitable format for SQL date type.


#### 2.3.2. Integer Columns

In [67]:
num_col_dtype_map = {}

for col in df.columns:
    if df[col].dtype == 'object' and col != 'date':
        converted_col = pd.to_numeric(df[col], errors='coerce')
        if not converted_col.isna().any():
            num_col_dtype_map[col] = 'int'
            print(f"The column '{col}' can be fully converted to integers.")
        else:
            print(f"The column '{col}' cannot be fully converted to integers.")

print(num_col_dtype_map)

The column 'invoice_line_no' cannot be fully converted to integers.
The column 'store' can be fully converted to integers.
The column 'name' cannot be fully converted to integers.
The column 'city' cannot be fully converted to integers.
The column 'zipcode' can be fully converted to integers.
The column 'county' cannot be fully converted to integers.
The column 'category' can be fully converted to integers.
The column 'category_name' cannot be fully converted to integers.
The column 'vendor_no' can be fully converted to integers.
The column 'vendor_name' cannot be fully converted to integers.
The column 'itemno' can be fully converted to integers.
The column 'im_desc' cannot be fully converted to integers.
The column 'state_bottle_cost' can be fully converted to integers.
The column 'state_bottle_retail' can be fully converted to integers.
The column 'sale_bottles' can be fully converted to integers.
{'store': 'int', 'zipcode': 'int', 'category': 'int', 'vendor_no': 'int', 'itemno': 'i

In [68]:
# Save dictionary to a JSON file
with open('dicts/num_col_dtype_map.json', 'w') as file:
    json.dump(num_col_dtype_map, file)

### 2.4. Data Validation

In [69]:
def convert_to_numeric(df, col):
    # Convert column to float first
    df[col] = df[col].astype(float)
    # If the float value is equal to its integer conversion, change to int
    df[col] = df[col].apply(lambda x: int(x) if x == int(x) else x)

columns_to_convert = ['state_bottle_cost', 'state_bottle_retail', 'sale_bottles']
for col in columns_to_convert:
    convert_to_numeric(df, col)

In [70]:
df.describe()

Unnamed: 0,state_bottle_cost,state_bottle_retail,sale_bottles
count,423779.0,423779.0,423779.0
mean,13.743257,20.617133,11.821501
std,13.889716,20.834971,48.022667
min,0.95,1.43,-648.0
25%,7.35,11.03,3.0
50%,10.92,16.38,6.0
75%,16.17,24.26,12.0
max,2298.84,3448.26,13200.0


In [71]:
# Check the percentage of rows where 'sale_bottles' is negative
len(df[df['sale_bottles']<0])/len(df) * 100

0.1531458614041753

**Note**: While negative sales data typically indicate refunds or returns, our analysis will omit these records to maintain data integrity. 

This decision is based on the focus of our study, which requires consistent and positive sales figures to accurately assess trends and patterns.


In [72]:
df=df[df['sale_bottles']>=0]
df.describe()

Unnamed: 0,state_bottle_cost,state_bottle_retail,sale_bottles
count,423130.0,423130.0,423130.0
mean,13.740855,20.61353,11.857082
std,13.889348,20.834419,48.037198
min,0.95,1.43,1.0
25%,7.33,11.0,3.0
50%,10.92,16.38,6.0
75%,16.05,24.08,12.0
max,2298.84,3448.26,13200.0
