# Data Transformation

## 1. Loading Data

In [1]:
import pandas as pd
import os

In [2]:
directory = 'data/'
file_pattern = 'hyvee_'
dataframes = []

# Loop through each file in the directory
for file in os.listdir(directory):
    if file.startswith(file_pattern) and file.endswith('.csv'):
        file_path = os.path.join(directory, file)
        df = pd.read_csv(file_path)
        dataframes.append(df)

In [3]:

# Combine all DataFrames into a single DataFrame
df = pd.concat(dataframes, ignore_index=True)

In [4]:
df.head()

Unnamed: 0,invoice_line_no,date,store,name,city,zipcode,county,category,category_name,vendor_no,vendor_name,itemno,im_desc,state_bottle_cost,state_bottle_retail,sale_bottles,sale_dollars
0,INV-16685400042,2019-01-02T00:00:00.000,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1031100.0,AMERICAN VODKAS,301.0,FIFTH GENERATION INC,38177,TITOS HANDMADE VODKA,12.67,19.01,36,684.36
1,INV-16685400057,2019-01-02T00:00:00.000,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1062200.0,WHITE RUM,434.0,LUXCO INC,46351,HAWKEYE LIGHT RUM,4.34,6.51,12,78.12
2,INV-16679100094,2019-01-02T00:00:00.000,2549,HY-VEE FOOD STORE / INDIANOLA,INDIANOLA,50125.0,WARREN,1081200.0,CREAM LIQUEURS,305.0,MHW LTD,73052,RUMCHATA,7.0,10.5,2,21.0
3,INV-16685400104,2019-01-02T00:00:00.000,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1022100.0,MIXTO TEQUILA,434.0,LUXCO INC,87937,JUAREZ TEQUILA SILVER,7.83,11.75,12,141.0
4,INV-16682800080,2019-01-02T00:00:00.000,2513,HY-VEE FOOD STORE #2 / IOWA CITY,IOWA CITY,52240.0,JOHNSON,1031100.0,AMERICAN VODKAS,297.0,LAIRD & COMPANY,35926,FIVE O'CLOCK VODKA PET,3.37,5.06,12,60.72


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3727001 entries, 0 to 3727000
Data columns (total 17 columns):
 #   Column               Dtype  
---  ------               -----  
 0   invoice_line_no      object 
 1   date                 object 
 2   store                int64  
 3   name                 object 
 4   city                 object 
 5   zipcode              float64
 6   county               object 
 7   category             float64
 8   category_name        object 
 9   vendor_no            float64
 10  vendor_name          object 
 11  itemno               int64  
 12  im_desc              object 
 13  state_bottle_cost    float64
 14  state_bottle_retail  float64
 15  sale_bottles         int64  
 16  sale_dollars         float64
dtypes: float64(6), int64(3), object(8)
memory usage: 483.4+ MB


# 2. Cleaning Data

### 2.1. Revoving Duplicate Rows

In [6]:
num_dup = df.duplicated().sum()
perc_dup = num_dup / df.shape[0] * 100 
print(f'Number of duplicate rows: {num_dup}; Percentage of duplicate rows: {perc_dup:.2f}%')

Number of duplicate rows: 5099; Percentage of duplicate rows: 0.14%


In [7]:
# Despite the potential for duplicates arising from reasons other than data entry errors, 
# I'm opting to remove them as they constitute a minimal proportion of the dataset (only 0.14%). 
# This decision is based on the assumption that the impact on data integrity will be negligible while simplifying further analysis.

df = df.drop_duplicates()
num_dup = df.duplicated().sum()
perc_dup = num_dup / df.shape[0] * 100 
print(f'Number of duplicate rows: {num_dup}; Percentage of duplicate rows: {perc_dup:.2f}%')

Number of duplicate rows: 0; Percentage of duplicate rows: 0.00%


### 2.2. Handelling Missing Values

In [8]:
def missing_values_table(df):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input is not a pandas DataFrame")

    num_na = df.isnull().sum()
    perc_na = (num_na / len(df)) * 100

    df_na = pd.DataFrame({
        'Missing Values': num_na,
        '% of Total Values': perc_na
    })

    df_na = df_na[df_na['Missing Values'] != 0]

    return df_na

missing_values_table(df)

Unnamed: 0,Missing Values,% of Total Values
city,2105,0.056557
zipcode,2105,0.056557
county,2105,0.056557
category,1317,0.035385
category_name,1317,0.035385
vendor_no,1,2.7e-05
vendor_name,1,2.7e-05


All of the missing data pertains to contextual information. 
Despite its relatively small proportion, these rows include crucial sales data, pivotal for our analysis. 
Therefore, we will retain these rows by substituting missing values with placeholders, ensuring that valuable sales insights are preserved.

In [9]:
placeholders = {
    'city'          : 'UNKNOWN',  
    'county'        : 'UNKNOWN',  
    'category_name' : 'UNKNOWN',
    'vendor_name'   : 'UNKNOWN',
    'zipcode'       : 99999,
    'category'      : 9999999,
    'vendor_no'     : 999
}

df = df.fillna(value=placeholders)
missing_values_table(df)

Unnamed: 0,Missing Values,% of Total Values


### 2.3. Converting Data Type

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3721902 entries, 0 to 3727000
Data columns (total 17 columns):
 #   Column               Dtype  
---  ------               -----  
 0   invoice_line_no      object 
 1   date                 object 
 2   store                int64  
 3   name                 object 
 4   city                 object 
 5   zipcode              float64
 6   county               object 
 7   category             float64
 8   category_name        object 
 9   vendor_no            float64
 10  vendor_name          object 
 11  itemno               int64  
 12  im_desc              object 
 13  state_bottle_cost    float64
 14  state_bottle_retail  float64
 15  sale_bottles         int64  
 16  sale_dollars         float64
dtypes: float64(6), int64(3), object(8)
memory usage: 511.1+ MB


#### 2.3.1. Datetime Columns

In [11]:
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,invoice_line_no,date,store,name,city,zipcode,county,category,category_name,vendor_no,vendor_name,itemno,im_desc,state_bottle_cost,state_bottle_retail,sale_bottles,sale_dollars
0,INV-16685400042,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1031100.0,AMERICAN VODKAS,301.0,FIFTH GENERATION INC,38177,TITOS HANDMADE VODKA,12.67,19.01,36,684.36
1,INV-16685400057,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1062200.0,WHITE RUM,434.0,LUXCO INC,46351,HAWKEYE LIGHT RUM,4.34,6.51,12,78.12
2,INV-16679100094,2019-01-02,2549,HY-VEE FOOD STORE / INDIANOLA,INDIANOLA,50125.0,WARREN,1081200.0,CREAM LIQUEURS,305.0,MHW LTD,73052,RUMCHATA,7.0,10.5,2,21.0
3,INV-16685400104,2019-01-02,2524,HY-VEE FOOD STORE / DUBUQUE,DUBUQUE,52001.0,DUBUQUE,1022100.0,MIXTO TEQUILA,434.0,LUXCO INC,87937,JUAREZ TEQUILA SILVER,7.83,11.75,12,141.0
4,INV-16682800080,2019-01-02,2513,HY-VEE FOOD STORE #2 / IOWA CITY,IOWA CITY,52240.0,JOHNSON,1031100.0,AMERICAN VODKAS,297.0,LAIRD & COMPANY,35926,FIVE O'CLOCK VODKA PET,3.37,5.06,12,60.72


#### 2.3.2. Integer Columns

##### 2.3.2.1. Invoice Line Number

In [12]:
starts_with_INV = df['invoice_line_no'].str.startswith('INV-').all()

if starts_with_INV:
    print("All rows start with 'INV-'")
else:
    print("Not all rows start with 'INV-'")

Not all rows start with 'INV-'


In [13]:
df['invoice_line_no'].str[:4].unique()

array(['INV-', 'RINV'], dtype=object)

In [14]:
df['invoice_line_no'].str.extract('(\d+)').duplicated().sum()

0

In [15]:
df['invoice_line_no'].str.extract('(\d+)').iloc[:,1].unique()

IndexError: single positional indexer is out-of-bounds

In [None]:
# Create a new column with 'R' removed
df['R_invoice_line_no'] = df['invoice_line_no'].apply(lambda x: x[1:] if x.startswith('R') else x)

# Check for duplicates in the modified column
df['R_invoice_line_no'].duplicated().sum()

In [None]:
df['R_invoice_line_no'].str[:4].unique()

In [None]:
df['R_invoice_line_no'] = df['R_invoice_line_no'].apply(lambda x: x[4:])
df['R_invoice_line_no'].str[:1].unique()

In [None]:
df['R_invoice_line_no'].duplicated().sum()

In [None]:
df['invoice_line_no'] = df['R_invoice_line_no']
df['invoice_line_no'] = df['invoice_line_no'].astype('int')
df = df.drop(columns='R_invoice_line_no', axis=1)

In [None]:
# Convert some float cols to int. 
# Define a function to check the existance of decimals first before implementation. 
def safe_to_int(df, columns):
    for col in columns:
        # Check if the column contains any non-integer values
        if df[col].apply(lambda x: x % 1).any():
            print(f"Column '{col}' contains decimal values. Conversion to int skipped.")
        else:
            # Convert to int
            df[col] = df[col].astype(int)
            print(f"Column '{col}' successfully converted to int.")

In [None]:
cols_to_int = ['zipcode', 'category','vendor_no' ]
safe_to_int(df, cols_to_int)