In [1]:
import pandas as pd
import numpy as np

# Import etl.py module

In [2]:
# add src to module search path 

import os
import sys

# Locate the project's src/ folder, expected one level above the current
# working directory (the folder the notebook is run from).
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# os.sep is "\\" on Windows and "/" elsewhere. The name is kept because later
# cells build their file paths with it.
path_separator = os.sep

# os.path.join avoids a doubled separator when parent_dir is a filesystem root.
module_path = os.path.join(parent_dir, "src")

# Only register the folder once, so re-running this cell does not pile up
# duplicate entries in sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

# import custom module
from etl import DataPreprocessing

# How to use the module

To demonstrate the features of the module, we run a simple ETL pipeline: we extract raw data from a CSV file, remove duplicate rows, and store the transformed dataset in a CSV file.

In [3]:
# Relative location of the raw sign-up data. os.path.join inserts the
# platform's separator between the components, so the result is the same
# string the manual separator concatenation produced.
file_path = os.path.join('..', 'data', 'raw', 'interview_signup.csv')

# class instantiation; the CSV is read afterwards via load_data_from_csv()
my_class = DataPreprocessing(file_path)

In [5]:
# Read the raw CSV into my_class.df.
# Parsing options handed to the loader:
sep, header = ',', 0

# Requested dtype per column. Everything is read as text except the bonus
# amount; postcode stays text as well.
dtype = dict(
    original_product_name=str,
    postcode=str,
    bundesland=str,
    total_bonus='float64',
    order_date=str,
)

# No explicit encoding is passed (an encoding='utf-8' argument was left
# disabled in the original call).
my_class.load_data_from_csv(sep=sep, header=header, dtype=dtype)

# (rows, columns) of the loaded frame
my_class.df.shape

(318345, 5)

In [7]:
# column names
# Display the column labels of the loaded DataFrame.
my_class.df.columns

Index(['original_product_name', 'postcode', 'bundesland', 'total_bonus',
       'order_date'],
      dtype='object')

Remove duplicate rows:

In [8]:
# delete duplicate rows
# The call takes no arguments here; the shape displayed afterwards shows how
# many rows remain in my_class.df.
my_class.remove_duplicate_rows()
my_class.df.shape

(318175, 5)

Check missing data

In [9]:
# Count the missing values in each column; every column of the frame is checked.
columns = my_class.df.columns.tolist()
missing_mask = my_class.missing_values(columns)
missing_mask.sum()

# alt.: my_class.df.info()

original_product_name        0
postcode                     0
bundesland               29521
total_bonus                  0
order_date                   0
dtype: int64

In [10]:
# Logical index that marks every row whose state ('bundesland') is missing.
idx_missing_state = my_class.missing_values('bundesland')

# Show 10 rows that do have a state. The fixed random_state makes the
# displayed sample identical on every run (e.g. after Restart & Run All).
my_class.df.loc[~idx_missing_state, 'bundesland'].sample(10, random_state=42)

77271       Baden-Württemberg
185532      Baden-Württemberg
104805        Rheinland-Pfalz
281760            Brandenburg
227704            Brandenburg
267961                 Bayern
264005                Sachsen
312390          Niedersachsen
114214          Niedersachsen
54115     Nordrhein-Westfalen
Name: bundesland, dtype: object

Inspect the invalid postcode cases:

In [11]:
# Validate the postcode column once and reuse the resulting mask, instead of
# running the same validation twice.
idx_valid_postcodes = my_class.validate_postcode("postcode")
print("Number of valid postcodes: ", idx_valid_postcodes.sum())

# Return a sample of 20 invalid postcodes. The fixed random_state keeps the
# displayed sample reproducible across runs.
my_class.df.loc[~idx_valid_postcodes, 'postcode'].sample(20, random_state=42)

Number of valid postcodes:  226865


151539     9212.0
171711    55120.0
184509    57258.0
90954     66119.0
184141    24644.0
155336    41065.0
210185    98587.0
175711    86978.0
180833    77871.0
161620    23627.0
295899    64653.0
92570      3249.0
204672    45968.0
92986     84034.0
310886    32289.0
20387        9661
100614    44799.0
170118    22081.0
227894    83533.0
220917    39524.0
Name: postcode, dtype: object

In [12]:
# Remove decimals from the postcodes (many invalid entries look like
# '55120.0') and validate again.
# NOTE(review): remove_decimals appears to modify my_class.df in place, so
# re-running this cell will likely report a different number of changed
# entries -- confirm in etl.py.
my_class.remove_decimals('postcode')

# Postcodes that are still invalid after the clean-up
idx_valid_postcodes = my_class.validate_postcode('postcode')
print("Remaining invalid postcodes: ", my_class.df.shape[0] - idx_valid_postcodes.sum())

# Show a sample of 20 remaining invalid cases. The fixed random_state keeps
# the displayed sample reproducible across runs.
my_class.df.loc[~idx_valid_postcodes, 'postcode'].sample(20, random_state=42)

78598 entries were changed.

Remaining invalid postcodes:  16610


283411    7389
172197    6749
99452     6366
24710     3096
69345     9235
16246     1662
129440    7616
208680    1277
159313    4289
127016    7743
217390    6295
153659    4880
131270    8258
48640     4889
166348    7907
175854    4347
224756    6862
77007     9353
266439    6333
200475    7318
Name: postcode, dtype: object

In [13]:
# Find postcodes longer than 5 characters. The string length counts every
# character, not only digits.
postcode_length = my_class.df['postcode'].str.len()
idx_postcode_more_than_5 = postcode_length.gt(5)

my_class.df.loc[idx_postcode_more_than_5, 'postcode']

266922    92696JAVAS
Name: postcode, dtype: object

In [14]:
# Number of postcodes that still fail validation: total rows minus valid rows.
n_rows = my_class.df.shape[0]
n_valid_postcodes = my_class.validate_postcode('postcode').sum()
print(n_rows - n_valid_postcodes)

16610


In [15]:
# Store the state column with the pandas "category" dtype.
state_as_category = my_class.df['bundesland'].astype("category")
my_class.df['bundesland'] = state_as_category

In [16]:
# Display the dtype of every column; 'bundesland' was converted to category
# in the previous step.
my_class.df.dtypes

original_product_name      object
postcode                   object
bundesland               category
total_bonus               float64
order_date                 object
dtype: object

In [17]:
# Print a summary of the frame: index range, non-null count and dtype per
# column, and memory usage.
my_class.df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 318175 entries, 0 to 318344
Data columns (total 5 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   original_product_name  318175 non-null  object  
 1   postcode               318175 non-null  object  
 2   bundesland             288654 non-null  category
 3   total_bonus            318175 non-null  float64 
 4   order_date             318175 non-null  object  
dtypes: category(1), float64(1), object(3)
memory usage: 12.4+ MB


In [18]:
# Number of rows whose 'bundesland' entry passes the module's state validation.
idx_valid_state = my_class.validate_state('bundesland')
idx_valid_state.sum()

288654

In [19]:
# Store the preprocessed data as CSV in the folder data/processed.
# os.path.join inserts the platform's separator between the components.
file_path_processed = os.path.join("..", "data", "processed", "demo_etl_module_processed.csv")

# Make sure the output folder exists (e.g. on a fresh checkout where
# data/processed has not been created yet). exist_ok=True makes this a no-op
# when the folder is already there.
os.makedirs(os.path.dirname(file_path_processed), exist_ok=True)

# index=False is handed to the module's writer -- presumably so the DataFrame
# index is not written to the file; confirm in etl.py.
my_class.save_data_to_csv(file_path_processed, index=False)