In [1]:
import pandas as pd
import numpy as np

# Import etl.py module

In [2]:
# add src to module search path 

import os
import sys

# Resolve the project's `src` directory as <working directory>/../src.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# `os.sep` is already "\\" on Windows and "/" on POSIX systems, so no manual
# `os.name` check is needed. The name is kept because later cells build
# their data paths with it.
path_separator = os.sep

# os.path.join avoids a doubled separator when parent_dir is a filesystem root.
module_path = os.path.join(parent_dir, "src")

# Re-running this cell must not keep appending the same entry to sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

# import custom module
from etl import DataPreprocessing

# How to use the module

To demonstrate the features of the module, we run a simple ETL pipeline: we extract raw data from a CSV file, remove duplicate rows, and store the transformed dataset in a CSV file.

In [3]:
# raw sign-up data, addressed relative to the current working directory
file_path = path_separator.join(('..', 'data', 'raw', 'interview_signup.csv'))

# build the preprocessing object for that file
my_class = DataPreprocessing(file_path)

In [4]:
# Column types used when reading the raw CSV: the bonus is parsed as a float,
# every other column is read as a string (for postcode this presumably keeps
# leading zeros intact -- TODO confirm).
column_dtypes = {
    'original_product_name': str,
    'postcode': str,
    'bundesland': str,
    'total_bonus': 'float64',
    'order_date': str,
}

# Load the raw data; no encoding is passed, so the loader's default applies.
df = my_class.load_data_from_csv(sep=',', header=0, dtype=column_dtypes)
df.shape

(318345, 5)

In [5]:
# Delete duplicate rows. The method returns the resulting frame, which is
# rebound to `df` and therefore replaces the frame loaded in the previous cell.
df = my_class.remove_duplicate_rows()
# Show (rows, columns) so the number of dropped rows can be read off.
df.shape

(318175, 5)

In [13]:
# Check the "postcode" column. The result is a boolean Series with one entry
# per row (presumably True = the postcode passed validation -- confirm in etl.py).
# NOTE(review): the execution counter jumps from 5 to 13 at this cell; re-run
# the notebook on a fresh kernel to confirm it works top to bottom.
my_class.validate_postcode("postcode")

0          True
1         False
2          True
3          True
4          True
          ...  
318340     True
318341    False
318342    False
318343     True
318344     True
Name: postcode, Length: 318175, dtype: bool

In [14]:
# per-column count of missing values, computed over every column of the frame
columns = [*my_class.df.columns]
my_class.missing_values(columns).sum()

original_product_name        0
postcode                     0
bundesland               29521
total_bonus                  0
order_date                   0
dtype: int64

In [15]:
# Flag the rows whose "bundesland" (German federal state) entry is missing.
# The result is a one-column boolean frame, one row per record, where True
# marks a missing value.
my_class.missing_values("bundesland")

Unnamed: 0,bundesland
0,False
1,False
2,False
3,False
4,False
...,...
318340,False
318341,True
318342,False
318343,False


In [16]:
# write the preprocessed data as CSV into data/processed (row index not passed on)
file_path_processed = path_separator.join(
    ("..", "data", "processed", "demo_etl_module_processed.csv")
)
my_class.save_data_to_csv(file_path_processed, index=False)