In [1]:
import pandas as pd

# Simple ETL Pipeline

## Extract Data

In [2]:
# Extract: load the raw countries table and display it for a visual check.
countries = pd.read_csv(filepath_or_buffer='../data/input/Countries.csv')
countries

Unnamed: 0,Countries,Population
0,Austria,9006398
1,Bolivia,11673021
2,China,1439323776
3,Denmark,5792202
4,Egypt,102334404
5,Ethiopia,114963588
6,Finland,5540720
7,France,65273511
8,Germany,83783942
9,Greece,10423054


In [3]:
# Extract: load the raw countries metadata table and display it for a visual check.
countries_metadata = pd.read_csv(filepath_or_buffer='../data/input/Countries_metadata.csv')
countries_metadata

Unnamed: 0,country_names,Land_Area,Region
0,Greece,128900,Europe
1,China,9388211,Asia
2,Denmark,42430,Europe
3,Ethiopia,1000000,Africa
4,Egypt,995450,Africa
5,Bolivia,1083300,South America
6,Austria,82409,Europe
7,France,547557,Europe
8,Germany,348560,Europe
9,Finland,303890,Europe


## Transform Data

In [5]:
# Express the population in thousands.
# Scaling and renaming happen in one expression so the raw 'Population'
# column is never modified in place: running this cell a second time fails
# loudly with a KeyError instead of silently dividing by 1000 again.
countries = (
    countries
    .assign(Population=lambda frame: frame['Population'] / 1000)
    .rename(columns={'Population': 'Population_per_k'})
)
countries

Unnamed: 0,Countries,Population_per_k
0,Austria,9.006398
1,Bolivia,11.673021
2,China,1439.323776
3,Denmark,5.792202
4,Egypt,102.334404
5,Ethiopia,114.963588
6,Finland,5.54072
7,France,65.273511
8,Germany,83.783942
9,Greece,10.423054


In [6]:
# Express the land area in thousands.
# Scaling and renaming happen in one expression so the raw 'Land_Area'
# column is never modified in place: running this cell a second time fails
# loudly with a KeyError instead of silently dividing by 1000 again.
countries_metadata = (
    countries_metadata
    .assign(Land_Area=lambda frame: frame['Land_Area'] / 1000)
    .rename(columns={'Land_Area': 'Land_Area_per_k'})
)
countries_metadata

Unnamed: 0,country_names,Land_Area_per_k,Region
0,Greece,128.9,Europe
1,China,9388.211,Asia
2,Denmark,42.43,Europe
3,Ethiopia,1000.0,Africa
4,Egypt,995.45,Africa
5,Bolivia,1083.3,South America
6,Austria,82.409,Europe
7,France,547.557,Europe
8,Germany,348.56,Europe
9,Finland,303.89,Europe


## Load Data

In [None]:
# Load: persist both transformed tables.
# index=False keeps the positional row index out of the files; with the
# default, to_csv writes it as an extra unnamed leading column, which
# pandas reads back as 'Unnamed: 0'.
countries.to_csv('../data/output/countries.csv', index=False)
countries_metadata.to_csv('../data/output/countries_metadata.csv', index=False)

# Create `etl_pipeline.py`

In [None]:

import pandas as pd
import numpy as np
import os


class DataPreprocessor:
    """Small ETL helper for the countries and countries-metadata CSV tables.

    Reads ``Countries.csv`` and ``Countries_metadata.csv`` from
    ``<path_folder>/input/``, divides the ``Population`` and ``Land_Area``
    columns by 1000 (renaming them to ``Population_per_k`` and
    ``Land_Area_per_k``), and optionally writes the results to
    ``<path_folder>/output/``.

    The loaded tables are available as ``self.countries`` and
    ``self.countries_metadata`` once ``read_data_from_raw_input`` or
    ``read_preprocessed_tables`` has been called.
    """

    def __init__(self, path_folder="path/to/root directory"):
        """Derive all input/output paths and make sure the output folder exists.

        Args:
            path_folder: Root directory that contains the ``input`` folder.
                The default looks like a placeholder; pass a real directory.
        """
        self.path_folder = path_folder

        # Path to input
        self.path_input_folder = "{}/input/".format(path_folder)
        self.path_input_countries = self.path_input_folder + 'Countries.csv'
        self.path_input_countries_metadata = self.path_input_folder + 'Countries_metadata.csv'

        # Path on which output tables are saved
        self.path_output_folder = "{}/output/".format(path_folder)
        self.path_output_countries = self.path_output_folder + 'Countries.csv'
        self.path_output_countries_metadata = self.path_output_folder + 'Countries_metadata.csv'

        # dtypes applied when reading the tables (raw and preprocessed alike)
        self.read_dtypes_countries = {'Countries': 'category'}
        self.read_dtypes_countries_metadata = {'country_names': 'category'}

        # Create the output folder if it does not exist yet. exist_ok=True
        # avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(self.path_output_folder, exist_ok=True)

    @staticmethod
    def _scale_to_thousands(frame, source_column, target_column):
        """Return a copy of ``frame`` with ``source_column`` / 1000 stored as ``target_column``.

        The input frame is not modified and the column keeps its position.
        If the frame was already converted (``target_column`` present and
        ``source_column`` absent) it is returned unchanged, so preprocessing
        can safely run more than once.

        Raises:
            KeyError: If neither ``source_column`` nor ``target_column`` exists.
        """
        columns = frame.columns
        if source_column not in columns and target_column in columns:
            return frame

        scaled_values = frame[source_column] / 1000
        return (
            frame
            .assign(**{source_column: scaled_values})
            .rename(columns={source_column: target_column})
        )

    def read_data_from_raw_input(self):
        """Load both raw input CSV files into ``self.countries`` and ``self.countries_metadata``."""
        print("Start:\tRead in countries Dataset")
        self.countries = pd.read_csv(self.path_input_countries, dtype=self.read_dtypes_countries)
        print("Finish:\tRead in countries Dataset")

        print("Start:\tRead in countries_metadata Dataset")
        self.countries_metadata = pd.read_csv(
            self.path_input_countries_metadata, dtype=self.read_dtypes_countries_metadata
        )
        print("Finish:\tRead in countries_metadata Dataset")

    def preprocess_data(self, save_preprocess_countries=True, save_preprocess_countries_metadata=True):
        """Transform both loaded tables and optionally write them to the output folder.

        Args:
            save_preprocess_countries: Write the countries table to
                ``self.path_output_countries`` when true.
            save_preprocess_countries_metadata: Write the metadata table to
                ``self.path_output_countries_metadata`` when true.

        Returns:
            Tuple ``(countries, countries_metadata)`` of the transformed frames.

        Raises:
            AttributeError: If no table has been loaded yet (neither
                ``read_data_from_raw_input`` nor ``read_preprocessed_tables``
                was called).
        """
        print("Start:\tPreprocessing countries Dataset")
        self.preprocess_countries()
        print("Finish:\tPreprocessing countries Dataset")

        print("Start:\tPreprocessing countries_metadata Dataset")
        self.preprocess_countries_metadata()
        print("Finish:\tPreprocessing countries_metadata Dataset")

        if save_preprocess_countries:
            print("Start:\tSave countries Dataset to disc")
            self.countries.to_csv(self.path_output_countries, index=False)
            print("Finish:\tSave countries Dataset to disc")

        if save_preprocess_countries_metadata:
            print("Start:\tSave countries_metadata Dataset to disc")
            self.countries_metadata.to_csv(self.path_output_countries_metadata, index=False)
            print("Finish:\tSave countries_metadata Dataset to disc")

        return self.countries, self.countries_metadata

    def preprocess_countries(self):
        """Replace ``Population`` with ``Population_per_k`` (value / 1000) in ``self.countries``."""
        self.countries = self._scale_to_thousands(
            self.countries, 'Population', 'Population_per_k'
        )

    def preprocess_countries_metadata(self):
        """Replace ``Land_Area`` with ``Land_Area_per_k`` (value / 1000) in ``self.countries_metadata``."""
        self.countries_metadata = self._scale_to_thousands(
            self.countries_metadata, 'Land_Area', 'Land_Area_per_k'
        )

    def read_preprocessed_tables(self):
        """Load the previously saved output tables and return them.

        Returns:
            Tuple ``(countries, countries_metadata)`` read from the output folder.
        """
        print("Start:\tRead in modified countries Dataset")
        self.countries = pd.read_csv(self.path_output_countries, dtype=self.read_dtypes_countries)
        print("Finish:\tRead in modified countries Dataset")

        print("Start:\tRead in modified countries_metadata Dataset")
        self.countries_metadata = pd.read_csv(
            self.path_output_countries_metadata, dtype=self.read_dtypes_countries_metadata
        )
        print("Finish:\tRead in modified countries_metadata Dataset")

        return self.countries, self.countries_metadata


def main():
    """Run the ETL once: read the raw tables, transform them and save the results.

    Uses a ``DataPreprocessor`` built with its default arguments and prints a
    confirmation message when every step has finished.
    """
    etl = DataPreprocessor()

    # Extract
    etl.read_data_from_raw_input()

    # Transform + load (both tables are saved by default)
    etl.preprocess_data()

    print('ETL has been successfully completed !!')

#if __name__ == '__main__':
#    main()