Introduction

---
This notebook is part of a data pipeline project that follows the medallion architecture. Here we create an S3 bucket with the AWS boto3 library and upload the project datasets to it as Parquet files using PySpark. The next steps of the pipeline can be found in the project repository.



In [35]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import DataFrame
from functools import reduce
import findspark
import boto3
import os
findspark.init()

In [36]:
# Export the AWS settings as process environment variables in one step.
# The two key values below are literal placeholders (each is just its own
# variable name); substitute real credentials locally and never commit them.
os.environ.update({
    "AWS_ACCESS_KEY_ID": "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY": "AWS_SECRET_ACCESS_KEY",
    "AWS_DEFAULT_REGION": "us-east-1",
})

In [37]:
# Pull the AWS settings back out of the environment; each value is None
# when the corresponding variable is not set.
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
region_name = os.getenv("AWS_DEFAULT_REGION")

In [38]:
# Spark settings needed to talk to S3 through the s3a:// filesystem.
# They are applied to the builder in the order listed here.
s3a_options = {
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.access.key": aws_access_key_id,
    "spark.hadoop.fs.s3a.secret.key": aws_secret_access_key,
    "spark.hadoop.fs.s3a.endpoint": "s3.amazonaws.com",
    # NOTE(review): the hadoop-aws version presumably has to match the Hadoop
    # build bundled with the installed PySpark — confirm before upgrading.
    "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.4",
}

session_builder = SparkSession.builder.appName("ColabS3Upload")
for option_key, option_value in s3a_options.items():
    session_builder = session_builder.config(option_key, option_value)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [39]:
def read_dataset(path: str, comma: bool = True) -> DataFrame:
  """Load a delimited text file with a header row into a Spark DataFrame.

  Args:
    path: Location of the CSV file to read.
    comma: When True (the default) fields are split on ','; when False
      they are split on ';'.

  Returns:
    The DataFrame produced by the module-level `spark` session.
  """
  delimiter = ';' if not comma else ','
  return spark.read.csv(path, sep=delimiter, header=True)

In [40]:
def create_bucket(bucket_name:str):
  """Create the S3 bucket `bucket_name` if this account does not already own it.

  A boto3 S3 client is built from the module-level `aws_access_key_id`,
  `aws_secret_access_key` and `region_name` values and used to issue the
  create request.

  Args:
    bucket_name: Name of the bucket to create.

  Returns:
    None.

  Raises:
    botocore.exceptions.ClientError: For any S3 failure other than the
      bucket already being owned by the caller (e.g. the name is taken by
      another account, or the credentials are invalid).
  """
  s3 = boto3.client(
      's3',
      aws_access_key_id= aws_access_key_id,
      aws_secret_access_key= aws_secret_access_key,
      region_name=region_name
  )

  request = {"Bucket": bucket_name}
  # S3 requires an explicit LocationConstraint for every region except
  # us-east-1, where the constraint must be omitted instead.
  if region_name and region_name != "us-east-1":
    request["CreateBucketConfiguration"] = {"LocationConstraint": region_name}

  try:
    s3.create_bucket(**request)
  except s3.exceptions.BucketAlreadyOwnedByYou:
    # The bucket is already ours (e.g. the notebook was re-run); nothing to do.
    pass

In [None]:
# Name of the S3 bucket that the s3a:// write paths in this notebook point at.
bucket_name = 'project-pnad-covid-bronze-layer'
# Create the bucket up front so it exists before any dataset is written to it.
create_bucket(bucket_name)

In [43]:
# Local paths of the monthly PNAD COVID CSV files (file names carry MMYYYY)
# and of the data dictionary.
path_2020_05 = '/content/PNAD_COVID_052020.csv'
path_2020_06 = '/content/PNAD_COVID_062020.csv'
path_2020_07 = '/content/PNAD_COVID_072020.csv'
# Fixed: this previously pointed at the July file (072020), so the August
# DataFrame was a duplicate of July.
path_2020_08 = '/content/PNAD_COVID_082020.csv'
path_2020_09 = '/content/PNAD_COVID_092020.csv'
path_2020_10 = '/content/PNAD_COVID_102020.csv'
path_2020_11 = '/content/PNAD_COVID_112020.csv'
path_dict = '/content/dicionario_PNAD_COVID_112020_20220621.csv'

# Reading project datasets
# The monthly files are read with the default ',' separator.
df_2020_05 = read_dataset(path_2020_05)
df_2020_06 = read_dataset(path_2020_06)
df_2020_07 = read_dataset(path_2020_07)
df_2020_08 = read_dataset(path_2020_08)
df_2020_09 = read_dataset(path_2020_09)
df_2020_10 = read_dataset(path_2020_10)
df_2020_11 = read_dataset(path_2020_11)
# The dictionary file is read with ';' as the separator (comma=False).
df_dict = read_dataset(path = path_dict, comma = False)

In [44]:
# Each DataFrame paired with the S3 prefix it is written to, in write order.
parquet_targets = [
    (df_2020_05, f"s3a://{bucket_name}/PNAD_COVID_052020_parquet/"),
    (df_2020_06, f"s3a://{bucket_name}/PNAD_COVID_062020_parquet/"),
    (df_2020_07, f"s3a://{bucket_name}/PNAD_COVID_072020_parquet/"),
    (df_2020_08, f"s3a://{bucket_name}/PNAD_COVID_082020_parquet/"),
    (df_2020_09, f"s3a://{bucket_name}/PNAD_COVID_092020_parquet/"),
    (df_2020_10, f"s3a://{bucket_name}/PNAD_COVID_102020_parquet/"),
    (df_2020_11, f"s3a://{bucket_name}/PNAD_COVID_112020_parquet/"),
    (df_dict, f"s3a://{bucket_name}/dicionario_PNAD_COVID_parquet/"),
]

# "overwrite" replaces whatever already exists under the target prefix,
# so this cell can be re-run.
for frame, target in parquet_targets:
    frame.write.mode("overwrite").parquet(target)