In [80]:
import datetime
import io
import os
from typing import Tuple

import boto3
import botocore
import fastavro
import pandas as pd
import pymysql
from sqlalchemy import create_engine
def connect_aws(aws_access_key_id: str, aws_secret_access_key: str, aws_region_name: str):
    """Create a boto3 S3 resource from explicit credentials.

    Args:
        aws_access_key_id: Access key passed straight to ``boto3.resource``.
        aws_secret_access_key: Secret key passed straight to ``boto3.resource``.
        aws_region_name: Region name passed as ``region_name``.

    Returns:
        The S3 resource on success. When any exception is raised a message
        is printed and ``None`` is returned instead of propagating the error.

    NOTE(review): the success message is printed as soon as the resource
    object is built; presumably no request has been sent to AWS at that
    point, so it may not prove the credentials work — confirm.
    """
    resource_kwargs = {
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
        "region_name": aws_region_name,
    }
    try:
        s3_resource = boto3.resource('s3', **resource_kwargs)
        print(f"Connected to AWS S3 in {aws_region_name} region")
        return s3_resource
    except botocore.exceptions.NoCredentialsError:
        print("AWS credentials not found or invalid.")
    except botocore.exceptions.ClientError as err:
        print(f"Failed to connect to AWS S3: {err}")
    except Exception as err:
        print(f"An error occurred while connecting to AWS S3: {err}")
    # Every failure path ends here: the caller receives None.
    return None
        
def connect_bd(user, password, host, port, db):
    """Open a PyMySQL connection that returns rows as dictionaries.

    Args:
        user: Database user name.
        password: Password for ``user``.
        host: Database server host name.
        port: TCP port of the database server.
        db: Name of the database (schema) to select.

    Returns:
        The open connection, or ``None`` when the attempt fails (a message
        describing the failure is printed instead of raising).
    """
    try:
        conn = pymysql.connect(
            user=user,
            password=password,
            host=host,
            port=port,
            db=db,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
        )
    except pymysql.MySQLError as e:
        # Driver/server errors (bad credentials, unreachable host, unknown
        # schema, ...). The previous botocore handlers could never match here.
        print(f"Failed to connect to bd: {e}")
    except Exception as e:
        # Anything else, e.g. wrong argument types.
        print(f"An error occurred while connecting to bd: {e}")
    else:
        print(f"Connected to db  {db} ")
        return conn
    return None

def read_db(conn, name_table, db):
    """Read every row of ``db.name_table`` and return it as a list of dicts.

    The connection is always closed before returning, even when the query
    fails, so the caller must open a new connection for the next table.

    Args:
        conn: Open DB-API style connection whose ``cursor()`` works as a
            context manager.
        name_table: Table to read.
        db: Schema that contains the table.

    Returns:
        One ``{column_name: value}`` dict per row; an empty list when the
        table has no rows. Values pass through a pandas DataFrame, so
        pandas' type inference applies (missing numeric values presumably
        come back as NaN — verify against the Avro schema in use).
    """
    # Identifiers cannot be bound as query parameters, so they are
    # interpolated: only pass trusted schema/table names.
    sql = f"select * from {db}.{name_table}"
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
            rows = cursor.fetchall()
            # Read the column names while the cursor is still open.
            column_names = [col[0] for col in (cursor.description or ())]
    finally:
        conn.close()

    # Passing ``columns=`` keeps the header even for an empty result set;
    # assigning ``df.columns`` afterwards fails when no rows were fetched.
    df = pd.DataFrame(list(rows), columns=column_names)
    return df.to_dict(orient="records")

def creaate_avro_schema(name_table):
    """Return the Avro record schema for one of the supported tables.

    Supported tables are ``jobs``, ``departments`` and ``hired_employees``.
    The table name is printed before the lookup. A new schema dict is built
    on every call, so callers may modify the result freely.

    Args:
        name_table: Name of the table whose schema is wanted.

    Returns:
        A dict with the keys ``namespace``, ``type``, ``name`` and ``fields``.

    Raises:
        ValueError: If ``name_table`` is not a supported table.
    """
    print(name_table)

    # table name -> (Avro record name, field definitions)
    schemas = {
        "jobs": ('Job', [
            {'name': 'id', 'type': 'int'},
            {'name': 'job', 'type': 'string'},
        ]),
        "departments": ('departments', [
            {'name': 'id', 'type': 'int'},
            {'name': 'department', 'type': 'string'},
        ]),
        "hired_employees": ('hired_employees', [
            {'name': 'id', 'type': 'int'},
            {'name': 'name', 'type': ['string', 'null']},
            {'name': 'datetime', 'type': ['string', 'null']},
            {'name': 'department_id', 'type': ['int', 'null', 'float']},
            {'name': 'job_id', 'type': ['int', 'null', 'float']},
        ]),
    }

    try:
        record_name, fields = schemas[name_table]
    except (KeyError, TypeError):
        # Previously an unknown name fell through every branch and failed
        # with UnboundLocalError at the return statement.
        supported = ", ".join(sorted(schemas))
        raise ValueError(
            f"No Avro schema defined for table {name_table!r}; supported tables: {supported}"
        ) from None

    return {
        'namespace': 'example.avro',
        'type': 'record',
        'name': record_name,
        'fields': fields,
    }


def get_today_date():
    """Return today's date as a ``DD-MM-YYYY`` string."""
    return format(datetime.date.today(), '%d-%m-%Y')

def avro_to_s3(name_table, data, s3, bucket_name=None):
    """Serialize ``data`` to Avro in memory and upload it to S3.

    The object is stored under ``backup/<today>/<name_table>_table.avro``,
    where ``<today>`` is the value returned by ``get_today_date()``.

    Args:
        name_table: Table name; selects the Avro schema and names the object.
        data: Records to write, as an iterable of dicts matching the schema.
        s3: S3 resource object exposing ``Object(bucket, key).put(Body=...)``.
        bucket_name: Destination bucket. When omitted, the module-level
            ``s3_bucket_name`` is used, so existing three-argument calls
            keep working.
    """
    # Resolve the bucket first so a missing global fails before any work is
    # done, instead of after the Avro payload has been built.
    target_bucket = s3_bucket_name if bucket_name is None else bucket_name

    avro_schema = creaate_avro_schema(name_table)
    today = get_today_date()

    avro_bytes = io.BytesIO()
    fastavro.writer(avro_bytes, avro_schema, data)

    object_key = f"backup/{today}/{name_table}_table.avro"
    s3.Object(target_bucket, object_key).put(Body=avro_bytes.getvalue())

if __name__ == "__main__":
    # Secrets must never live in the notebook: they are read from the
    # environment. The database password and AWS keys that used to be
    # hard-coded here should be considered leaked and rotated.
    user = "admin"
    password = os.environ.get("DB_PASSWORD")
    if password is None:
        raise RuntimeError("Set the DB_PASSWORD environment variable before running the backup.")
    host = "mydb.cjt7teobtbru.us-east-1.rds.amazonaws.com"
    port = 3306
    db = "Globant"
    list_name_table = ['jobs', 'departments', 'hired_employees']

    # When these variables are unset, None is passed through and boto3 uses
    # its own credential lookup (shared config, instance role, ...).
    aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
    aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

    aws_region_name = 'us-east-1'
    s3_bucket_name = 'info-globant'
    s3_prefix = ''

    # Connect to AWS; the helper returns None after printing the failure.
    s3 = connect_aws(aws_access_key_id, aws_secret_access_key, aws_region_name)
    if s3 is None:
        raise RuntimeError("Could not create the S3 resource; see the message above.")

    for name_table in list_name_table:
        # read_db closes the connection it is given, so open one per table.
        conn = connect_bd(user, password, host, port, db)
        if conn is None:
            raise RuntimeError(f"Could not connect to database {db!r}; see the message above.")
        # Read the whole table as a list of record dicts.
        data = read_db(conn, name_table, db)
        # Upload the records as an Avro file.
        avro_to_s3(name_table, data, s3)




Connected to AWS S3 in us-east-1 region
Connected to db  Globant 
jobs
Connected to db  Globant 
departments
Connected to db  Globant 
hired_employees


In [None]:
# Show only a short preview: the full list can be large and the
# hired_employees records include people's names.
data[:5]

In [None]:
# Hand-written sample records using the same keys as the hired_employees
# Avro schema (id, name, datetime, department_id, job_id).
# NOTE(review): `parser` and `nan` are not imported in any visible cell —
# presumably `from dateutil import parser` and `from math import nan` (or
# numpy's nan); confirm and add the imports, otherwise this cell raises
# NameError on a fresh kernel.
# NOTE(review): this rebinds `data`, replacing the records left behind by the
# backup loop.
data = [{'id': 1, 'name': 'Harold Vogt', 'datetime': parser.parse('2021-11-07T02:48:42Z'), 'department_id': 2.0, 'job_id': 96.0},    {'id': 2, 'name': 'Ty Hofer', 'datetime': parser.parse('2021-05-30T05:43:46Z'), 'department_id': 8.0, 'job_id': nan},    {'id': 3, 'name': 'Lyman Hadye', 'datetime': parser.parse('2021-09-01T23:27:38Z'), 'department_id': 5.0, 'job_id': 52.0},    {'id': 4, 'name': 'Lotti Crowthe', 'datetime': parser.parse('2021-10-01T13:04:21Z'), 'department_id': 12.0, 'job_id': 71.0}]
