# ETL

In [1]:
import pandas as pd
import hashlib
import time
from datetime import datetime

# Name of the export file to load; etl() also records it in the "origem" column.
filename = "SouJunior_competitor_analytics_1717580626326.xlsx"

# dtype=str requests string dtype for every column.
# skiprows=1 is required for files of the "Concorrentes" (competitors) category.
# To read a different sheet, additionally pass e.g. sheet_name=1.
df = pd.read_excel(f"files/{filename}", skiprows=1, dtype=str)

  warn("Workbook contains no default style, apply openpyxl's default")


In [4]:
def generate_hash_id(*args, sep=""):
    """Return the SHA-256 hex digest of the given values joined into one string.

    Args:
        *args: Values to hash. Each one is converted with ``str()`` first, so
            non-string values (numbers, ``None``) are accepted as well.
        sep: Text placed between consecutive values. The default ``""`` keeps
            plain concatenation, under which ``("ab", "c")`` and ``("a", "bc")``
            produce the same digest; pass a delimiter that cannot occur in the
            data when hashing several fields.

    Returns:
        str: Hexadecimal SHA-256 digest of the joined, encoded text.
    """
    row_str = sep.join(str(arg) for arg in args)
    return hashlib.sha256(row_str.encode()).hexdigest()


def etl(df, origem=None):
    """Normalise the column names and add the ingestion metadata columns.

    The DataFrame is modified in place and is also returned.

    Args:
        df: DataFrame read from the export file.
        origem: Value stored in the ``origem`` column. When omitted, the
            notebook-level ``filename`` variable is used.

    Returns:
        The same DataFrame, with lower-cased column names (spaces replaced by
        underscores, parentheses removed) plus the columns ``id``, ``origem``,
        ``timestamp_ingestão``, ``data_competencia`` and ``fonte``.
    """
    if origem is None:
        origem = filename

    # str() lets non-text headers go through the same normalisation.
    df.columns = [
        str(col).lower().replace(" ", "_").replace("(", "").replace(")", "")
        for col in df.columns
    ]

    # Hash only the source columns: leaving out the columns this function adds
    # keeps the id stable when the cell is re-run on an already processed frame.
    # On a first run none of them exist, so the hashed text is unchanged.
    derived_columns = [
        "id",
        "origem",
        "timestamp_ingestão",
        "data_competencia",
        "fonte",
    ]
    source_columns = df.drop(columns=derived_columns, errors="ignore")
    # NOTE(review): str(row) is pandas' text rendering of the row; it includes
    # the row's index label and may be affected by pandas display options, so
    # the id depends on row position. Left as is so existing ids stay the same.
    df["id"] = source_columns.apply(
        lambda row: generate_hash_id(str(row)), axis=1
    )
    df["origem"] = origem

    # Read the clock once so both date columns always refer to the same instant
    # (two separate calls could fall on either side of a month boundary).
    now = datetime.now()
    df["timestamp_ingestão"] = now.strftime("%Y-%m-%d %H:%M:%S")
    df["data_competencia"] = now.strftime("%Y-%m-01")
    df["fonte"] = "linkedin"
    return df

In [5]:
# Run the transformation. etl() returns the frame it was given, so `df` now
# has the normalised column names and the added metadata columns.
df = etl(df)
# Bare expression: renders the resulting frame as the cell output.
df

Unnamed: 0,page,total_de_seguidores,novos_seguidores,total_de_engajamentos_da_publicação,total_de_publicações,id,origem,timestamp_ingestão,data_competencia,fonte
0,SouJunior,95534,3583,610,44,15db44d0a04a84e016a76e1ee7dc801e3610e3afac9afb...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 01:07:25,2025-01-01,linkedin
1,Le Wagon,78844,1004,547,9,654608c808596c76b65fc899071e4f8dece829fd725cd5...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 01:07:25,2025-01-01,linkedin
2,GitHub Brasil,153069,4219,1139,9,df6a950ecfc5e3818add69b1584a288c016d2474cab84a...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 01:07:25,2025-01-01,linkedin
3,Tech Pro Bem,13677,310,5,0,16f320793b9526553d2f39a459973c1aff2e1fa9d69901...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 01:07:25,2025-01-01,linkedin
4,Trampar na Gringa,137777,4991,1213,12,f5fec2a0f6619162244f75783add2deaad7b8a386ddb0f...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 01:07:25,2025-01-01,linkedin
5,VagasUX,44338,550,627,16,94a85411afe535ec8d1d315c292e96415d7644da406997...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 01:07:25,2025-01-01,linkedin
6,Bichinhos da TI,9321,960,100,4,9f9c3f5d1128a337f9540a2e04ab50cd5da521cc17e408...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 01:07:25,2025-01-01,linkedin


# Ingest

In [6]:
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv

from sqlalchemy.engine import URL

# Load variables from a .env file, then read the database settings.
load_dotenv()

username = os.getenv("DB_USERNAME")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
database = os.getenv("DB_DATABASE")

# Fail early with a clear message instead of building a connection URL that
# contains the literal text "None" for an unset variable.
missing_settings = [
    name
    for name, value in (
        ("DB_USERNAME", username),
        ("DB_HOST", host),
        ("DB_DATABASE", database),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing database settings: " + ", ".join(missing_settings)
    )

# URL.create escapes each component, so a password containing characters such
# as "@", ":" or "/" cannot corrupt the connection string.
engine = create_engine(
    URL.create(
        "mysql+pymysql",
        username=username,
        password=password,
        host=host,
        port=int(port) if port else None,
        database=database,
    )
)

# Connectivity check: prints 1 when the database answers.
with engine.connect() as conn:
    result = conn.execute(text("SELECT 1"))
    print(result.scalar())

1


In [23]:
def _quote_identifier(name):
    """Wrap ``name`` in MySQL backticks, doubling any backtick inside it."""
    return "`" + str(name).strip("`").replace("`", "``") + "`"


def create_query(df_columns, table_name):
    """Build the ``CREATE TABLE IF NOT EXISTS`` statement for a DataFrame.

    ``id``, ``timestamp_ingestão`` and ``data_competencia`` always get fixed
    types (VARCHAR(255) primary key, TIMESTAMP and DATE); every other column
    is created as TEXT.

    Identifiers are backtick-quoted so that column names containing characters
    outside the plain identifier set, or matching reserved words, still yield
    valid MySQL DDL.

    Args:
        df_columns: Iterable of column names.
        table_name: Target table name; a dotted name such as ``schema.table``
            is quoted part by part.

    Returns:
        str: The DDL statement, terminated by ``;``.
    """
    fixed_names = ("id", "timestamp_ingestão", "data_competencia")
    definitions = [
        "`id` VARCHAR(255) PRIMARY KEY",
        "`timestamp_ingestão` TIMESTAMP",
        "`data_competencia` DATE",
    ]
    # Columns with a fixed type are already listed above; skip them here.
    definitions += [
        f"{_quote_identifier(col)} TEXT"
        for col in df_columns
        if col not in fixed_names
    ]

    quoted_table = ".".join(
        _quote_identifier(part) for part in str(table_name).split(".")
    )
    return f"CREATE TABLE IF NOT EXISTS {quoted_table} ({', '.join(definitions)});"


def create_table(df, table_name):
    """Create ``table_name`` if it does not exist, with one column per DataFrame column.

    The statement comes from ``create_query`` and is executed on the
    notebook-level ``engine``. Failures are reported on stdout instead of
    being raised, so the function always returns ``None``.

    Args:
        df: DataFrame whose column names define the table columns.
        table_name: Name of the table to create.
    """
    df_columns = df.columns.to_list()
    query = create_query(df_columns, table_name)

    try:
        # engine.begin() commits when the block succeeds and rolls back on
        # error. A plain engine.connect() issues no commit of its own, so the
        # statement would only persist where the backend commits DDL implicitly.
        with engine.begin() as conn:
            conn.execute(text(query))
            print("tabela criada")

    except Exception as e:
        print("erro ao criar tabela")
        print(e)

# Create the destination table from df's columns; the statement uses
# CREATE TABLE IF NOT EXISTS, so an existing table is left untouched.
create_table(df, "linkedin_concorrentes_concorrentes")

tabela criada


In [25]:
def ingest_data(df, table_name):
    """Append the rows of ``df`` to ``table_name`` through the notebook-level ``engine``.

    The DataFrame index is not written. Any exception is printed instead of
    being raised, so the function always returns ``None``.

    Args:
        df: DataFrame holding the rows to insert.
        table_name: Name of the destination table.
    """
    try:
        write_options = dict(con=engine, if_exists="append", index=False)
        df.to_sql(table_name, **write_options)
        print("Dados inseridos com sucesso!")
    except Exception as error:
        print(error)

# Append the transformed rows to the destination table.
# NOTE: re-running this cell attempts to append the same rows again.
ingest_data(df, "linkedin_concorrentes_concorrentes")

Dados inseridos com sucesso!
