# ETL

In [6]:
import pandas as pd
import hashlib
import time
from datetime import datetime

filename = "soujunior_content_1717580556486.xls"
sheets = ["Métricas", "Todas as publicações"]

# Passing a list to `sheet_name` makes pandas open and parse the workbook once
# and return a dict of DataFrames keyed by sheet name, instead of re-reading
# the same file for every sheet. `dtype` and `skiprows` apply to every sheet.
# skiprows=1 drops the first row of each sheet — presumably a title/description
# line above the real header; confirm against the export.
workbook = pd.read_excel(
    "files/" + filename,
    dtype=str,
    skiprows=1,
    sheet_name=sheets,
)

df_metricas = workbook[sheets[0]]
df_publicacoes = workbook[sheets[1]]

In [28]:
def generate_hash_id(*args):
    """Return the SHA-256 hex digest of the concatenated arguments.

    Every argument is converted with ``str()`` and the pieces are concatenated
    without a separator before hashing, so non-string values (numbers,
    ``None``, ...) are accepted instead of raising ``TypeError``. For string
    arguments the digest is identical to hashing their plain concatenation.

    Note that because no separator is used, ``("ab", "c")`` and
    ``("a", "bc")`` produce the same digest.

    Args:
        *args: values that together identify a record.

    Returns:
        str: 64-character lowercase hexadecimal digest.
    """
    row_str = "".join(map(str, args))
    return hashlib.sha256(row_str.encode("utf-8")).hexdigest()


def etl(df, source_name=None):
    """Normalise column names and add ingestion metadata columns to ``df``.

    The frame is modified in place and the same object is returned.

    Column names are lower-cased, spaces become underscores and parentheses
    are removed. The following columns are then added, in this order:

    - ``id``: SHA-256 hex digest of ``str(row)``, computed after the rename
      and before the other metadata columns exist. The row text includes the
      row's index label, so the id depends on the row's position as well as
      on its values.
    - ``origem``: name of the source file.
    - ``timestamp_ingestão``: current local time, ``YYYY-MM-DD HH:MM:SS``.
    - ``data_competencia``: first day of the current month, ``YYYY-MM-01``.
    - ``fonte``: the constant ``"linkedin"``.

    Args:
        df: DataFrame to transform; its column labels must be strings.
        source_name: value stored in ``origem``. Defaults to the module-level
            ``filename`` when omitted.

    Returns:
        The same DataFrame that was passed in.
    """
    df.columns = [
        col.lower().replace(" ", "_").replace("(", "").replace(")", "")
        for col in df.columns
    ]

    # str(row) goes through pandas' display formatting, which by default
    # abbreviates cell values longer than `display.max_colwidth` and rows with
    # more entries than `display.max_rows`. Hashing that abbreviated text makes
    # rows that differ only in the cut-off part share an id, so truncation is
    # disabled while hashing. Rows that were never truncated keep the text
    # (and therefore the id) they had before.
    with pd.option_context("display.max_colwidth", None, "display.max_rows", None):
        df["id"] = df.apply(lambda row: generate_hash_id(str(row)), axis=1)

    df["origem"] = filename if source_name is None else source_name

    # Take a single clock reading so both date columns describe the same
    # instant even if the call straddles a second/month boundary.
    now = datetime.now()
    df["timestamp_ingestão"] = now.strftime("%Y-%m-%d %H:%M:%S")
    df["data_competencia"] = now.strftime("%Y-%m-01")
    df["fonte"] = "linkedin"
    return df

In [29]:
# Normalise both sheets. etl() modifies the frame it receives and returns it,
# so re-running this cell applies the transformation a second time to the
# already processed frames (the id would then be hashed over the added columns).
df_metricas = etl(df_metricas)
df_publicacoes = etl(df_publicacoes)

# Ingest

In [25]:
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv

from urllib.parse import quote_plus

load_dotenv()

# Connection settings are read from the environment (populated from .env by
# load_dotenv above).
username = os.getenv("DB_USERNAME")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
database = os.getenv("DB_DATABASE")

# Fail fast with a clear message: an unset variable would otherwise be
# formatted into the URL as the literal text "None".
_missing_vars = [
    name
    for name, value in {
        "DB_USERNAME": username,
        "DB_PASSWORD": password,
        "DB_HOST": host,
        "DB_PORT": port,
        "DB_DATABASE": database,
    }.items()
    if value is None
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_vars)
    )

# User name and password are URL-escaped so characters such as "@", ":" or "/"
# cannot corrupt the connection URL.
# NOTE(review): this assumes the values in .env are stored raw (not already
# percent-encoded) — confirm before deploying.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}"
    f"@{host}:{port}/{database}"
)

# Smoke test: prints 1 when the database is reachable with these settings.
with engine.connect() as conn:
    result = conn.execute(text("SELECT 1"))
    print(result.scalar())

1


In [33]:
# query = """
#     CREATE TABLE IF NOT EXISTS linkedin_concorrentes_concorrentes (
#         id VARCHAR(255) PRIMARY KEY,
#         timestamp_ingestão TIMESTAMP,
#         data_competencia DATE,
#         page TEXT,
#         total_de_seguidores TEXT,
#         novos_seguidores TEXT,
#         total_de_engajamentos_da_publicação TEXT,
#         total_de_publicações TEXT,
#         origem TEXT,
#         fonte TEXT
#     );
# """
def create_query(df_columns, table_name):
    """Build a MySQL ``CREATE TABLE IF NOT EXISTS`` statement for a frame.

    The table always starts with three fixed columns — ``id``
    (``VARCHAR(255) PRIMARY KEY``), ``timestamp_ingestão`` (``TIMESTAMP``) and
    ``data_competencia`` (``DATE``). Every other name in ``df_columns`` is
    appended as a ``TEXT`` column, in the order given.

    All identifiers are backtick-quoted so that names which are reserved words
    or contain characters not allowed in bare identifiers (for example ``-``,
    ``%``, ``/`` or spaces) still yield valid SQL. A dotted ``table_name`` is
    treated as ``schema.table`` and each part is quoted separately.

    Args:
        df_columns: iterable of column names.
        table_name: destination table name, optionally schema-qualified.

    Returns:
        str: the complete DDL statement, terminated by ``;``.
    """

    def _quote(identifier):
        # MySQL escapes a backtick inside a quoted identifier by doubling it.
        return "`" + str(identifier).replace("`", "``") + "`"

    fixed_columns = {
        "id": "VARCHAR(255) PRIMARY KEY",
        "timestamp_ingestão": "TIMESTAMP",
        "data_competencia": "DATE",
    }

    definitions = [
        f"{_quote(name)} {sql_type}" for name, sql_type in fixed_columns.items()
    ]
    # The fixed columns are already declared above; skip them if the frame
    # carries them too.
    definitions += [
        f"{_quote(col)} TEXT" for col in df_columns if col not in fixed_columns
    ]

    qualified_table = ".".join(_quote(part) for part in str(table_name).split("."))
    return f"CREATE TABLE IF NOT EXISTS {qualified_table} ({', '.join(definitions)});"


def create_table(df, table_name):
    """Create the destination table for ``df`` if it does not exist yet.

    The DDL is produced by ``create_query`` from the frame's column names and
    executed on the module-level ``engine``. Errors raised while executing the
    statement are caught and printed rather than propagated, so the notebook
    keeps running. Because the statement uses ``IF NOT EXISTS``, the success
    message is also printed when the table was already there.

    Args:
        df: DataFrame whose columns define the table's columns.
        table_name: name of the table to create.

    Returns:
        None
    """
    query = create_query(df.columns.to_list(), table_name)

    try:
        # engine.begin() opens a transaction that is committed when the block
        # exits normally and rolled back on error, so the DDL does not depend
        # on the driver's or backend's implicit-commit behaviour.
        with engine.begin() as conn:
            conn.execute(text(query))
        # Reported only after the transaction has been committed.
        print("tabela criada")

    except Exception as e:
        print("erro ao criar tabela")
        print(e)


# Create one destination table per sheet; each table's columns are derived
# from the column names of the corresponding DataFrame.
create_table(df_metricas, "linkedin_conteudo_metricas")
create_table(df_publicacoes, "linkedin_conteudo_todas_as_publicacoes")

# with engine.connect() as conn:
#     conn.execute(text(query))
#     print("tabela criada")

tabela criada
tabela criada


In [34]:
def ingest_data(df, table_name):
    """Append all rows of ``df`` to the SQL table ``table_name``.

    The insert goes through the module-level ``engine``; the frame's index is
    not written. Any exception raised by the insert is caught and printed
    instead of being propagated, and a confirmation message is printed when
    the insert succeeds.

    Args:
        df: DataFrame holding the rows to insert.
        table_name: name of the destination table.

    Returns:
        None
    """
    try:
        df.to_sql(name=table_name, con=engine, index=False, if_exists="append")
    except Exception as error:
        print(error)
    else:
        print("Dados inseridos com sucesso!")

# Append each processed sheet to its destination table.
ingest_data(df_metricas, "linkedin_conteudo_metricas")
ingest_data(df_publicacoes, "linkedin_conteudo_todas_as_publicacoes")

Dados inseridos com sucesso!
Dados inseridos com sucesso!
