# ETL

In [24]:
import pandas as pd
import hashlib
import unidecode
from datetime import datetime

# Name of the competitor-analytics export to load from the "files/" folder.
filename = "SouJunior_competitor_analytics_1717580626326.xlsx"

# Every cell is read as text. The first spreadsheet row is skipped, which is
# required for files of the "Concorrentes" (competitors) category.
df = pd.read_excel(
    f"files/{filename}",
    skiprows=1,
    dtype=str,
    # sheet_name=1,
)

  warn("Workbook contains no default style, apply openpyxl's default")


In [38]:
def generate_hash_id(*args):
    """Return the SHA-256 hex digest of the concatenated arguments.

    Every argument is converted with ``str()`` and the pieces are joined
    with no separator, so ``("ab", "c")`` and ``("a", "bc")`` yield the same
    digest. For all-string input the result is the same as hashing a plain
    ``"".join(args)``.

    Args:
        *args: Values to hash. Non-string values are accepted and stringified.

    Returns:
        str: The 64-character lowercase hexadecimal digest.
    """
    # map(str, ...) keeps join() from raising TypeError on ints, None, etc.
    row_str = "".join(map(str, args))
    return hashlib.sha256(row_str.encode()).hexdigest()

def normalize_string(s):
    """Convert a string into an ASCII, lower-case, underscore-separated form.

    The text is transliterated with ``unidecode`` and lower-cased; then each
    space becomes an underscore and parentheses are removed.
    """
    ascii_lower = unidecode.unidecode(s).lower()
    # Single translate() pass: " " -> "_", "(" and ")" -> dropped.
    return ascii_lower.translate(str.maketrans({" ": "_", "(": None, ")": None}))

def etl(df, origem=None):
    """Normalize a raw export and attach the ingestion metadata columns.

    Column labels are passed through ``normalize_string``, missing values are
    replaced by the string "0", and the columns ``id``, ``origem``,
    ``timestamp_ingestao``, ``data_competencia`` and ``fonte`` are added.
    The frame passed in is not modified; a new frame is returned.

    Args:
        df: Source DataFrame whose column labels are strings.
        origem: Value stored in the ``origem`` column. When omitted, the
            module-level ``filename`` is used.

    Returns:
        pandas.DataFrame: The transformed copy.
    """
    source_name = filename if origem is None else origem
    # Read the clock once so both date columns describe the same instant,
    # even when the call happens right at a month boundary.
    now = datetime.now()

    # fillna() returns a new frame, so relabelling it below leaves the
    # caller's frame (and its column labels) untouched.
    out = df.fillna("0")
    out.columns = [normalize_string(col) for col in df.columns]

    # NOTE(review): the id hashes pandas' text rendering of the whole row,
    # which presumably also carries the row label and dtype, so it may vary
    # with row position and display settings. Kept as-is so ids stay stable.
    out["id"] = out.apply(lambda row: generate_hash_id(str(row)), axis=1)
    out["origem"] = source_name
    out["timestamp_ingestao"] = now.strftime("%Y-%m-%d %H:%M:%S")
    out["data_competencia"] = now.strftime("%Y-%m-01")
    out["fonte"] = "linkedin"
    return out

In [26]:
# Run the transformation and rebind `df` to the enriched frame.
# NOTE(review): this cell is not idempotent. Running it again feeds the
# already-enriched frame back into etl(), so the previously added metadata
# columns become part of the text that is hashed into `id`.
df = etl(df)
df

Unnamed: 0,page,total_de_seguidores,novos_seguidores,total_de_engajamentos_da_publicacao,total_de_publicacoes,id,origem,timestamp_ingestao,data_competencia,fonte
0,SouJunior,95534,3583,610,44,7b5409341073faec3ac90b84cefc538489f39662c6d4e2...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
1,Le Wagon,78844,1004,547,9,29d79ab4f7647dda93fff8ce46c33fba1f463516a68b3c...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
2,GitHub Brasil,153069,4219,1139,9,7d02958b39851cad5ac32dfa0ad54581a481ed5a081bda...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
3,Tech Pro Bem,13677,310,5,0,1d9f1c2cd15b4dce73be88476e08101500a63735f27e60...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
4,Trampar na Gringa,137777,4991,1213,12,0bc62754d6390c36b370634e355d62c0e66998098fcfc4...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
5,VagasUX,44338,550,627,16,0a38f162396a40a69b3ca300bcb2be0e0c321573ae6614...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
6,Bichinhos da TI,9321,960,100,4,3ee2abc001014fc46fde96f16f3ca323535d876dc4fc7f...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin


# Ingest

In [27]:
import os

from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Connection settings are read from the environment, never hard-coded.
username = os.getenv("DB_USERNAME")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
database = os.getenv("DB_DATABASE")

# Fail early and name the unset variables, instead of letting the literal
# text "None" end up inside the connection URL.
missing = [
    name
    for name in ("DB_USERNAME", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_DATABASE")
    if os.getenv(name) is None
]
if missing:
    raise RuntimeError("Missing database settings: " + ", ".join(missing))

# URL.create takes each component separately, so characters such as "@",
# "/" or ":" in the user name or password cannot corrupt the URL the way
# raw string interpolation can.
engine = create_engine(
    URL.create(
        "mysql+pymysql",
        username=username,
        password=password,
        host=host,
        port=int(port),
        database=database,
    )
)

# Smoke test: a trivial query confirms that the database is reachable.
with engine.connect() as conn:
    result = conn.execute(text("SELECT 1"))
    print(result.scalar())

1


In [33]:
# Sanity check before ingestion: print the column labels, then display the
# frame itself as the cell output.
print(df.columns)
df

Index(['page', 'total_de_seguidores', 'novos_seguidores',
       'total_de_engajamentos_da_publicacao', 'total_de_publicacoes', 'id',
       'origem', 'timestamp_ingestao', 'data_competencia', 'fonte'],
      dtype='object')


Unnamed: 0,page,total_de_seguidores,novos_seguidores,total_de_engajamentos_da_publicacao,total_de_publicacoes,id,origem,timestamp_ingestao,data_competencia,fonte
0,SouJunior,95534,3583,610,44,7b5409341073faec3ac90b84cefc538489f39662c6d4e2...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
1,Le Wagon,78844,1004,547,9,29d79ab4f7647dda93fff8ce46c33fba1f463516a68b3c...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
2,GitHub Brasil,153069,4219,1139,9,7d02958b39851cad5ac32dfa0ad54581a481ed5a081bda...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
3,Tech Pro Bem,13677,310,5,0,1d9f1c2cd15b4dce73be88476e08101500a63735f27e60...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
4,Trampar na Gringa,137777,4991,1213,12,0bc62754d6390c36b370634e355d62c0e66998098fcfc4...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
5,VagasUX,44338,550,627,16,0a38f162396a40a69b3ca300bcb2be0e0c321573ae6614...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin
6,Bichinhos da TI,9321,960,100,4,3ee2abc001014fc46fde96f16f3ca323535d876dc4fc7f...,SouJunior_competitor_analytics_1717580626326.xlsx,2025-01-14 20:12:23,2025-01-01,linkedin


In [36]:
# DDL for the destination table. `id` is the primary key, the four metric
# columns are stored as INTEGER, and IF NOT EXISTS makes the statement a
# no-op when the table is already present.
query = """CREATE TABLE IF NOT EXISTS linkedin_concorrentes_concorrentes (
id VARCHAR(255) PRIMARY KEY,
timestamp_ingestao TIMESTAMP,
data_competencia DATE,
origem TEXT,
fonte TEXT,
page TEXT,
total_de_seguidores INTEGER,
novos_seguidores INTEGER,
total_de_engajamentos_da_publicacao INTEGER,
total_de_publicacoes INTEGER
);
"""


def create_table(query):
    """Execute a DDL statement through the module-level ``engine``.

    The statement runs inside ``engine.begin()``, so it is committed when the
    block exits without error instead of depending on the backend committing
    DDL implicitly. Failures are reported on stdout and not re-raised.

    Args:
        query: SQL text to execute; it is wrapped in ``text()``.

    Returns:
        None
    """
    try:
        with engine.begin() as conn:
            conn.execute(text(query))
        # Printed only after the transaction block closed without error.
        print("tabela criada")

    except Exception as e:
        print("erro ao criar tabela")
        print(e)


# Create the destination table with the DDL held in `query`.
create_table(query)

tabela criada


In [37]:
def ingest_data(df, table_name):
    """Append the rows of ``df`` to ``table_name`` via the module-level ``engine``.

    The frame's index is not written. Any exception is caught and printed;
    nothing is raised and nothing is returned.
    """
    try:
        write_options = {"con": engine, "if_exists": "append", "index": False}
        df.to_sql(table_name, **write_options)
        print("Dados inseridos com sucesso!")
    except Exception as error:
        print(error)


# Append the transformed rows to the competitors table.
ingest_data(df, "linkedin_concorrentes_concorrentes")

Dados inseridos com sucesso!
