# ETL Pipeline - Uber Dataset

In [1]:
import logging
import os
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import TIMESTAMP, CHAR, Enum, FLOAT, INTEGER, VARCHAR

# Configure the root logger once for the whole notebook: INFO level and above,
# with each record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Etapa 0 - Criando Função de Conexão com o Banco

In [2]:
def connect_to_postgres(retries=5, delay=5):
    """Open a SQLAlchemy engine and a live connection to the Postgres database.

    Connection settings are read from the environment, each with a fallback:
    ``POSTGRES_USER`` ('admin'), ``POSTGRES_PASSWORD`` ('admin'),
    ``POSTGRES_DB`` ('postgres') and ``POSTGRES_HOST`` ('postgres').

    Args:
        retries: Maximum number of connection attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        ``(engine, connection)`` on success, or ``(None, None)`` once every
        attempt has failed. Failures are logged, never raised.
    """
    db_user = os.getenv('POSTGRES_USER', 'admin')
    db_password = os.getenv('POSTGRES_PASSWORD', 'admin')
    db_name = os.getenv('POSTGRES_DB', 'postgres')
    db_host = os.getenv('POSTGRES_HOST', 'postgres')

    # Percent-encode the credentials so characters such as '@', '/' or ':'
    # cannot be mistaken for URL delimiters. quote(..., safe='') is used
    # instead of quote_plus so that spaces become %20 rather than '+'.
    conn_string = (
        f"postgresql://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
        f"@{db_host}/{db_name}"
    )

    for attempt in range(1, retries + 1):
        engine = None
        try:
            logging.info("Tentando conectar ao Postgres...")
            engine = create_engine(conn_string)
            connection = engine.connect()
            logging.info("Conexão com o Postgres estabelecida com sucesso!")
            return engine, connection
        except Exception as e:
            logging.error(f"Falha ao conectar: {e}")
            # Release whatever the failed engine may be holding before retrying.
            if engine is not None:
                engine.dispose()
            remaining = retries - attempt
            # Only wait when another attempt will actually follow.
            if remaining > 0:
                logging.info(f"Tentando novamente em {delay} segundos... ({remaining} tentativas restantes)")
                time.sleep(delay)

    logging.critical("Não foi possível conectar ao banco de dados.")
    return None, None

# Etapa 1 - Extract

In [3]:
# Connect first so the run aborts before any file work when the database
# cannot be reached.
engine, connection = connect_to_postgres()
if not engine:
    raise RuntimeError("Não foi possível conectar ao banco de dados. Abortando o ETL.")

# Extract: read the raw CSV into memory and report how many rows came in.
logging.info("Iniciando Etapa 1: Extract")
raw_path = '/raw/uber-dataset.csv'
df_raw = pd.read_csv(filepath_or_buffer=raw_path)
logging.info(f"{df_raw.shape[0]} linhas lidas do arquivo CSV.")

2025-10-10 00:10:28,999 - INFO - Tentando conectar ao Postgres...


2025-10-10 00:10:29,052 - INFO - Conexão com o Postgres estabelecida com sucesso!


2025-10-10 00:10:29,053 - INFO - Iniciando Etapa 1: Extract


2025-10-10 00:10:29,680 - INFO - 150000 linhas lidas do arquivo CSV.


# Etapa 2 - Transform

In [None]:
logging.info("Iniciando Etapa 2: Transform")

# Work on a copy so df_raw stays untouched and this cell can be re-run safely.
df = df_raw.copy()
# Normalise the headers: trimmed, lower-case, spaces replaced by underscores.
df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

# Merge the separate date and time columns into one timestamp; values that
# cannot be parsed become NaT instead of raising.
df['date_time'] = pd.to_datetime(df['date'] + ' ' + df['time'], errors='coerce')

# Remove the literal double quotes wrapped around the identifiers.
for id_col in ('booking_id', 'customer_id'):
    df[id_col] = df[id_col].str.strip('"')

numeric_cols = ['avg_vtat', 'avg_ctat', 'booking_value', 'ride_distance', 'driver_ratings', 'customer_rating']

# Force the measure columns to numeric; non-numeric entries become NaN.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')


# Collapse the two cancellation flag columns into a single "who cancelled"
# column. np.select takes the first matching condition, so a row flagged in
# both columns is reported as 'customer'; rows with neither flag get None.
conditions = [
    df['cancelled_rides_by_customer'].notna(),
    df['cancelled_rides_by_driver'].notna()
]
choices = ['customer', 'driver']
df['cancelled_by'] = np.select(conditions, choices, default=None)

# Single cancellation reason: the customer's reason when present, otherwise
# the driver's.
df['reason_for_cancelling'] = df['reason_for_cancelling_by_customer'].fillna(df['driver_cancellation_reason'])

# Source column -> final column name. The keys also define which columns are
# kept and in which order.
final_columns = {
    'date_time': 'date_time',
    'booking_id': 'booking_id',
    'booking_status': 'booking_status',
    'customer_id': 'customer_id',
    'vehicle_type': 'vehicle_type',
    'pickup_location': 'pickup_location',
    'drop_location': 'drop_location',
    'avg_vtat': 'avg_vtat',
    'avg_ctat': 'avg_ctat',
    'cancelled_by': 'cancelled_by',
    'reason_for_cancelling': 'reason_for_cancelling',
    'incomplete_rides_reason': 'incomplete_ride_reason',
    'booking_value': 'booking_value',
    'ride_distance': 'ride_distance',
    'driver_ratings': 'driver_rating',
    'customer_rating': 'customer_rating',
    'payment_method': 'payment_method'
}

# Select, rename and cast in one chain so df_final is always a fresh frame
# (no assignment on a possible slice of df). 'Int64' is pandas' nullable
# integer dtype, so missing booking values stay missing.
# NOTE(review): the cast presumably requires every non-null booking_value to
# be a whole number - confirm against the source data.
df_final = (
    df[list(final_columns)]
    .rename(columns=final_columns)
    .astype({'booking_value': 'Int64'})
)

logging.info("Transformação concluída. Schema final do DataFrame:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df_final.info()
print(df_final.head())


2025-10-10 00:10:29,690 - INFO - Iniciando Etapa 2: Transform


2025-10-10 00:10:29,754 - INFO - Limpando aspas dos IDs...


2025-10-10 00:10:29,941 - INFO - Transformação concluída. Schema final do DataFrame:


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   date_time               150000 non-null  datetime64[ns]
 1   booking_id              150000 non-null  object        
 2   booking_status          150000 non-null  object        
 3   customer_id             150000 non-null  object        
 4   vehicle_type            150000 non-null  object        
 5   pickup_location         150000 non-null  object        
 6   drop_location           150000 non-null  object        
 7   avg_vtat                139500 non-null  float64       
 8   avg_ctat                102000 non-null  float64       
 9   cancelled_by            37500 non-null   object        
 10  reason_for_cancelling   37500 non-null   object        
 11  incomplete_ride_reason  9000 non-null    object        
 12  booking_value           102000

# Etapa 3 - Load

In [None]:
logging.info("Iniciando Etapa 3: Load")

# Explicit SQL type for each column of df_final. Every key must be spelled
# exactly like the DataFrame column it describes.
# NOTE(review): to_sql appears to ignore dtype entries whose key is not a
# column, so a misspelled key silently falls back to the default type.
sql_types = {
    'date_time': TIMESTAMP,
    'booking_id': CHAR(10),
    'booking_status': Enum('No Driver Found', 'Incomplete', 'Completed', 'Cancelled by Driver', 'Cancelled by Customer', name='booking_status_enum'),
    'customer_id': CHAR(10),
    'vehicle_type': Enum('eBike', 'Go Sedan', 'Auto', 'Premier Sedan', 'Bike', 'Go Mini', 'Uber XL', name='vehicle_type_enum'),
    'pickup_location': VARCHAR(255),
    'drop_location': VARCHAR(255),
    'avg_vtat': FLOAT,
    'avg_ctat': FLOAT,
    'cancelled_by': Enum('customer', 'driver', 'none', name='cancelled_by_enum'),
    'reason_for_cancelling': Enum('Driver is not moving towards pickup location', 'Driver asked to cancel', 'AC is not working', 'Change of plans', 'Wrong Address', 'Personal & Car related issues', 'Customer related issue', 'More than permitted people in there', 'The customer was coughing/sick', name='cancellation_reason_enum'),
    # Key matches the renamed DataFrame column 'incomplete_ride_reason'
    # (singular "ride"); the previous 'incomplete_rides_reason' matched nothing.
    'incomplete_ride_reason': Enum('Vehicle Breakdown', 'Other Issue', 'Customer Demand', name='incomplete_reason_enum'),
    'booking_value': INTEGER,
    'ride_distance': FLOAT,
    'driver_rating': FLOAT,
    'customer_rating': FLOAT,
    'payment_method': Enum('UPI', 'Debit Card', 'Cash', 'Uber Wallet', 'Credit Card', name='payment_method_enum')
}

try:
    table_name = 'uber_silver'
    # if_exists='replace' drops and recreates the table on every run.
    df_final.to_sql(table_name, engine, if_exists='replace', index=False, dtype=sql_types)
    logging.info(f"Dados carregados com sucesso na tabela '{table_name}'!")
except Exception as e:
    logging.error(f"Erro ao carregar dados no banco: {e}")
    # Re-raise so a failed load stops the pipeline instead of passing silently.
    raise
finally:
    # Always release database resources, whether the load succeeded or not.
    connection.close()
    engine.dispose()
    logging.info("Conexão com o banco de dados fechada.")


2025-10-10 00:10:29,997 - INFO - Iniciando Etapa 3: Load


2025-10-10 00:10:41,194 - INFO - Dados carregados com sucesso na tabela 'uber_silver'!


2025-10-10 00:10:41,195 - INFO - Conexão com o banco de dados fechada.
