# INICIAR EL CONTENEDOR QUE YA TIENE TODAS LAS DEPENDENCIAS DE SPARK Y PYSPARK

In [None]:
# docker run --name labmultilanguage -p 8888:8888 -p 4040:4040 -p 5006:5006 -p 3000:3000 jorgecardona/jupyterlabmultilanguages:v1

# descargar la última versión del conector JDBC de PostgreSQL para conectar pyspark con postgres
### https://jdbc.postgresql.org/download/

### asumiendo que la ultima version del conector es postgresql-42.6.0.jar

# Copy the connector jar into Spark's jars folder so PySpark can connect with it.
cp postgresql-42.6.0.jar /usr/local/spark/jars

# PROBAR CONEXION A UNA BASE DE DATOS LOCAL DESDE UN CONTENEDOR
## USAR host = "host.docker.internal" EN VEZ DE host = "localhost"

In [1]:
# !pip install psycopg2
import psycopg2

def ping_database(host, port, database, user, password):
    """Check whether a PostgreSQL server accepts a connection.

    Opens a connection with ``psycopg2.connect`` using the given parameters
    and closes it right away. The outcome is printed and also returned, so
    callers can branch on it instead of parsing the printed text.

    Args:
        host: Server host name or address.
        port: Server port.
        database: Name of the database to connect to.
        user: Login user name.
        password: Password for ``user``.

    Returns:
        bool: True if the connection was opened and closed without error,
        False if any exception was raised while doing so.
    """
    try:
        # Try to open a connection and release it immediately.
        connection = psycopg2.connect(
            host=host,
            port=port,
            database=database,
            user=user,
            password=password
        )
        connection.close()
    except Exception as e:
        # Best-effort probe: report the failure instead of raising.
        print(f"Error al conectar a la base de datos: {e}")
        return False
    else:
        print("La conexión fue exitosa. El servidor PostgreSQL está accesible.")
        return True

import os

# PostgreSQL connection settings. Each value can be overridden through an
# environment variable (named after libpq's standard variables); the fallbacks
# are the demo values this notebook was written with.
# "host.docker.internal" is used instead of "localhost" so that the container
# can reach a database running on the local machine.
host = os.environ.get("PGHOST", "host.docker.internal")
port = os.environ.get("PGPORT", "5432")  # default PostgreSQL port
database = os.environ.get("PGDATABASE", "postgres")
user = os.environ.get("PGUSER", "postgres")
# NOTE(review): the fallback password is a local demo value; set PGPASSWORD
# instead of committing a real credential here.
password = os.environ.get("PGPASSWORD", '12345678')

# Run the connection test
ping_database(host, port, database, user, password)

La conexión fue exitosa. El servidor PostgreSQL está accesible.


#  CREAR UNA SESION EN SPARK

In [2]:
from pyspark.sql import SparkSession

# Path of the PostgreSQL JDBC driver jar that is passed to Spark below.
ruta_ubicacion_driver_conector_postgresql = "/usr/local/spark/jars/postgresql-42.6.0.jar"

# Build the Spark session (or reuse an existing one), pointing "spark.jars"
# at the driver jar.
spark = (
    SparkSession.builder
    .appName("Conexion_PostgreSQL")
    .config("spark.jars", ruta_ubicacion_driver_conector_postgresql)
    .getOrCreate()
)

23/07/21 04:17:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# si se quiere conectar desde un contenedor a un recurso local, se debe usar host.docker.internal en vez de localhost
def leer_datos_postgres_convertir_a_pandas(
    instancia_spark,
    tabla,
    schema="public",
    host="host.docker.internal",
    puerto="5432",
    database="postgres",
    user="postgres",
    password="12345678",
):
    """Read a PostgreSQL table through Spark's JDBC reader and return ``toPandas()`` of it.

    The connection settings used to be fixed inside the body; they are now
    keyword parameters whose defaults are those same values, so existing
    calls behave as before.

    Args:
        instancia_spark: Object exposing ``read.jdbc(...)`` (the Spark session).
        tabla: Table name, without schema.
        schema: Schema that contains the table.
        host: Database host. The default ``host.docker.internal`` is what a
            container must use instead of ``localhost`` to reach a local
            resource.
        puerto: Database port.
        database: Database name.
        user: Login user name.
        password: Password for ``user``.
            NOTE(review): the default is a demo credential; pass a real one
            explicitly rather than editing it here.

    Returns:
        Whatever ``toPandas()`` returns for the DataFrame read from
        ``schema.tabla``.
    """
    url = f"jdbc:postgresql://{host}:{puerto}/{database}"

    # All property values are coerced to strings, as the original code did.
    properties = {
        "user": str(user),
        "password": str(password),
        "driver": "org.postgresql.Driver",
    }

    # Schema-qualified table name handed to the JDBC reader.
    tabla_calificada = f"{schema}.{tabla}"

    spark_dataframe = instancia_spark.read.jdbc(
        url=url, table=tabla_calificada, properties=properties
    )
    return spark_dataframe.toPandas()

# CREAR LOS DATAFRAMES DE PANDAS

In [6]:
# Load both tables from the "public" schema using the Spark session.
df1 = leer_datos_postgres_convertir_a_pandas(
    instancia_spark=spark, tabla="vuelos_1", schema="public"
)
df2 = leer_datos_postgres_convertir_a_pandas(
    instancia_spark=spark, tabla="vuelos_2", schema="public"
)

                                                                                

# EXPLORAR EL CONTENIDO DE LOS DATAFRAMES

In [7]:
# Show the first rows of df1 (loaded from the "vuelos_1" table).
df1.head()

Unnamed: 0,id,secure_code,airline,departure_city,departure_date,arrival_airport,arrival_city,arrival_time,passenger_name,passenger_gender,seat_number,currency,departure_gate,flight_status,co_pilot_name,aircraft_type,fuel_consumption
0,1,01H4EEMGMG9VADVF06JZGJJGN0,EasyFly,Berlin,25/12/2022,PEI,Pereira,27/12/2022 14:13,Nathalie Cardona,Female,A1,EUR,B2,On Time,Hart Blunkett,Embraer E190,7916.39
1,2,01H4EEMGMYP8GX4GRC4Y2MPYH5,Delta,Les Sables-d'Olonne,4/1/2022,YHU,Westport,6/7/2023 06:53,Willie Childrens,Female,B2,EUR,A1,Delayed,Leanor Gribbins,Airbus A320,9666.36
2,3,01H4EEMGN3BZJ9RR779KDEB2WG,United,Oyo,3/4/2022,KWJ,Jabon,6/7/2023 03:44,Fifine Luten,Female,B2,NGN,C3,On Time,Christie Wakeley,Boeing 737,8047.44
3,4,01H4EEMGN9DFF5XJC1QPSRQCTR,Delta,Kuragaki-kosugi,31/5/2022,ANP,Xianyuan,6/7/2023 18:56,Doll Sommerscales,Female,C3,JPY,A1,Delayed,Mia Vannah,Airbus A320,5156.19
4,5,01H4EEMGNFX6T41V6RSST9YVPF,Delta,Ko Pha Ngan,10/7/2022,QUB,Sovetskaya Gavan’,6/7/2023 16:55,Norman Crosen,Male,A1,THB,A1,On Time,Barn Timmes,Boeing 737,7584.07


In [8]:
# Show the first rows of df2 (loaded from the "vuelos_2" table).
df2.head()

Unnamed: 0,flight_id,flight_number,departure_airport,departure_country,departure_time,arrival_country,arrival_date,flight_duration,passenger_age,passenger_nationality,ticket_price,baggage_weight,arrival_gate,pilot_name,cabin_crew_count,aircraft_registration,flight_distance
0,1,1978,CFQ,Germany,6/7/2023 04:42,Colombia,27/12/2022,14.18,0,Colombia,797.24,43.85,E5,Sunny Few,9,N12345,1400.24
1,2,2337,ONG,France,6/7/2023 17:31,New Zealand,23/12/2022,13.54,29,Sweden,383.63,35.78,D4,Donielle Strut,4,N12345,465.84
2,3,7588,TEX,Nigeria,6/7/2023 11:08,Indonesia,30/9/2022,6.28,34,Argentina,439.09,47.81,D4,Shelly Paddefield,3,N67890,3151.78
3,4,7545,ORB,Japan,6/7/2023 11:11,China,14/10/2022,5.6,34,China,706.19,25.79,D4,Babara Kretschmer,6,N12345,264.64
4,5,4553,WEW,Thailand,6/7/2023 10:40,Russia,13/10/2022,14.71,40,Uruguay,906.66,11.6,E5,Bert Mathison,5,N67890,2158.97


In [9]:
# Stop the Spark session.
spark.stop()