In [None]:
def get_database_configuration(database_type='mysql', host=None, port=None, database=None, table=None, user=None, password=None, input_collection=None, output_collection=None):
    """Build the Spark connection settings for one supported database engine.

    Any argument left as ``None`` (or otherwise falsy) falls back to the
    local Docker default of the selected engine.

    Args:
        database_type: ``'mongodb'``, ``'postgres'`` or ``'mysql'``, matched
            case-insensitively. ``None`` or an unrecognised name selects the
            stock MySQL configuration and ignores every override.
        host: Server host name (default ``'host.docker.internal'``).
        port: Server port (default 27017 / 5432 / 3306 per engine).
        database: Database name (default ``'spark'``).
        table: JDBC table name (default ``'users'``); unused for MongoDB.
        user: Login name (default ``'admin'``).
        password: Login password (default ``'12345678'``).
        input_collection: MongoDB collection to read (default ``'users'``).
        output_collection: MongoDB collection to write (default ``'users'``).

    Returns:
        dict: Settings for the selected engine. All engines provide
        ``app_name``, ``format_type``, ``host``, ``port``, ``user``,
        ``password``, ``database``, ``driver`` and ``url``. MongoDB adds
        ``input_collection`` / ``output_collection``; the JDBC engines add
        ``table``, ``spark_jars`` and ``properties`` (PostgreSQL also
        ``schema``).
    """
    engine = (database_type or 'mysql').lower()
    if engine not in ('mongodb', 'postgres', 'mysql'):
        # Overrides written for an unsupported engine must not leak into the
        # MySQL fallback, so the fallback is always the stock configuration.
        engine = 'mysql'
        host = port = database = table = user = password = None
        input_collection = output_collection = None

    host = host or 'host.docker.internal'
    user = user or 'admin'
    password = password or '12345678'
    database = database or 'spark'

    if engine == 'mongodb':
        port = port or 27017
        return {
            'app_name': 'MongoDB_Connector',
            'format_type': 'mongodb',
            'host': host,
            'port': port,
            'user': user,
            'password': password,
            'database': database,
            # Each collection is governed by its own argument, not by `table`.
            'input_collection': input_collection or 'users',
            'output_collection': output_collection or 'users',
            'driver': 'com.mongodb.spark.sql.DefaultSource',
            # Built from the resolved values so a partial override can never
            # put the text "None" into the URI. Credentials are inserted
            # as-is: callers must percent-encode reserved characters.
            'url': f"mongodb://{user}:{password}@{host}:{port}",
        }

    table = table or 'users'

    if engine == 'postgres':
        port = port or 5432
        return {
            'app_name': 'PostgreSQL_Connector',
            'format_type': 'jdbc',
            'host': host,
            'port': port,
            'user': user,
            'password': password,
            'database': database,
            'table': table,
            'schema': 'public',
            'spark_jars': '/usr/local/spark/jars/postgresql-42.7.1.jar',
            'driver': 'org.postgresql.Driver',
            # Always derived from the resolved host/port/database so the URL
            # agrees with the individual keys above.
            'url': f"jdbc:postgresql://{host}:{port}/{database}",
            'properties': {
                'user': user,
                'password': password,
                'driver': 'org.postgresql.Driver',
            },
        }

    port = port or 3306
    return {
        'app_name': 'MySQL_Connector',
        'format_type': 'jdbc',
        'host': host,
        'port': port,
        'user': user,
        'password': password,
        'database': database,
        'table': table,
        'spark_jars': '/usr/local/spark/jars/mysql-connector-j-8.2.0.jar',
        'driver': 'com.mysql.cj.jdbc.Driver',
        'url': f"jdbc:mysql://{host}:{port}/{database}",
        'properties': {
            'user': user,
            'password': password,
            'driver': 'com.mysql.cj.jdbc.Driver',
        },
    }

def generate_sample_data():
    """Return the demo rows and their column names.

    Returns:
        tuple: ``(rows, header)`` where ``rows`` is a list of ``(id, name)``
        tuples numbered from 1 and ``header`` is ``["Id", "Name"]``.
    """
    names = (
        "Ana", "Cecilia", "Nathalie", "Diana", "Gabriela",
        "Angela", "Tatiana", "Lucia", "Maria",
    )
    # Ids are simply the 1-based position of each name.
    rows = list(enumerate(names, start=1))
    header = ["Id", "Name"]
    return rows, header

def insert_data_to_database(database_configuration, database_type=None):
    """Append the sample rows to the database described by the configuration.

    Args:
        database_configuration: Settings dict as produced by
            ``get_database_configuration``.
        database_type: ``'mongodb'`` selects the MongoDB writer; any other
            value selects the JDBC writer. When omitted, the engine is
            taken from the configuration's ``format_type``.

    Returns:
        str: A success message, or ``"Error inserting data: ..."`` when the
        write raised an ``Exception``.
    """
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import monotonically_increasing_id

    app_name = database_configuration.get('app_name')
    format_type = database_configuration.get('format_type')
    database = database_configuration.get('database')
    user = database_configuration.get('user')
    password = database_configuration.get('password')
    driver = database_configuration.get('driver')
    url = database_configuration.get('url')

    # Without an explicit database_type, trust the configuration itself so a
    # MongoDB configuration is never sent down the JDBC path.
    engine = database_type.lower() if database_type else format_type

    data, columns = generate_sample_data()
    builder = SparkSession.builder.master('local').appName(app_name)

    # Stays None until a session really exists, so cleanup never calls
    # .stop() on the builder.
    spark_session = None
    message = f'Records Inserted Successfully in {app_name}'
    try:
        if engine == 'mongodb':
            collection = database_configuration.get('output_collection')
            spark_session = builder.getOrCreate()
            sampleDF = spark_session.createDataFrame(data, columns)
            # NOTE(review): the sample data already has an "Id" column; under
            # Spark's default case-insensitive resolution this may replace it
            # rather than add a second column - confirm the intent.
            sampleDF_with_id = sampleDF.withColumn("id", monotonically_increasing_id())

            sampleDF_with_id.write.format("mongodb") \
                .option("connection.uri", url).option("database", database) \
                .option("collection", collection).mode("append").save()
        else:
            dbtable = database_configuration.get('table')
            spark_jars = database_configuration.get('spark_jars')
            spark_session = builder.config("spark.jars", spark_jars).getOrCreate()
            sampleDF = spark_session.createDataFrame(data, columns)
            # A single save mode: the earlier .mode("ignore") was overridden
            # by the .mode("append") that followed it.
            sampleDF.write \
                .format(format_type).option("driver", driver) \
                .option("url", url).option("dbtable", dbtable) \
                .option("user", user).option("password", password) \
                .mode("append").save()
    except Exception as e:
        message = f"Error inserting data: {str(e)}"
    finally:
        if spark_session is not None:
            spark_session.stop()  # stop Spark session
    # Returned after the finally block so KeyboardInterrupt / SystemExit are
    # not swallowed by a return inside it.
    return message

In [None]:
# Insert the sample rows into MySQL using the default connection settings.
mysql_configuration = get_database_configuration(database_type = 'mysql')
# database_type is left at its default (None), which selects the JDBC write
# path; the call returns a status message.
insert_data_to_database(mysql_configuration)

In [None]:
# Insert the sample rows into PostgreSQL using the default connection settings.
postgres_configuration = get_database_configuration(database_type = 'postgres')
# database_type is left at its default (None), which selects the JDBC write
# path; the call returns a status message.
insert_data_to_database(postgres_configuration)

In [None]:
# Insert the sample rows into MongoDB using the default connection settings.
mongodb_configuration = get_database_configuration(database_type = 'mongodb')
# database_type is passed explicitly so the insert takes its MongoDB branch.
insert_data_to_database(database_configuration=mongodb_configuration, database_type = 'mongodb')

In [None]:
def read_data_from_database(database_type='mysql', host=None, port=None, database=None, table=None, user=None, password=None, input_collection=None, output_collection=None):
    """Load a table or collection through Spark and return it as pandas.

    The arguments are forwarded unchanged to ``get_database_configuration``;
    anything left as ``None`` uses that function's defaults. The schema and
    the first rows are printed before conversion.

    Returns:
        pandas.DataFrame | None: The loaded data, or ``None`` when reading
        failed (the error is printed).
    """
    from pyspark.sql import SparkSession

    database_configuration = get_database_configuration(database_type=database_type, host=host, port=port, database=database, table=table, user=user, password=password, input_collection=input_collection, output_collection=output_collection)
    properties = database_configuration.get('properties')
    url = database_configuration.get('url')

    builder = SparkSession.builder.master('local').appName(f'Read data from {database_type}')
    # Register the JDBC driver jar the same way insert_data_to_database does.
    spark_jars = database_configuration.get('spark_jars')
    if spark_jars:
        builder = builder.config("spark.jars", spark_jars)
    spark_session = builder.getOrCreate()

    try:
        # Branch on the resolved configuration rather than the raw argument,
        # so 'MongoDB' and 'mongodb' behave the same.
        if database_configuration.get('format_type') == 'mongodb':
            database = database_configuration.get('database')
            collection = database_configuration.get('input_collection')
            result = spark_session.read.format("mongodb").option("connection.uri", url).option("database", database).option("collection", collection).load()
        else:
            table = database_configuration.get('table')
            if database_configuration.get('schema'):
                table = f"{database_configuration.get('schema')}.{database_configuration.get('table')}"
            result = spark_session.read.jdbc(url=url, table=table, properties=properties)

        # Kept inside the try so they run only when a DataFrame exists and so
        # failures raised here are reported like any other read error.
        result.printSchema()  # Print schema
        result.show()  # Show rows
        return result.toPandas()  # converts to pandas
    except Exception as e:
        print(f"Error reading data: {str(e)}")
        return None
    finally:
        # Cleanup only; returning from here would swallow in-flight exceptions.
        spark_session.stop()

In [None]:
# Read the default MySQL table back; every connection argument uses its default.
read_data_from_database(database_type = 'mysql')

In [None]:
# Read the default PostgreSQL table back; every connection argument uses its default.
read_data_from_database(database_type = 'postgres')

In [None]:
# Read the default MongoDB collection back; every connection argument uses its default.
read_data_from_database(database_type = 'mongodb')

# SQL EXTENSION

# POSTGRES

In [None]:
# Load the SQL magic extension and open a connection to the PostgreSQL
# "spark" database on the Docker host.
%load_ext sql
%sql postgresql://admin:12345678@host.docker.internal:5432/spark

In [None]:
%%sql query_result <<
SELECT * 
    FROM users 
LIMIT 10;

In [None]:
# query_result was bound by the preceding %%sql cell; convert it with its
# DataFrame() method and display the outcome.
data = query_result.DataFrame()
data

# MySQL

In [None]:
# Load the SQL magic extension and open a connection to the MySQL "spark"
# database on the Docker host.
%load_ext sql
%sql mysql://admin:12345678@host.docker.internal:3306/spark
# Capture the query result in a variable, then convert it with its
# DataFrame() method and display the outcome.
result = %sql SELECT * FROM users LIMIT 10;
data = result.DataFrame()
data

# DUCKDB

In [None]:
from pathlib import Path
from urllib.request import urlretrieve

# Download the penguins sample CSV only when no local copy exists yet.
if not Path("penguins.csv").is_file():
    urlretrieve("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
                "penguins.csv")

# Load the SQL magic extension, limit displayed results to 5 rows, and
# connect to DuckDB (the URL names no database file).
%load_ext sql
%config SqlMagic.displaylimit = 5
%sql duckdb://

In [None]:
%%sql
SELECT *
FROM penguins.csv
LIMIT 3

In [None]:
# Open an additional SQLite connection (the URL names no database file) and
# register it under the alias "second-db".
%sql sqlite:// --alias second-db

In [None]:
%%sql duckdb:// --save adelie
SELECT *
FROM penguins.csv
WHERE species = 'Adelie'

In [None]:
%%sql --save not_nulls --no-execute
SELECT *
FROM penguins.csv
WHERE bill_length_mm IS NOT NULL
AND bill_depth_mm IS NOT NULL


In [None]:
# Box plot of bill_length_mm and bill_depth_mm, reading from the saved
# "not_nulls" query.
%sqlplot boxplot --column bill_length_mm bill_depth_mm --table not_nulls --with not_nulls

In [None]:
# Histogram of bill_length_mm and bill_depth_mm, reading from the saved
# "not_nulls" query.
%sqlplot histogram --column bill_length_mm bill_depth_mm --table not_nulls --with not_nulls