# ITABLES

In [None]:
# Enable itables for this notebook session; all_interactive=True is requested
# so the interactive rendering applies without wrapping each DataFrame by hand.
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

# PYSPARK

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by this demo cell
spark = SparkSession.builder.appName("JorgeCardonaSpark").getOrCreate()

# Sample rows: every name is paired with the age at the same position
names = ['Nathalie', 'Ana', 'Diana', 'Lucia', 'Tatiana', 'Angela', 'Cecilia', 'Alice', 'Kristin', 'Carolina', 'Lina', 'Marcela', 'Maria']
ages = [0, 3, 7, 10, 13, 17, 25, 31, 35, 37, 39, 40, 42]
data = list(zip(names, ages))

# Build the two-column DataFrame, display it, then release the session
df = spark.createDataFrame(data, ["Name", "Age"])
df.show()
spark.stop()

# PANDAS

In [None]:
import pandas as pd

# Column values are declared first, then assembled into the frame's source dict
names = ["Nathalie", "Ana", "Diana", "Lucia", "Tatiana", "Angela", "Cecilia", "Alice", "Kristin", "Carolina", "Lina", "Marcela", "Maria"]
ages = [0, 3, 7, 10, 13, 17, 25, 31, 35, 37, 39, 40, 42]
data = {'Name': names, 'Age': ages}

# The bare expression on the last line makes the notebook render the frame
df = pd.DataFrame(data)
df

# APACHE BEAM

In [None]:
import apache_beam as beam

def regular_case_function(element):
    """Return ``element`` converted to lowercase through its ``lower()`` method."""
    lowered = element.lower()
    return lowered

def to_uppercase_function(element):
    """Return ``element`` converted to uppercase through its ``upper()`` method."""
    shouted = element.upper()
    return shouted

def calculate_length_function(element):
    """Return the number of items in ``element`` as reported by ``len``."""
    size = len(element)
    return size

def calculate_square_function(element):
    """Return ``element`` raised to the power of two."""
    return pow(element, 2)

# Build the pipeline inside a `with` block and print the output of every transform
with beam.Pipeline() as pipeline:
    # Input: the list of names to be processed
    names_list = ["Nathalie", "Ana", "Diana", "Lucia", "Tatiana", "Angela", "Cecilia", "Alice", "Kristin", "Carolina", "Lina", "Marcela", "Maria"]

    # Create a PCollection from the in-memory list
    data = pipeline | beam.Create(names_list)

    # Three independent branches read from `data`; `square_data` is chained on `length_data`
    regular_case_data = data | beam.Map(regular_case_function) # Lowercase each name
    uppercase_data = data | beam.Map(to_uppercase_function) # Uppercase each name
    length_data = data | beam.Map(calculate_length_function) # Length of each name
    square_data = length_data | beam.Map(calculate_square_function) # Square of each length

    # Print every branch; each print step carries an explicit label ("Show_...")
    length_data | "Show_Length" >> beam.Map(print) # Print length results
    regular_case_data | "Show_Lowercase" >> beam.Map(print) # Print lowercase results
    uppercase_data | "Show_Uppercase" >> beam.Map(print) # Print uppercase results
    square_data | "Show_Square" >> beam.Map(print) # Print square results
    # Merge the four PCollections into one and print its elements as well,
    # so every value is printed a second time by "Show_All"
    combined_data = (length_data, regular_case_data, uppercase_data, square_data) | beam.Flatten()
    combined_data | "Show_All" >> beam.Map(print)

# FAKER

In [None]:
from faker import Faker
# Instantiate a Faker generator, ask it for one name and print it
fake = Faker()
name = fake.name()
print(name)

# PANEL

In [None]:
import panel as pn

def model(n=5):
    """Return a string made of the star emoji repeated ``n`` times (five by default)."""
    star = "⭐"
    return star * n

# Initialise Panel before any widget is created
pn.extension()

# Integer slider limited to 1..5, starting at 5
slider = pn.widgets.IntSlider(value=5, start=1, end=5)

# Bind the slider to `model`'s `n` argument
interactive_model = pn.bind(model, n=slider)

# Stack the slider above the bound output
layout = pn.Column(slider, interactive_model)

# Serve the layout on port 5006 (show=True is passed); the returned object is kept in `app`
app = pn.serve(layout, port=5006, show=True)

# `app.stop()` is called from the next cell when the demo is finished

In [None]:
# Stop the Panel app started by `pn.serve` in the previous cell
app.stop()

# SEABORN

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Fetch the "tips" example dataset through Seaborn
tips = sns.load_dataset("tips")

# Draw the scatter plot on an explicit Figure/Axes pair
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(x='total_bill', y='tip', data=tips, alpha=0.7)
ax.set_title('Scatter Plot of Total Bill vs Tip')
ax.set_xlabel('Total Bill')
ax.set_ylabel('Tip')
plt.show()

# BOKEH

In [None]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
import seaborn as sns

# Load the "tips" dataset from Seaborn
tips = sns.load_dataset("tips")

# Route Bokeh output to the notebook instead of a standalone HTML file
output_notebook()

# Create the figure with an explicit size
p = figure(
    title="Scatter Plot of Total Bill vs Tip",
    x_axis_label='Total Bill',
    y_axis_label='Tip',
    width=900, # adjust as needed
    height=500 # adjust as needed
)

# Add the data to the plot.
# `scatter` is used instead of `circle`: calling `circle` with a `size` value is
# deprecated since Bokeh 3.4, and `scatter` draws circle markers by default.
p.scatter(x='total_bill', y='tip', source=tips, size=8, color="navy", alpha=0.5)

# Show the plot in the notebook
show(p)

# DIAGRAMS

In [None]:
from diagrams import Diagram
from diagrams.aws.compute import EC2
from diagrams.aws.database import RDS
from diagrams.aws.network import ELB

# Load balancer fanning out to five workers that all write to one database
with Diagram("Grouped Workers", show=False, direction="TB"):
    # Nodes are created in the same order as before: balancer, workers, database
    load_balancer = ELB("lb")
    workers = [EC2(f"worker{index}") for index in range(1, 6)]
    load_balancer >> workers >> RDS("events")

# CODE FOR TESTING CONNECTION TO DATABASES

In [None]:
def test_mongo_connection(host, port, database, collection, user=None, password=None):
    """
    host = "host.docker.internal"  # Replace with the IP address or hostname of your MongoDB server
    port = 27017
    database = "spark"
    collection = "users"
    user = "admin"  # Username (optional, if MongoDB is configured with authentication)
    password = "12345678"  # Password (optional, if MongoDB is configured with authentication)
    # If you want to use authentication, you need to provide credentials
    # mongodb://admin:12345678@localhost:27017
    # test the connection
    test_mongo_connection(host, port, database, collection)
    test_mongo_connection(host, port, database, collection, user, password)
    """
    from pymongo import MongoClient
    try:
        client = MongoClient(host, port, username=user, password=password)
        if not user or not password:
            client = MongoClient(host, port)
        db = client[database]
        # You can use any query here; for example, count_documents({})
        collection_loaded = db[collection]
        result = collection_loaded.find_one()
        if result:
            print("Connection successful. MongoDB server is accessible.", result)
        else:
            print("Connection successful, but no data was found in the database.")
        client.close()
    except Exception as e:
        print(f"Error connecting to the database: {e}")
                
def test_postgres_connection(host, port, database, user, password):
    """
    host = "host.docker.internal"  # Replace with the IP address or hostname of your PostgreSQL server
    port = 5432
    database = "spark"
    user = "admin"
    password = "12345678"
    # test the connection
    test_postgres_connection(host, port, database, user, password)
    """
    import psycopg2
    try:
        connection = psycopg2.connect(host=host, port=port, database=database, user=user, password=password)
        connection.close()
        print("Connection successful. PostgreSQL server is accessible.")
    except Exception as e:
        print(f"Error connecting to the database: {e}")
                
def test_mysql_connection(host, port, database, user, password):
    """
    host = "host.docker.internal"  # Replace with the IP address or hostname of your MySQL server
    port = 3306  # Default port for MySQL
    database = "spark"  # Name of the database you want to connect to
    user = "admin"  # Username
    password = "12345678"  # Password
    # test the connection
    test_mysql_connection(host, port, database, user, password)
    """    
    import mysql.connector
    try:
        connection = mysql.connector.connect(host=host, port=port, database=database, user=user, password=password)
        connection.close()
        print("Connection successful. MySQL server is accessible.")
    except Exception as e:
        print(f"Error connecting to the database: {e}")

# DATABASES CONFIGURATION USING SPARK SESSION

In [None]:
def get_database_configuration(database_type = 'mysql', host = None, port = None, database = None, table = None, user = None, password = None, input_collection = None, output_collection = None):
    """
    Build the connection settings dictionary for one supported database.

    ``database_type`` is matched case-insensitively against 'mongodb',
    'postgres' and 'mysql'. Every other argument is an optional override: when
    it is None (or otherwise falsy) the local-development default is used. The
    connection URL is always assembled from the resolved values, so it stays
    consistent with the 'host', 'port', 'database', 'user' and 'password' entries.

    An unknown ``database_type`` returns the default MySQL configuration and the
    overrides are ignored.

    Args:
        database_type: 'mongodb', 'postgres' or 'mysql'.
        host: Server hostname or IP address.
        port: Server port.
        database: Database name.
        table: Table name (JDBC databases only).
        user: Username.
        password: Password.
        input_collection: Collection to read from (MongoDB only).
        output_collection: Collection to write to (MongoDB only).

    Returns:
        dict: Settings for the selected database. JDBC configurations also carry
        'table', 'spark_jars' and a 'properties' sub-dictionary; the MongoDB
        configuration carries 'input_collection' and 'output_collection'.
    """
    from urllib.parse import quote_plus

    kind = database_type.lower()
    if kind not in ('mongodb', 'postgres', 'mysql'):
        # Unknown types fall back to the stock MySQL settings without overrides
        kind = 'mysql'
        host = port = database = table = user = password = None

    # NOTE(review): the fallback credentials are hard-coded for the local demo
    # environment; prefer environment variables or a secrets store elsewhere.
    host = host or 'host.docker.internal'
    user = user or 'admin'
    password = password or '12345678'
    database = database or 'spark'

    if kind == 'mongodb':
        port = port or 27017
        # Credentials are percent-encoded so reserved characters keep the URI valid
        credentials = f"{quote_plus(str(user))}:{quote_plus(str(password))}"
        return {
            'app_name': 'MongoDB_Connector',
            'format_type': 'mongodb',
            'host': host,
            'port': port,
            'user': user,
            'password': password,
            'database': database,
            'input_collection': input_collection or 'users',
            'output_collection': output_collection or 'users',
            'driver': 'com.mongodb.spark.sql.DefaultSource',
            'url': f"mongodb://{credentials}@{host}:{port}"
        }

    table = table or 'users'

    if kind == 'postgres':
        port = port or 5432
        return {
            'app_name': 'PostgreSQL_Connector',
            'format_type': 'jdbc',
            'host': host,
            'port': port,
            'user': user,
            'password': password,
            'database': database,
            'table': table,
            'schema': 'public',
            'spark_jars': '/usr/local/spark/jars/postgresql-42.7.1.jar',
            'driver': 'org.postgresql.Driver',
            'url': f"jdbc:postgresql://{host}:{port}/{database}",
            'properties': {
                'user': user,
                'password': password,
                'driver': 'org.postgresql.Driver'
            }
        }

    port = port or 3306
    return {
        'app_name': 'MySQL_Connector',
        'format_type': 'jdbc',
        'host': host,
        'port': port,
        'user': user,
        'password': password,
        'database': database,
        'table': table,
        'spark_jars': '/usr/local/spark/jars/mysql-connector-j-8.2.0.jar',
        'driver': 'com.mysql.cj.jdbc.Driver',
        'url': f"jdbc:mysql://{host}:{port}/{database}",
        'properties': {
            'user': user,
            'password': password,
            'driver': 'com.mysql.cj.jdbc.Driver'
        }
    }

# INSERT DATA - TEST DATABASES

In [None]:
def generate_sample_data():
    """
    Return the demo rows and their column names.

    Returns:
        tuple: ``(data, columns)`` where ``data`` is a list of ``(Id, Name)``
        tuples numbered from 1 and ``columns`` is ``["Id", "Name"]``.
    """
    names = ("Ana", "Cecilia", "Nathalie", "Diana", "Gabriela", "Angela", "Tatiana", "Lucia", "Maria")
    rows = list(enumerate(names, start=1))
    return rows, ["Id", "Name"]

def insert_data_to_database(database_configuration, database_type=None):
    """
    Write the sample rows from ``generate_sample_data`` to a database with Spark.

    The MongoDB writer is used when ``database_type`` is 'mongodb', or when
    ``database_type`` is omitted and the configuration's 'format_type' is
    'mongodb'. Every other case goes through the JDBC writer in append mode.

    Args:
        database_configuration: Settings dictionary as produced by
            ``get_database_configuration``.
        database_type: Optional explicit database type; only 'mongodb' selects
            the MongoDB branch.

    Returns:
        str: A success message, or "Error inserting data: ..." when the write
        (or the Spark session start-up) raised an exception.
    """
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import monotonically_increasing_id

    app_name = database_configuration.get('app_name')
    format_type = database_configuration.get('format_type')
    database = database_configuration.get('database')
    user = database_configuration.get('user')
    password = database_configuration.get('password')
    driver = database_configuration.get('driver')
    url = database_configuration.get('url')

    # Fall back to the configuration itself when the caller does not name the type
    is_mongodb = database_type == 'mongodb' or (database_type is None and format_type == 'mongodb')

    builder = SparkSession.builder.master('local').appName(app_name)
    data, columns = generate_sample_data()

    # Stays None until a session exists, so the cleanup never calls stop() on the builder
    spark_session = None
    try:
        if is_mongodb:
            collection = database_configuration.get('output_collection')
            spark_session = builder.getOrCreate()
            sampleDF = spark_session.createDataFrame(data, columns)
            # NOTE(review): the sample data already has an "Id" column; with Spark's
            # default case-insensitive column resolution this call may replace it
            # rather than add a second column - confirm the intent.
            sampleDF_with_id = sampleDF.withColumn("id", monotonically_increasing_id())

            sampleDF_with_id.write.format("mongodb") \
                .option("connection.uri", url).option("database", database) \
                .option("collection", collection).mode("append").save()
        else:
            dbtable = database_configuration.get('table')
            spark_jars = database_configuration.get('spark_jars')
            spark_session = builder.config("spark.jars", spark_jars).getOrCreate()
            sampleDF = spark_session.createDataFrame(data, columns)
            # Only the last mode() call takes effect, so "append" is set directly
            sampleDF.write \
                .format(format_type).option("driver", driver) \
                .option("url", url).option("dbtable", dbtable) \
                .option("user", user).option("password", password) \
                .mode("append").save()
        message = f'Records Inserted Successfully in {app_name}'
    except Exception as e:
        message = f"Error inserting data: {str(e)}"
    finally:
        if spark_session is not None:
            spark_session.stop() # stop Spark session
    # Returning after the finally block keeps non-Exception errors from being swallowed
    return message

In [None]:
# Insert the sample rows into MySQL using the default configuration
mysql_configuration = get_database_configuration(database_type = 'mysql')
insert_data_to_database(mysql_configuration)

In [None]:
# Insert the sample rows into PostgreSQL using the default configuration
postgres_configuration = get_database_configuration(database_type = 'postgres')
insert_data_to_database(postgres_configuration)

In [None]:
# Insert the sample rows into MongoDB; database_type is passed explicitly so
# insert_data_to_database takes its MongoDB branch
mongodb_configuration = get_database_configuration(database_type = 'mongodb')
insert_data_to_database(database_configuration=mongodb_configuration, database_type = 'mongodb')

# READ DATA - TEST DATABASES

In [None]:
def read_data_from_database(database_type='mysql', host=None, port=None, database=None, table=None, user=None, password=None, input_collection=None, output_collection=None):
    """
    Read a table or collection with Spark and return it as a pandas DataFrame.

    The settings come from ``get_database_configuration``; all arguments are
    forwarded to it unchanged. The schema and rows are printed before the
    conversion to pandas.

    Args:
        database_type: 'mongodb', 'postgres' or 'mysql'.
        host: Server hostname or IP address override.
        port: Server port override.
        database: Database name override.
        table: Table name override (JDBC databases).
        user: Username override.
        password: Password override.
        input_collection: Collection to read from (MongoDB).
        output_collection: Forwarded to the configuration; not used for reading.

    Returns:
        pandas.DataFrame with the rows read, or None when reading failed (the
        error is printed).
    """
    database_configuration = get_database_configuration(database_type=database_type, host=host, port=port, database=database, table=table, user=user, password=password, input_collection=input_collection, output_collection=output_collection)
    from pyspark.sql import SparkSession
    spark_session = SparkSession.builder.master('local').appName(f'Read data from {database_type}').getOrCreate()
    properties = database_configuration.get('properties')
    url = database_configuration.get('url')
    df = None
    try:
        # Branch on the resolved configuration so the choice matches the settings in use
        if database_configuration.get('format_type') == 'mongodb':
            database = database_configuration.get('database')
            collection = database_configuration.get('input_collection')
            result = spark_session.read.format("mongodb").option("connection.uri", url).option("database", database).option("collection", collection).load()
        else:
            table = database_configuration.get('table')
            if database_configuration.get('schema'):
                # Qualify the table with its schema when the configuration defines one
                table = f"{database_configuration.get('schema')}.{table}"
            result = spark_session.read.jdbc(url=url, table=table, properties=properties)
        result.printSchema()  # Print schema
        result.show()  # Show rows
        df = result.toPandas()  # Convert to pandas before the session is stopped
    except Exception as e:
        # df stays None, so a failed read no longer dereferences a missing result
        print(f"Error reading data: {str(e)}")
    finally:
        spark_session.stop()
    return df

In [None]:
# Read the MySQL table with the default configuration; the returned value is the cell output
read_data_from_database(database_type = 'mysql')

In [None]:
# Read the PostgreSQL table with the default configuration; the returned value is the cell output
read_data_from_database(database_type = 'postgres')

In [None]:
# Read the MongoDB collection with the default configuration; the returned value is the cell output
read_data_from_database(database_type = 'mongodb')