In [1]:
from pyflink.datastream.connectors.kafka import KafkaSource, KafkaOffsetResetStrategy, KafkaOffsetsInitializer
from pyflink.common.serialization import SimpleStringSchema
from pyflink.common.watermark_strategy import WatermarkStrategy
from pyflink.datastream.stream_execution_environment import StreamExecutionEnvironment
from pyflink.table import EnvironmentSettings, TableEnvironment
from pyflink.table.table_descriptor import TableDescriptor
from pyflink.table.schema import Schema
from pyflink.table.types import DataTypes
from faker import Faker
import os

In [5]:
env_settings = EnvironmentSettings.in_streaming_mode()
table_env = TableEnvironment.create(env_settings)
CURRENT_DIR = os.getcwd()
table_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///" + CURRENT_DIR + "/jdbc/flink-sql-connector-kafka-3.1.0-1.18.jar",
)

<pyflink.common.configuration.Configuration at 0x78b53ae19eb0>

In [6]:
table_env.create_temporary_table(
    'kafka_user1',
    TableDescriptor.for_connector('kafka')
        .schema(Schema.new_builder()
                .column('birthdate', DataTypes.STRING())
                .column('blood_group', DataTypes.STRING())
                .column('job', DataTypes.STRING())
                .column('name', DataTypes.STRING())
                .column('residence', DataTypes.STRING())
                .column('sex', DataTypes.STRING())
                .column('ssn', DataTypes.STRING())
                .column('uuid', DataTypes.STRING())
                .build())
        .option('properties.bootstrap.servers', 'kafka1:19091,kafka1:19092,kafka1:19093')
        .option('topic', 'user')
        .option('scan.startup.mode', 'earliest-offset')
        .option('value.format', 'json')
        .build())

table = table_env.from_path("kafka_user1")

In [1]:
from pyflink.table import TableEnvironment, EnvironmentSettings

# Create a TableEnvironment
env_settings = EnvironmentSettings.in_streaming_mode()
t_env = TableEnvironment.create(env_settings)

# Specify connector and format jars
t_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///workspace/spark/jdbc/flink-sql-connector-kafka-3.1.0-1.18.jar"
)

<pyflink.common.configuration.Configuration at 0x73513f4547c0>

In [2]:
source_ddl = """
    CREATE TABLE temp_table(
        birthdate VARCHAR,
        blood_group VARCHAR,
        job VARCHAR,
        name VARCHAR,
        sex VARCHAR,
        ssn VARCHAR,
        uuid VARCHAR,
        proctime AS PROCTIME()
    ) WITH (
        'connector' = 'kafka',
        'topic' = 'hyunsoo',
        'properties.bootstrap.servers' = 'kafka1:19091,kafka1:19092,kafka1:19093',
        'properties.group.id' = 'G1',
        'scan.startup.mode' = 'latest-offset',
        'format' = 'json'
    )
"""

t_env.execute_sql(source_ddl)

# Retrieve the source table
source_table = t_env.from_path('temp_table')

print('\nSource Schema')
source_table.print_schema()


Source Schema


In [3]:
sql = """
    SELECT
      blood_group,
      COUNT(blood_group) AS blood_add,
      TUMBLE_END(proctime, INTERVAL '20' SECONDS) AS window_end
    FROM temp_table
    GROUP BY
      TUMBLE(proctime, INTERVAL '20' SECONDS),
      blood_group
"""
process_tbl = t_env.sql_query(sql)

print('\nProcess Sink Schema')
process_tbl.print_schema()


Process Sink Schema


In [4]:
sink_ddl = """
    CREATE TABLE sink_table(
        blood_group VARCHAR,
        blood_add BIGINT,
        window_end TIMESTAMP(3)
    ) WITH (
        'connector' = 'kafka',
        'topic' = 'sink',
        'properties.bootstrap.servers' = 'kafka1:19091,kafka1:19092,kafka1:19093',
        'format' = 'json'
    )
"""
t_env.execute_sql(sink_ddl)

# write time windowed aggregations to sink table
process_tbl.execute_insert('sink_table').wait()

t_env.execute('windowed-sink-table')

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.8/dist-packages/py4j/java_gateway.py", line 1217, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
source_ddl = """
    CREATE TABLE temp_table(
        birthdate VARCHAR,
        blood_group VARCHAR,
        job VARCHAR,
        name VARCHAR,
        sex VARCHAR,
        ssn VARCHAR,
        uuid VARCHAR
    ) WITH (
        'connector' = 'kafka',
        'topic' = 'hyunsoo',
        'properties.bootstrap.servers' = 'kafka1:19091,kafka1:19092,kafka1:19093',
        'properties.group.id' = 'G1',
        'scan.startup.mode' = 'latest-offset',
        'format' = 'json'
    )
"""

t_env.execute_sql(source_ddl)

# Retrieve the source table
source_table = t_env.from_path('temp_table')

print('\nSource Schema')
source_table.print_schema()

In [3]:
sink_ddl = """
    CREATE TABLE sink_table(
        birthdate VARCHAR,
        blood_group VARCHAR,
        job VARCHAR,
        name VARCHAR,
        sex VARCHAR,
        ssn VARCHAR,
        uuid VARCHAR
    ) WITH (
        'connector' = 'kafka',
        'topic' = 'sink',
        'properties.bootstrap.servers' = 'kafka1:19091,kafka1:19092,kafka1:19093',
        'format' = 'json'
    )
"""
t_env.execute_sql(sink_ddl)

# write time windowed aggregations to sink table
source_table.execute_insert('sink_table').wait()

t_env.execute('windowed-sink-table')

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.8/dist-packages/py4j/java_gateway.py", line 1217, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 