In [1]:
from pyflink.table import StreamTableEnvironment, EnvironmentSettings, DataTypes, CsvTableSink, ChangelogMode, Schema, TableDescriptor, WriteMode

In [2]:
from pyflink.table.window import Tumble, Slide

In [3]:
from pyflink.table.expressions import col as col_fk

In [4]:
from pyflink.table.expressions import lit as lit_fk

In [5]:
from pyflink.datastream import StreamExecutionEnvironment

In [6]:
# Obtain the DataStream execution environment; the table environment created
# later in this notebook is built on top of it.
stream_env = StreamExecutionEnvironment.get_execution_environment()

In [7]:
# Build the environment settings with the planner switched to streaming mode.
env_set = (
    EnvironmentSettings.new_instance()
    .in_streaming_mode()
    .build()
)

In [8]:
# Create the table environment on top of the DataStream environment, applying
# the streaming-mode settings held in env_set.
stream_tbl_env = StreamTableEnvironment.create(
    stream_execution_environment=stream_env,
    environment_settings=env_set,
)

In [9]:
# Set the "parallelism.default" option to "1" on the table environment's
# configuration. The call is the cell's last expression, so the returned
# Configuration object is echoed as the cell output.
(
    stream_tbl_env
    .get_config()
    .get_configuration()
    .set_string("parallelism.default", "1")
)

<pyflink.common.configuration.Configuration at 0x1a51409e820>

In [10]:
# Set the DataStream environment's parallelism to 1 as well; the returned
# environment object is echoed as the cell output.
stream_env.set_parallelism(1)

<pyflink.datastream.stream_execution_environment.StreamExecutionEnvironment at 0x1a57132eb50>

In [11]:
# Schema for the sales table: five physical columns plus `proct`, a computed
# column defined by the PROCTIME() expression. `proct` is the column the
# sliding window is later defined on.
kafka_tbl_sch = (
    Schema.new_builder()
    .column('seller_id', DataTypes.STRING())
    .column('product', DataTypes.STRING())
    .column('quantity', DataTypes.INT())
    .column('product_price', DataTypes.DOUBLE())
    .column('sale_ts', DataTypes.BIGINT())
    .column_by_expression('proct', 'PROCTIME()')
    .build()
)

In [12]:
# Descriptor for a Kafka-backed table: topic 'salesitems' on localhost:9092,
# started from the earliest offset, with JSON-formatted records.
# 'json.ignore-parse-errors' is set to 'true' on the format.
kafka_tbl_dis = (
    TableDescriptor.for_connector('kafka')
    .schema(schema=kafka_tbl_sch)
    .option('topic', 'salesitems')
    .option('properties.group.id', 'source-demo-1')
    .option('properties.bootstrap.servers', 'localhost:9092')
    .option('scan.startup.mode', 'earliest-offset')
    .format('json')
    .option('json.ignore-parse-errors', 'true')
    .build()
)

In [13]:
# Register the Kafka descriptor in the table environment under the name
# 'kafka_tbl_source' so it can be looked up by that path.
stream_tbl_env.create_table('kafka_tbl_source', kafka_tbl_dis)

In [14]:
# Look up the registered Kafka table by its path and bind it to a Table handle.
# `from_path` is used instead of `scan`, which PyFlink marks as deprecated.
kafka_tbl_source = stream_tbl_env.from_path('kafka_tbl_source')

In [15]:
# Aggregate per seller over a sliding window on the `proct` column: the window
# is 30 seconds long and advances every 10 seconds. Each result row carries
# the seller id, the window start and end, and the sum of
# quantity * product_price.
# Note: this rebinds kafka_tbl_source from the raw table to the aggregated one.
kafka_tbl_source = (
    kafka_tbl_source
    .window(
        Slide.over(lit_fk(30).seconds)
        .every(lit_fk(10).seconds)
        .on(col_fk('proct'))
        .alias('proct_w')
    )
    .group_by(col_fk('proct_w'), col_fk('seller_id'))
    .select(
        col_fk('seller_id'),
        col_fk('proct_w').start.alias('proct_start'),
        col_fk('proct_w').end.alias('proct_end'),
        (col_fk('quantity') * col_fk('product_price')).sum.alias('proct_sales'),
    )
)

In [16]:
# Convert the aggregated table into a changelog DataStream, passing
# ChangelogMode.all() as the changelog mode.
kafka_ds_source = stream_tbl_env.to_changelog_stream(
    table=kafka_tbl_source,
    changelog_mode=ChangelogMode.all(),
)

In [17]:
# Turn the changelog DataStream back into a Table, rebinding kafka_tbl_source
# once more; the CSV sink below derives its field names and types from it.
kafka_tbl_source = stream_tbl_env.from_data_stream(kafka_ds_source)

In [18]:
# CSV sink whose field names and types are taken from the current
# kafka_tbl_source schema. It targets ./WindowCSV/sum_prices_1.csv with
# num_files=1 and the OVERWRITE write mode.
csv_tbl_sink = CsvTableSink(
    field_names=kafka_tbl_source.get_schema().get_field_names(),
    field_types=kafka_tbl_source.get_schema().get_field_data_types(),
    path='./WindowCSV/sum_prices_1.csv',
    num_files=1,
    write_mode=WriteMode.OVERWRITE,
)

In [19]:
# Register the CSV sink under the name 'csv_tbl_sink' so that execute_insert
# can target it by name.
stream_tbl_env.register_table_sink('csv_tbl_sink',csv_tbl_sink)

In [20]:
# Insert the table into the registered 'csv_tbl_sink'; the resulting
# TableResult is echoed as the cell output.
# NOTE(review): the returned TableResult is not waited on here — confirm that
# is intended for this streaming job.
kafka_tbl_source.execute_insert('csv_tbl_sink')

<pyflink.table.table_result.TableResult at 0x1a5141c0610>