In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder\
        .master("local[4]")\
        .appName("Colab_pyspark")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [3]:
sc = spark.sparkContext
sc

In [4]:
# Просмотр текущей конфигурации
from pyspark import SparkConf

SparkConf().getAll()

[('spark.ui.port', '4050'),
 ('spark.master', 'local[4]'),
 ('spark.app.name', 'Colab_pyspark'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.submitTime', '1691655330875'),
 ('spark.ui.showConsoleProgress', 'true')]

In [5]:
!ls -l -h

total 12K
drwxr-xr-x 3 root root 4.0K Aug 10 07:55 files
drwxr-xr-x 1 root root 4.0K Aug  8 13:38 sample_data
drwxr-xr-x 3 root root 4.0K Aug 10 08:02 streaming


In [6]:
pwd

'/content'

In [7]:
df = spark.read.csv("/content/streaming/people.csv", header=True, inferSchema=True, sep=';')

In [8]:
df.head(3)

[Row(person_ID,name,first,last,middle,email,phone,fax,title='3130,"Burks, Rosella ",Rosella,Burks,,BurksR@univ.edu,963.555.1253,963.777.4065,Professor '),
 Row(person_ID,name,first,last,middle,email,phone,fax,title='3297,"Avila, Damien ",Damien,Avila,,AvilaD@univ.edu,963.555.1352,963.777.7914,Professor '),
 Row(person_ID,name,first,last,middle,email,phone,fax,title='3547,"Olsen, Robin ",Robin,Olsen,,OlsenR@univ.edu,963.555.1378,963.777.9262,Assistant Professor')]

In [9]:
from pyspark.sql.types import *

schema = StructType() \
  .add("person_ID",IntegerType()) \
  .add("name",StringType()) \
  .add("first",StringType()) \
  .add("last", StringType()) \
  .add("middle", StringType()) \
  .add("email", StringType()) \
  .add("phone", StringType()) \
  .add("fax", StringType()) \
  .add("title", StringType())



In [10]:
def killAll():
  for active_stream in spark.streams.active:
        print("Stopping %s by killAll" % active_stream)
        active_stream.stop()

def console_output(df, freq):
     return df.writeStream \
         .format("console") \
         .outputMode("append") \
         .trigger(processingTime=f"{freq} seconds") \
         .options(truncate=False) \
         .start()


raw_files = spark \
     .readStream \
     .format("csv") \
     .schema(schema) \
     .options(path="/content/streaming/", header=True) \
     .load()



In [11]:
out = console_output(raw_files, 3)

In [12]:
killAll()

Stopping <pyspark.sql.streaming.query.StreamingQuery object at 0x7aa6c61d9600> by killAll
