In [12]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking the Jupyter kernel
        
class StreamingThread(threading.Thread):
    """Thread that drives a streaming context off the calling thread.

    ``run`` starts the context and then blocks in ``awaitTermination``;
    executing that on a separate thread leaves the caller (here, the
    notebook) free to run further cells, including the one that calls
    ``stop``.
    """

    def __init__(self, ssc):
        """Store the streaming context to drive.

        ``ssc`` must provide ``start()``, ``awaitTermination()`` and
        ``stop(stopSparkContext=..., stopGraceFully=...)``.
        """
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then wait until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice, then stop the streaming context.

        The stop is requested as graceful (``stopGraceFully=True``) and the
        underlying SparkContext is left running (``stopSparkContext=False``).
        """
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [13]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType
# from pyspark.context import SparkContext
# from pyspark.sql.session import SparkSession

# sc = SparkContext.getOrCreate()
# spark = SparkSession(sc)

In [14]:
# Show the path of the Python interpreter running this kernel
# (`sys` is not imported in the visible cells — presumably imported earlier; verify)
sys.executable

'c:\\Users\\Nikos\\Desktop\\analytics_project\\Advanced_Analytics\\.venv\\Scripts\\python.exe'

In [15]:
# Display the SparkContext. NOTE(review): `sc` is not created in the visible cells —
# presumably provided by the PySpark kernel/shell or an earlier cell; confirm.
sc

In [16]:
# Display the SparkSession. NOTE(review): `spark` is not created in the visible cells —
# presumably provided by the PySpark kernel/shell or an earlier cell; confirm.
spark

In [17]:
# Module-level state read and written by `process` through globals():
# `models_loaded` records whether the model has been loaded yet,
# `my_model` holds the loaded model (None until then).
models_loaded = False
my_model = None

# Toy predict function. Normally you'd use your loaded globals()['my_model'] here
def predict(df):
    """Placeholder predictor: ignores ``df`` and always returns the same label."""
    placeholder_label = 'predicted-name-of-channel'
    return placeholder_label

# Wrap the plain Python `predict` as a Spark SQL UDF whose declared return type is string
predict_udf = udf(predict, returnType=StringType())

def process(time, rdd):
    """Handle one streaming batch: show it, add a ``pred`` column, show it again.

    Parameters
    ----------
    time : batch timestamp; only used (via ``str``) in the printed banner.
    rdd  : the batch's records, parsed with ``spark.read.json``.

    Returns ``None``. Empty batches are skipped without printing anything.
    Relies on the module-level names ``spark``, ``struct``, ``predict_udf``,
    ``models_loaded`` and ``my_model``.
    """
    # Nothing arrived during this batch interval: skip it.
    if rdd.isEmpty():
        return

    banner = "========= {} =========".format(str(time))
    print(banner)

    # Parse the batch into a data frame and display it
    batch_df = spark.read.json(rdd)
    batch_df.show()

    # Bundle every column into one struct so the UDF receives the whole row,
    # then attach its result as the "pred" column
    all_columns = struct([batch_df[name] for name in batch_df.columns])
    scored_df = batch_df.withColumn("pred", predict_udf(all_columns))
    scored_df.show()

    # A Python UDF is used above for illustration only (it does work). Normally
    # you would predict with an MLlib model that you built and saved with Spark.
    # In that case, avoid re-loading the model on every call to "process" by
    # caching it in module-level state, as below.

    # Load the model only on the first non-empty batch:
    state = globals()
    if not state['models_loaded']:
        # load in your models here
        state['my_model'] = '***'  # Replace '***' with:    [...].load('my_logistic_regression')
        state['models_loaded'] = True

    # And then predict using the loaded model:
    # df_result = globals()['my_model'].transform(df)
    # df_result.show()

In [18]:
# Create the streaming context on the existing SparkContext with a batch interval of 10
# (PySpark's batchDuration is expressed in seconds)
ssc = StreamingContext(sc, batchDuration=10)

In [19]:
# Read text from a TCP socket on localhost:8080 and pass each batch's RDD to `process`
lines = ssc.socketTextStream(hostname="localhost", port=8080)
lines.foreachRDD(process)

In [20]:
# Run the streaming context on a helper thread so this cell returns immediately;
# batch output printed by `process` then appears below as batches arrive.
ssc_t = StreamingThread(ssc)
ssc_t.start()

+-------+--------------------+--------------------+------------------+
|channel|            datetime|             message|          username|
+-------+--------------------+--------------------+------------------+
|   #pgl|2022-05-20T16:01:...|                KEKW|   ender_chocolate|
|   #pgl|2022-05-20T16:01:...|                KEKW|            nirosb|
|   #pgl|2022-05-20T16:01:...|SIUUUUUUUUUUUUUUU...|          wuzzibey|
|   #pgl|2022-05-20T16:01:...|                KEKW|            hexzor|
|   #pgl|2022-05-20T16:01:...|                 LUL|        filipland_|
|   #pgl|2022-05-20T16:01:...|                KEKW|chaczapuri_imeruli|
|   #pgl|2022-05-20T16:01:...|                KEKW|         bezeball8|
|   #pgl|2022-05-20T16:01:...|                KEKW|           joo1ius|
|   #pgl|2022-05-20T16:01:...|                 LUL|      flameboltirl|
|   #pgl|2022-05-20T16:01:...|           who that?|     coloneloscopi|
|   #pgl|2022-05-20T16:01:...|                KEKW|           mrtatt1|
|   #p

In [21]:
# Ask the helper thread to stop the streaming context
# (output from a batch may still be printed after the stop message, as seen below)
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+-------+--------------------+--------------------+--------------------+
|channel|            datetime|             message|            username|
+-------+--------------------+--------------------+--------------------+
|   #pgl|2022-05-20T16:02:...|      This crowd man|         kanesy_1872|
|   #pgl|2022-05-20T16:02:...|       goooo AAAAAAA|     nandkishorkalal|
|   #pgl|2022-05-20T16:02:...|                KEKW|            poloolpp|
|   #pgl|2022-05-20T16:02:...|     ResidentSleeper|      united_klngdom|
|   #pgl|2022-05-20T16:02:...| AHHAHHHAAHAHHHHHHHA|           jota23152|
|   #pgl|2022-05-20T16:02:...|                KEKW|           sht0rmbtw|
|   #pgl|2022-05-20T16:02:...|   TEAM SPIRIT TOP 1|       matvei2007tmb|
|   #pgl|2022-05-20T16:02:...|     ResidentSleeper|            mrdivago|
|   #pgl|2022-05-20T16:02:...|         snappi kekw|               sevix|
|   #pgl|2022-05-20T16:02:...|      FRENCH WutFace|        bobbinrobbin|

In [22]:
# Marker that the notebook ran through to this cell
print('completed')

completed
