### Install requirements

In [1]:
# !pip install textblob 
# !pip install pyspark

### Import libraries

In [2]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.linalg import Vectors
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
import json
import requests

# import warnings
# warnings.filterwarnings("ignore")

In [3]:
HOST = "localhost"        
STREAM_PORT = 9999     

sc = SparkContext.getOrCreate()
sc.setCheckpointDir("spark_checkpoint")   
ssc = StreamingContext(sc, 10)

submissions = ssc.socketTextStream(HOST, STREAM_PORT)

your 131072x1 screen size is bogus. expect trouble
23/05/02 13:44:18 WARN Utils: Your hostname, DESKTOP-CBR75GN resolves to a loopback address: 127.0.1.1; using 172.28.139.190 instead (on interface eth0)
23/05/02 13:44:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/02 13:44:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
sc.setLogLevel("ERROR")

In [5]:
def process_submission(message):

    submission = json.loads(message)

    title = submission['message']
    metadata = submission['metadata']
    author = metadata['author_name']
    date = metadata['date']
    score = metadata['score']
    num_comments = metadata['num_comments']
    upvote_ratio = metadata['upvote_ratio']
    text = metadata['text']
    subreddit_name = metadata['subreddit_name']

    title_polarity, title_subjectivity = TextBlob(title).sentiment
    text_polarity, text_subjectivity = TextBlob(text).sentiment

    return {
        'title': title,
        'text': text,
        'author': author,
        'date': date,
        'score': score,
        'num_comments': num_comments,
        'upvote_ratio': upvote_ratio,
        'text': text,
        'subreddit_name': subreddit_name,
        'subreddit_hash': hash(subreddit_name),
        'title_polarity': title_polarity,
        'title_subjectivity': title_subjectivity,
        'text_polarity': text_polarity,
        'text_subjectivity': text_subjectivity
    }


### Add processing to the pipeline

In [6]:
submissions = submissions.map(process_submission)
# submissions.pprint()
training_data = submissions.map(lambda x: Vectors.dense(
    [x['title_polarity'], x['title_subjectivity'], x['text_polarity'], x['text_subjectivity'], x["subreddit_hash"]]))


### Cluster the data

In [7]:
k = 4
model = StreamingKMeans(k, decayFactor=1.0).setRandomCenters(5, 1.0, 0)
model.trainOn(training_data)
result = model.predictOn(training_data)

### Window operation

In [8]:
pairs = result.map(lambda cluster: (cluster, 1))  
# window of size 30s, and slides by 10s (very arbitrary)
cluseter_counts = pairs.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 30, 10)
cluseter_counts.pprint()

### Visualization

In [9]:
# create viz.json if doesn't exist and set every cluster size to 0
with open("viz.json", "w") as f:
    total_data = {}
    for i in range(k):
        total_data[str(i)] = 0
    json.dump(total_data,f)

In [10]:
def update_cluster_sizes(rdd):
    if not rdd.isEmpty():
        for x in rdd.collect(): 
            with open("viz.json", "r") as f:
                total_data = json.load(f)
            total_data[str(x[0])] += x[1]
            with open("viz.json", "w") as f:
                json.dump(total_data,f)

In [11]:
# Update the cluster sizes in sliding windows
cluseter_counts.foreachRDD(lambda rdd: update_cluster_sizes(rdd))

### Start the processing

In [12]:
ssc.start()
ssc.awaitTermination()

                                                                                

-------------------------------------------
Time: 2023-05-02 13:44:30
-------------------------------------------



                                                                                

-------------------------------------------
Time: 2023-05-02 13:44:40
-------------------------------------------
(0, 66)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:44:50
-------------------------------------------
(0, 96)
(1, 47)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:45:00
-------------------------------------------
(0, 117)
(1, 69)
(2, 10)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:45:10
-------------------------------------------
(0, 63)
(1, 78)
(2, 14)
(3, 4)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:45:20
-------------------------------------------
(0, 59)
(1, 45)
(2, 21)
(3, 12)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:45:30
-------------------------------------------
(0, 66)
(1, 35)
(2, 22)
(3, 24)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:45:40
-------------------------------------------
(0, 69)
(1, 46)
(2, 28)
(3, 25)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:45:50
-------------------------------------------
(0, 59)
(1, 74)
(2, 28)
(3, 24)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:46:00
-------------------------------------------
(0, 49)
(1, 74)
(2, 26)
(3, 16)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:46:10
-------------------------------------------
(0, 50)
(1, 70)
(2, 18)
(3, 22)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:46:20
-------------------------------------------
(0, 52)
(1, 37)
(2, 17)
(3, 18)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:46:30
-------------------------------------------
(0, 49)
(1, 34)
(2, 18)
(3, 19)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:46:40
-------------------------------------------
(0, 54)
(1, 35)
(2, 29)
(3, 13)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:46:50
-------------------------------------------
(0, 51)
(1, 66)
(2, 30)
(3, 19)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:47:00
-------------------------------------------
(0, 52)
(1, 76)
(2, 35)
(3, 19)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:47:10
-------------------------------------------
(0, 44)
(1, 68)
(2, 31)
(3, 21)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:47:20
-------------------------------------------
(0, 47)
(1, 46)
(2, 25)
(3, 24)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:47:30
-------------------------------------------
(0, 49)
(1, 41)
(2, 15)
(3, 27)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:47:40
-------------------------------------------
(0, 52)
(1, 50)
(2, 11)
(3, 25)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:47:50
-------------------------------------------
(0, 59)
(1, 77)
(2, 23)
(3, 34)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:48:00
-------------------------------------------
(0, 55)
(1, 72)
(2, 25)
(3, 36)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:48:10
-------------------------------------------
(0, 53)
(1, 61)
(2, 30)
(3, 38)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:48:20
-------------------------------------------
(0, 41)
(1, 28)
(2, 21)
(3, 21)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:48:30
-------------------------------------------
(0, 41)
(1, 33)
(2, 21)
(3, 17)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:48:40
-------------------------------------------
(0, 38)
(1, 41)
(2, 20)
(3, 18)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:48:50
-------------------------------------------
(0, 35)
(1, 73)
(2, 21)
(3, 21)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:49:00
-------------------------------------------
(0, 33)
(1, 73)
(2, 24)
(3, 32)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:49:10
-------------------------------------------
(0, 30)
(1, 71)
(2, 25)
(3, 27)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:49:20
-------------------------------------------
(0, 30)
(1, 41)
(2, 32)
(3, 29)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:49:30
-------------------------------------------
(0, 33)
(1, 38)
(2, 32)
(3, 19)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:49:40
-------------------------------------------
(0, 33)
(1, 44)
(2, 32)
(3, 23)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:49:50
-------------------------------------------
(0, 36)
(1, 73)
(2, 28)
(3, 24)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:50:00
-------------------------------------------
(0, 36)
(1, 75)
(2, 27)
(3, 28)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:50:10
-------------------------------------------
(0, 41)
(1, 64)
(2, 27)
(3, 27)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:50:20
-------------------------------------------
(0, 51)
(1, 29)
(2, 33)
(3, 22)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:50:30
-------------------------------------------
(0, 58)
(1, 37)
(2, 34)
(3, 24)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:50:40
-------------------------------------------
(0, 45)
(1, 29)
(2, 24)
(3, 18)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:50:50
-------------------------------------------
(0, 49)
(1, 43)
(2, 15)
(3, 25)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:51:00
-------------------------------------------
(0, 41)
(1, 69)
(2, 14)
(3, 23)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:51:10
-------------------------------------------
(0, 52)
(1, 80)
(2, 30)
(3, 40)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:51:20
-------------------------------------------
(0, 51)
(1, 66)
(2, 42)
(3, 36)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:51:30
-------------------------------------------
(0, 56)
(1, 32)
(2, 41)
(3, 41)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:51:40
-------------------------------------------
(0, 59)
(1, 41)
(2, 43)
(3, 36)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:51:50
-------------------------------------------
(0, 56)
(1, 71)
(2, 35)
(3, 41)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:52:00
-------------------------------------------
(0, 59)
(1, 78)
(2, 42)
(3, 39)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:52:10
-------------------------------------------
(0, 67)
(1, 71)
(2, 38)
(3, 39)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:52:20
-------------------------------------------
(0, 44)
(1, 33)
(2, 29)
(3, 26)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:52:30
-------------------------------------------
(0, 35)
(1, 24)
(2, 33)
(3, 27)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:52:40
-------------------------------------------
(0, 31)
(1, 21)
(2, 35)
(3, 28)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:52:50
-------------------------------------------
(0, 44)
(1, 38)
(2, 50)
(3, 43)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:53:00
-------------------------------------------
(0, 51)
(1, 62)
(2, 39)
(3, 47)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:53:10
-------------------------------------------
(0, 50)
(1, 61)
(2, 32)
(3, 46)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:53:20
-------------------------------------------
(0, 52)
(1, 59)
(2, 41)
(3, 48)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:53:30
-------------------------------------------
(0, 47)
(1, 38)
(2, 45)
(3, 46)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:53:40
-------------------------------------------
(0, 45)
(1, 44)
(2, 42)
(3, 43)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:53:50
-------------------------------------------
(0, 34)
(1, 35)
(2, 18)
(3, 28)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:54:00
-------------------------------------------
(0, 34)
(1, 62)
(2, 18)
(3, 21)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:54:10
-------------------------------------------
(0, 32)
(1, 60)
(2, 22)
(3, 21)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:54:20
-------------------------------------------
(0, 36)
(1, 67)
(2, 33)
(3, 32)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:54:30
-------------------------------------------
(0, 39)
(1, 35)
(2, 38)
(3, 31)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:54:40
-------------------------------------------
(0, 45)
(1, 38)
(2, 35)
(3, 36)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:54:50
-------------------------------------------
(0, 53)
(1, 66)
(2, 33)
(3, 32)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:55:00
-------------------------------------------
(0, 47)
(1, 64)
(2, 22)
(3, 35)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:55:10
-------------------------------------------
(0, 44)
(1, 54)
(2, 23)
(3, 30)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:55:20
-------------------------------------------
(0, 38)
(1, 30)
(2, 24)
(3, 30)



                                                                                

-------------------------------------------
Time: 2023-05-02 13:55:30
-------------------------------------------
(0, 39)
(1, 33)
(2, 27)
(3, 36)



Traceback (most recent call last):
  File "/home/maxime/Scientific_Programming/testvenv/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/maxime/Scientific_Programming/testvenv/lib/python3.8/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

KeyboardInterrupt: 

                                                                                

In [15]:
ssc.stop(stopSparkContext=True, stopGraceFully=True)

In [None]:
# # create a label encoder object
# le = LabelEncoder()
# # fit the encoder to the pandas column
# le.fit(submissions.map(lambda x: x['subreddit_name']).collect())
# # apply the fitted encoder to the pandas column
# submissions = submissions.map(lambda x: (le.transform([x['subreddit_name']])[0], x['title_polarity'], x['title_subjectivity'], x['text_polarity'], x['text_subjectivity']))
# training_data = submissions.map(lambda x: Vectors.dense([x[1], x[2], x[3], x[4]]))


In [None]:





# def get_sentiment_label(text):
#     blob = TextBlob(text)
#     if blob.sentiment.polarity > 0:
#         return "positive"
#     elif blob.sentiment.polarity < 0:
#         return "negative"
#     else:
#         return "neutral"


In [None]:
# class RedditAPIClient:
#     def __init__(self, host, port):
#         self.host = host
#         self.port = port
#         self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

#     def connect(self):
#         self.sock.connect((self.host, self.port))

#     def disconnect(self):
#         self.sock.close()

#     def process_data(self, data):
#         # Parse the JSON data received from the server
#         message = json.loads(data)['message']
#         metadata = json.loads(data)['metadata']

#         # Perform sentiment analysis on the message using TextBlob
#         blob = TextBlob(message)
#         sentiment = blob.sentiment.polarity

#         # Return the metadata and sentiment score as a tuple
#         return (metadata, sentiment)

#     def run_spark(self):
#         # Configure Spark
#         conf = SparkConf().setAppName("Reddit Sentiment Analysis")
#         sc = SparkContext(conf=conf)

#         # Create a DStream from the socket
#         dstream = sc.socketTextStream(self.host, self.port)

#         # Process the data stream using Spark and TextBlob
#         results = dstream.map(self.process_data).filter(lambda x: x[1] != 0)

#         # Print the results
#         results.pprint()

#         # Start the streaming context
#         sc.start()
#         sc.awaitTermination()


# if __name__ == '__main__':
#     # Initialize the client
#     client = RedditAPIClient('localhost', 12345)

#     # Connect to the server
#     client.connect()

#     # Run Spark on the data stream
#     client.run_spark()

#     # Disconnect from the server
#     client.disconnect()
