### Install requirements

In [None]:
# !pip install textblob 
# !pip install pyspark

### Import libraries

In [None]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.mllib.linalg import Vectors
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
import json
import requests

# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# Endpoint of the socket server that the submissions are read from.
HOST = "localhost"
STREAM_PORT = 9999

# Reuse an already-running SparkContext when there is one (e.g. after
# re-running this cell) instead of failing on a second construction.
sc = SparkContext.getOrCreate()
# NOTE(review): this configures the checkpoint directory on the SparkContext
# only; DStream operations that require checkpointing look at the directory
# set through StreamingContext.checkpoint() -- confirm this is sufficient.
sc.setCheckpointDir("spark_checkpoint")

# Streaming context that cuts the input into 10-second micro-batches.
ssc = StreamingContext(sc, batchDuration=10)

# DStream of raw text lines received from HOST:STREAM_PORT.
submissions = ssc.socketTextStream(hostname=HOST, port=STREAM_PORT)

In [None]:
def process_submission(submission):
    """Parse one JSON-encoded submission and attach TextBlob sentiment scores.

    Args:
        submission: JSON string with the top-level keys ``title``, ``text``
            and ``subreddit_name`` plus a ``metadata`` object holding
            ``author``, ``date``, ``score``, ``num_comments`` and
            ``upvote_ratio``.

    Returns:
        A flat dict with the submission fields listed above and four extra
        keys: ``title_polarity``, ``title_subjectivity``, ``text_polarity``
        and ``text_subjectivity``.

    Raises:
        json.JSONDecodeError: If ``submission`` is not valid JSON.
        KeyError: If one of the expected keys is missing.
    """
    record = json.loads(submission)
    metadata = record['metadata']
    title = record['title']
    text = record['text']

    # TextBlob only accepts strings. A JSON null title/body is scored as empty
    # text instead of raising TypeError inside the Spark task; the value
    # returned to the caller is left exactly as it was received.
    title_polarity, title_subjectivity = TextBlob(title or '').sentiment
    text_polarity, text_subjectivity = TextBlob(text or '').sentiment

    # The original literal listed 'text' twice; each key now appears once, in
    # the same resulting order.
    return {
        'title': title,
        'text': text,
        'author': metadata['author'],
        'date': metadata['date'],
        'score': metadata['score'],
        'num_comments': metadata['num_comments'],
        'upvote_ratio': metadata['upvote_ratio'],
        'subreddit_name': record['subreddit_name'],
        'title_polarity': title_polarity,
        'title_subjectivity': title_subjectivity,
        'text_polarity': text_polarity,
        'text_subjectivity': text_subjectivity,
    }


### Add processing to the pipeline

In [None]:
# Turn every raw JSON line into a dict of submission fields + sentiment scores.
submissions = submissions.map(process_submission)

# Keep only the four sentiment scores, as a dense vector, for clustering.
training_data = submissions.map(
    lambda row: Vectors.dense([
        row['title_polarity'],
        row['title_subjectivity'],
        row['text_polarity'],
        row['text_subjectivity'],
    ])
)


### Cluster the data

In [None]:
# Four clusters; decayFactor=1.0 weights all batches seen so far equally.
model = StreamingKMeans(k=4, decayFactor=1.0)
# Random initial centers in 4 dimensions (one per sentiment feature), with a
# fixed seed so the starting point is reproducible.
model.setRandomCenters(dim=4, weight=1.0, seed=0)

# Update the centers on every batch and assign each vector to a cluster.
model.trainOn(training_data)
result = model.predictOn(training_data)

### Window operation

In [None]:
# Tag every predicted cluster index with a 1-based label so it can be counted.
pairs = result.map(lambda cluster: (f'cluster-{cluster+1}', 1))

# The inverse-function form of reduceByKeyAndWindow updates each window
# incrementally (adds the batch entering, subtracts the batch leaving). Spark
# only allows it when checkpointing is enabled on the StreamingContext itself;
# SparkContext.setCheckpointDir() alone does not satisfy that requirement.
ssc.checkpoint("spark_checkpoint")

# Window of size 30s that slides by 10s (arbitrary choice).
cluster_counts = pairs.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 30, 10)
# Original (misspelled) name kept as an alias for any cell that still uses it.
cluseter_counts = cluster_counts

# Printing the DStream object only shows its repr once, at definition time.
# pprint() registers an output operation, so the per-cluster counts of every
# window are printed after ssc.start().
cluster_counts.pprint()

### Start the processing

In [None]:
# Start the streaming computation defined by the cells above.
ssc.start()
# Block this cell until the streaming context is stopped or fails.
ssc.awaitTermination()

In [None]:
# # create a label encoder object
# le = LabelEncoder()
# # fit the encoder to the pandas column
# le.fit(submissions.map(lambda x: x['subreddit_name']).collect())
# # apply the fitted encoder to the pandas column
# submissions = submissions.map(lambda x: (le.transform([x['subreddit_name']])[0], x['title_polarity'], x['title_subjectivity'], x['text_polarity'], x['text_subjectivity']))
# training_data = submissions.map(lambda x: Vectors.dense([x[1], x[2], x[3], x[4]]))


In [None]:





# def get_sentiment_label(text):
#     blob = TextBlob(text)
#     if blob.sentiment.polarity > 0:
#         return "positive"
#     elif blob.sentiment.polarity < 0:
#         return "negative"
#     else:
#         return "neutral"


In [None]:
# class RedditAPIClient:
#     def __init__(self, host, port):
#         self.host = host
#         self.port = port
#         self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

#     def connect(self):
#         self.sock.connect((self.host, self.port))

#     def disconnect(self):
#         self.sock.close()

#     def process_data(self, data):
#         # Parse the JSON data received from the server
#         message = json.loads(data)['message']
#         metadata = json.loads(data)['metadata']

#         # Perform sentiment analysis on the message using TextBlob
#         blob = TextBlob(message)
#         sentiment = blob.sentiment.polarity

#         # Return the metadata and sentiment score as a tuple
#         return (metadata, sentiment)

#     def run_spark(self):
#         # Configure Spark
#         conf = SparkConf().setAppName("Reddit Sentiment Analysis")
#         sc = SparkContext(conf=conf)

#         # Create a DStream from the socket
#         dstream = sc.socketTextStream(self.host, self.port)

#         # Process the data stream using Spark and TextBlob
#         results = dstream.map(self.process_data).filter(lambda x: x[1] != 0)

#         # Print the results
#         results.pprint()

#         # Start the streaming context
#         sc.start()
#         sc.awaitTermination()


# if __name__ == '__main__':
#     # Initialize the client
#     client = RedditAPIClient('localhost', 12345)

#     # Connect to the server
#     client.connect()

#     # Run Spark on the data stream
#     client.run_spark()

#     # Disconnect from the server
#     client.disconnect()
