In [1]:
import os
import sys
# Environment for PySpark, set ahead of the pyspark imports that follow:
#   - PYSPARK_SUBMIT_ARGS names the Kafka streaming and Kafka SQL connector
#     packages (the 3.2.0 artifacts with the _2.12 suffix) and ends with the
#     `pyspark-shell` token.
#   - PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON both point at the interpreter
#     that is running this notebook.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': (
        '--packages '
        'org.apache.spark:spark-streaming-kafka-0-10_2.12:3.2.0,'
        'org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 '
        'pyspark-shell'
    ),
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

import json
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, udf
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
import re
import pandas as pd
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import PipelineModel


In [2]:
# Build the notebook-wide SparkSession (or pick up one that already exists).
# It uses the local master with every available core ("local[*]") and asks
# for 4 executor instances with 16g each for executor and driver memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Sentiment Analysis in Spark")
    .config("spark.executor.instances", "4")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .getOrCreate()
)

In [3]:
#Import cleaning tools
from Clean_Func import convert_to_bytes, bytes_to_json, clean_tweet_content

In [4]:
# Preprocessing, feature extraction and model training.
#
# Pipeline stages, in order:
#   1. Tokenizer        - splits `tweet_content` into tokens (`Tweet_vect_`)
#   2. StopWordsRemover - filters those tokens into `Filtered_tweet`
#   3. HashingTF        - hashes the filtered tokens into the `features` vector
#   4. LogisticRegression on `features`, with the numeric `sentiment` as label
tokenizer = Tokenizer(inputCol="tweet_content", outputCol="Tweet_vect_")
remover = StopWordsRemover(inputCol="Tweet_vect_", outputCol="Filtered_tweet")
hashTF = HashingTF(inputCol="Filtered_tweet", outputCol="features")
lr = LogisticRegression(labelCol="sentiment", featuresCol="features", maxIter=10, regParam=0.01)

# Define Pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashTF, lr])

# Load the raw file. Every line is passed through the project helper
# `convert_to_bytes`, serialised to UTF-8 encoded JSON, and later decoded
# again with `bytes_to_json`. Undecodable characters are dropped
# (errors='ignore').
with open('twitter_training.csv', 'r', errors='ignore') as file:
    data__ = [json.dumps(convert_to_bytes(line)).encode('utf-8') for line in file]

js_data__ = [bytes_to_json(dt) for dt in data__]
df_senti = pd.DataFrame(js_data__)

# Clean every tweet with one column-level operation. The previous per-row
# `df_senti['tweet_content'][i] = ...` was chained assignment, which pandas
# warns about and which does not update the frame under Copy-on-Write.
df_senti['tweet_content'] = df_senti['tweet_content'].map(clean_tweet_content)

# Encode the sentiment labels as class ids. Labels that are not in the table
# are passed through unchanged, exactly as the former if/elif chain did.
SENTIMENT_IDS = {'Positive': 0, 'Neutral': 1, 'Negative': 2, 'Irrelevant': 3}
df_senti['sentiment'] = df_senti['sentiment'].map(
    lambda label: SENTIMENT_IDS.get(label, label)
)

data_u = df_senti[['tweet_content', 'sentiment']]
spark_df = spark.createDataFrame(data_u)

# Hold out 20% of the rows BEFORE fitting, so the model never sees the rows
# it is later scored on. The weights and seed are the ones the evaluation
# step of this notebook uses for its own split of `spark_df`.
train_data, test_data = spark_df.randomSplit([0.8, 0.2], seed=442)

# Train model on the training portion only
pipeline_model = pipeline.fit(train_data)

# Save the model (left disabled, as before). A plain `save` refuses to write
# to a path that already exists; to replace a stored model use
# pipeline_model.write().overwrite().save('pipeline_v1').
#pipeline_model.save('pipeline_v1')

In [5]:
# Score the persisted pipeline on a 20% hold-out of `spark_df`.
# NOTE(review): the figure printed below is only a genuine held-out accuracy
# if the model stored under 'pipeline_v1' was not fitted on these test rows -
# confirm how that saved model was trained.

# Accuracy metric comparing the `prediction` column against `sentiment`
evaluator = MulticlassClassificationEvaluator(
    labelCol="sentiment",
    predictionCol="prediction",
    metricName="accuracy",
)

# Restore the fitted pipeline from disk
loaded_model = PipelineModel.load('pipeline_v1')

# 80/20 split with a fixed seed so the partition is repeatable
train_data, test_data = spark_df.randomSplit([0.8, 0.2], seed=442)

# Predict on the held-out rows and compute the metric
predictions = loaded_model.transform(test_data)
accuracy = evaluator.evaluate(predictions)

# Report the accuracy as a percentage with two decimals
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 89.04%
