In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [None]:
# Load Data with Spark

DATA_PATH = "training.1600000.processed.noemoticon.csv"
# Column layout published for the stock Sentiment140 download, which ships
# without a header row.
SENTIMENT140_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]

spark = (
    SparkSession.builder
    .appName("Twitter Sentiment Analysis")
    .getOrCreate()
)

df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)

# The preprocessing cell needs a `text` column and the classifier a `label`
# column. If the header-based read produced no `text` column and the file has
# the six-column Sentiment140 shape, the first tweet was consumed as a header:
# re-read it headerless and name the columns explicitly. A file that does carry
# a `text` header is left exactly as the read above produced it.
if "text" not in df.columns and len(df.columns) == len(SENTIMENT140_COLUMNS):
    df = (
        spark.read.csv(DATA_PATH, header=False, inferSchema=True)
        .toDF(*SENTIMENT140_COLUMNS)
    )
    # Dataset description codes polarity as 0 = negative, 4 = positive;
    # map it to a 0.0 / 1.0 label for binary classification.
    df = df.withColumn("label", (col("target") == 4).cast("double"))

In [None]:
# Data Preprocessing with Spark:

# Raw strings keep the regex backslashes (\w, \S, \s) out of Python's own
# escape processing, which otherwise warns about invalid escape sequences.
MENTION_PATTERN = r"@\w+"
URL_PATTERN = r"http\S+"
NON_LETTER_PATTERN = r"[^a-zA-Z\s]"

# Order matters: mentions and URLs are removed while their punctuation is still
# present; the last pass then drops every character that is not a letter or
# whitespace.
df_clean = (
    df.withColumn("text", regexp_replace(col("text"), MENTION_PATTERN, ""))
    .withColumn("text", regexp_replace(col("text"), URL_PATTERN, ""))
    .withColumn("text", regexp_replace(col("text"), NON_LETTER_PATTERN, ""))
)

In [None]:
# Feature Engineering and Model Training

# Hold out a seeded 20% of the cleaned rows as `test_data` so the prediction
# cell scores tweets the model was not fitted on.
SPLIT_WEIGHTS = [0.8, 0.2]
SPLIT_SEED = 42
train_data, test_data = df_clean.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

# Stage chain: text -> words -> filtered (stop words removed)
#              -> features (term counts) -> idf_features (IDF-weighted)
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
vectorizer = CountVectorizer(inputCol="filtered", outputCol="features")
idf = IDF(inputCol="features", outputCol="idf_features")
# NOTE(review): the classifier reads a `label` column from its input;
# confirm the loaded data provides one.
classifier = LogisticRegression(featuresCol="idf_features", labelCol="label")

pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf, classifier])
# Fit on the training split only; `test_data` stays untouched for scoring.
model = pipeline.fit(train_data)

In [None]:
# Prediction and Evaluation

# Run the fitted pipeline over `test_data` (expected to be a DataFrame created
# by an earlier cell) and print each tweet's text beside its predicted class.
# NOTE(review): no evaluation metric is computed here despite the heading.
preview_columns = ["text", "prediction"]
predictions = model.transform(test_data)
predictions.select(preview_columns).show()