In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col, when
import nltk
from nltk.corpus import stopwords

In [2]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("Read")
    .getOrCreate()
)

# Explicit schema for the four CSV columns; every field is declared nullable.
tweet_fields = [
    ("Tweet ID", IntegerType()),
    ("Entity", StringType()),
    ("Sentiment", StringType()),
    ("Tweet content", StringType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in tweet_fields]
)

# NOTE(review): header=True makes the reader treat the first physical line of the
# file as column names and discard it. If this CSV has no header row, the first
# tweet is silently dropped - confirm against the raw file.
df = spark.read.csv("twitter_training.csv", schema=schema, header=True)

# Preview the first rows to sanity-check the parse.
df.show()


+--------+-----------+---------+--------------------+
|Tweet ID|     Entity|Sentiment|       Tweet content|
+--------+-----------+---------+--------------------+
|    2401|Borderlands| Positive|I am coming to th...|
|    2401|Borderlands| Positive|im getting on bor...|
|    2401|Borderlands| Positive|im coming on bord...|
|    2401|Borderlands| Positive|im getting on bor...|
|    2401|Borderlands| Positive|im getting into b...|
|    2402|Borderlands| Positive|So I spent a few ...|
|    2402|Borderlands| Positive|So I spent a coup...|
|    2402|Borderlands| Positive|So I spent a few ...|
|    2402|Borderlands| Positive|So I spent a few ...|
|    2402|Borderlands| Positive|2010 So I spent a...|
|    2402|Borderlands| Positive|                 was|
|    2403|Borderlands|  Neutral|Rock-Hard La Varl...|
|    2403|Borderlands|  Neutral|Rock-Hard La Varl...|
|    2403|Borderlands|  Neutral|Rock-Hard La Varl...|
|    2403|Borderlands|  Neutral|Rock-Hard La Vita...|
|    2403|Borderlands|  Neut

In [3]:
# Replace missing tweet text with an empty string so every row carries a string
# value in "Tweet content" before it reaches the tokenizer.
tweet_text = col('Tweet content')
df_cleaned = df.withColumn(
    'Tweet content',
    when(tweet_text.isNull(), '').otherwise(tweet_text),
)

In [4]:
# --- Text featurisation stages (assembled into a Pipeline in a later cell) ---
# Tweet text -> word tokens -> tokens minus stop words -> hashed term
# frequencies -> TF-IDF weighted vector.
tokenizer = Tokenizer(inputCol="Tweet content", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")

# Maps the string "Sentiment" column to a numeric "label" column;
# handleInvalid="skip" is set so rows the indexer cannot encode are dropped.
indexer = StringIndexer(inputCol="Sentiment", outputCol="label", handleInvalid="skip")

# Wraps the single TF-IDF vector as "final_features", the column the classifier reads.
assembler = VectorAssembler(inputCols=["features"], outputCol="final_features")

lr = LogisticRegression(featuresCol='final_features', labelCol='label')

# The indexer is fitted and applied here, outside the Pipeline, so the saved
# pipeline model does not contain the Sentiment -> label mapping.
#
# Drop any "label" column left over from a previous execution of this cell first:
# the assignment below overwrites df_cleaned, so without the drop a second run
# would fail because the indexer's output column already exists. drop() is a
# no-op when the column is absent, so the first run behaves exactly as before.
unlabelled = df_cleaned.drop("label")
df_cleaned = indexer.fit(unlabelled).transform(unlabelled)

In [5]:
# Stage order matters: each stage consumes the output column of the one before
# it, ending with the logistic-regression classifier.
pipeline_stages = [
    tokenizer,
    stopwords_remover,
    hashing_tf,
    idf,
    assembler,
    lr,
]
pipeline = Pipeline(stages=pipeline_stages)

In [6]:
# 80 % / 20 % train/test split with a fixed seed.
split_weights = [0.8, 0.2]
train_data, test_data = df_cleaned.randomSplit(split_weights, seed=123)

In [7]:
# Fit every pipeline stage (featurisation + classifier) on the training split only.
pipeline_model = pipeline.fit(dataset=train_data)

In [8]:
# Run the fitted pipeline over the held-out split; the evaluators below read the
# "prediction" and "label" columns of this DataFrame.
predictions = pipeline_model.transform(dataset=test_data)

In [10]:
# MulticlassClassificationEvaluator is already imported in the notebook's import
# cell, so it is not re-imported here.

# All three evaluators compare the same prediction/label columns and differ only
# in the metric they report.
evaluator_columns = {"predictionCol": "prediction", "labelCol": "label"}

evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy", **evaluator_columns)
evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1", **evaluator_columns)
# Weighted recall weights each class's recall by that class's share of the true
# labels, which sums to overall accuracy - so this value is expected to match
# `accuracy` exactly.
evaluator_recall = MulticlassClassificationEvaluator(metricName="weightedRecall", **evaluator_columns)

# Each evaluate() call is a separate pass over `predictions`.
accuracy = evaluator_accuracy.evaluate(predictions)
f1_score = evaluator_f1.evaluate(predictions)
recall = evaluator_recall.evaluate(predictions)


In [15]:
# Report the held-out metrics, one per line, using the same labels as before.
metric_report = (
    ("accuracy : ", accuracy),
    ("f1_score", f1_score),
    ("recall", recall),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

accuracy :  0.8257951952758019
f1_score 0.8258296978404729
recall 0.8257951952758019


In [16]:
# Persist the fitted pipeline (featurisation stages + classifier) to the
# 'pipeline_model' directory. A plain save() raises if that path already exists,
# which breaks re-running the notebook; overwrite() replaces the previous copy.
pipeline_model.write().overwrite().save('pipeline_model')