In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType 
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, HashingTF, IDF,Tokenizer, StopWordsRemover
from pyspark.ml.classification import DecisionTreeClassifier, MultilayerPerceptronClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from preprocessing import TransformText, CleanText
import pandas as pd

# Import Data

In [2]:
spark = SparkSession.builder.master("local[1]") \
        .appName("Sentiment Spark Training") \
        .getOrCreate()
        
schema = StructType() \
        .add("tweet_id", IntegerType(), False) \
        .add("entity", StringType(), True) \
        .add("sentiment", StringType(), True)  \
        .add("content", StringType(), True)
        
tweets = spark.read.option("header", True) \
        .schema(schema) \
        .csv("./data/twitter_training.csv")
tweets = tweets.withColumn('content', when(isnull(col('content')), "") \
                .otherwise(col('content')))

In [3]:
tweets.printSchema()
tweets.show(5)

root
 |-- tweet_id: integer (nullable = true)
 |-- entity: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- content: string (nullable = true)

+--------+-----------+---------+--------------------+
|tweet_id|     entity|sentiment|             content|
+--------+-----------+---------+--------------------+
|    2401|Borderlands| Positive|I am coming to th...|
|    2401|Borderlands| Positive|im getting on bor...|
|    2401|Borderlands| Positive|im coming on bord...|
|    2401|Borderlands| Positive|im getting on bor...|
|    2401|Borderlands| Positive|im getting into b...|
+--------+-----------+---------+--------------------+
only showing top 5 rows



In [4]:
print("Total of tweets:  ", tweets.count())
print("Total of unique entities:  ", tweets.select('entity').distinct().count())
print("Total of unique sentiments:  ", tweets.select('sentiment').distinct().count())
print(tweets.select('sentiment').distinct().show())

Total of tweets:   74681
Total of unique entities:   32
Total of unique sentiments:   4
+----------+
| sentiment|
+----------+
|Irrelevant|
|  Positive|
|   Neutral|
|  Negative|
+----------+

None


# Transform Data

In [5]:
stopwords = []
with open(file='./data/stopword_en.txt', mode='r') as file:
    try:
        stopwords = file.readlines()
        for idx, word in enumerate(stopwords):
            stopwords[idx] = word[:len(word)-1:]
    except Exception as ex:
        print(ex)
tweets = tweets.withColumn('tokens', split('content', ' '))
print(tweets.collect()[1])

Row(tweet_id=2401, entity='Borderlands', sentiment='Positive', content='im getting on borderlands and i will kill you all,', tokens=['im', 'getting', 'on', 'borderlands', 'and', 'i', 'will', 'kill', 'you', 'all,'])


In [6]:
remover = StopWordsRemover(stopWords=stopwords, inputCol="tokens", outputCol="stop")
outputcols = ['tweet_id','entity','sentiment']
temp = remover.transform(tweets).select(*outputcols, 
                                        array_join("stop", " ").alias("content"))

In [7]:
temp = temp.withColumn(
    'cleaned_content', 
    translate('content', '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~', '')
)
temp = temp.withColumn(
    "cleaned_content",
    regexp_replace(regexp_replace("cleaned_content", "[^\x00-\x7F]+", ""), '""', '')
)
temp = temp.withColumn(
    "cleaned_content",
    lower("cleaned_content")
)
cleaned_tweets = temp.drop('content')

In [8]:
cleaned_tweets.collect()[120]

Row(tweet_id=2421, entity='Borderlands', sentiment='Positive', cleaned_content='finally played borderlands 3 actual game great almost weapons feel funny never felt like grind hand say much looking forward eventual sequence skip')

In [9]:
encoder = StringIndexer(inputCol='sentiment', outputCol='sentiment_label').fit(cleaned_tweets)
encoded_tweets = encoder.transform(cleaned_tweets)

print(encoded_tweets.groupBy('sentiment_label').count().show())

+---------------+-----+
|sentiment_label|count|
+---------------+-----+
|            0.0|22542|
|            1.0|20831|
|            3.0|12990|
|            2.0|18318|
+---------------+-----+

None


In [10]:
tokenizer = Tokenizer(inputCol='cleaned_content', outputCol='words')
tokenized_tweets = tokenizer.transform(encoded_tweets)

hashingTF = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=20)
featurized_data = hashingTF.transform(tokenized_tweets)

In [11]:
idf = IDF(inputCol="raw_features", outputCol="features").fit(featurized_data)
tfidf_tweets = idf.transform(featurized_data)
tweets_train = tfidf_tweets.select('sentiment_label','features')

In [12]:
print(tweets_train.printSchema())
print(tweets_train.show(5))

root
 |-- sentiment_label: double (nullable = false)
 |-- features: vector (nullable = true)

None
+---------------+--------------------+
|sentiment_label|            features|
+---------------+--------------------+
|            1.0|(20,[1,6,9,12],[1...|
|            1.0|(20,[0,6,12],[0.8...|
|            1.0|(20,[9,11,12],[1....|
|            1.0|(20,[0,11,12],[0....|
|            1.0|(20,[0,11,12],[0....|
+---------------+--------------------+
only showing top 5 rows

None


# Train model

In [13]:
# dt = DecisionTreeClassifier(labelCol="sentiment_label", featuresCol="features", maxDepth=20, impurity='gini')
rf = RandomForestClassifier(labelCol="sentiment_label", featuresCol="features", numTrees=50, maxDepth=10)

# layers = [20, 10, 10, 32, 10, 4]
# trainer = MultilayerPerceptronClassifier(labelCol="sentiment_label", maxIter=100, layers=layers, blockSize=32, seed=1234)

model = rf.fit(tweets_train)

# Predict and Evaluate

In [14]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="sentiment_label", metricName="accuracy")

- Evaluate Training dataset

In [15]:
predictions = model.transform(tweets_train)
predictions = predictions.select('sentiment_label','prediction')

In [16]:
predictions.printSchema()

root
 |-- sentiment_label: double (nullable = false)
 |-- prediction: double (nullable = false)



In [17]:
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " %(1.0 - accuracy))
print("Test Accuracy = %g " %accuracy)

Test Error = 0.53999 
Test Accuracy = 0.46001 


- Evaluate Testing dataset

In [18]:
transformer = TransformText('sentiment', 'cleaned_content')
data = transformer.data_loader(schema, spark, "./data/twitter_validation.csv")

In [19]:
print(data.printSchema())

root
 |-- tweet_id: integer (nullable = true)
 |-- entity: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- content: string (nullable = true)

None


In [20]:
outputCols = ['tweet_id','entity','sentiment']
target = 'content'

cleaner = CleanText(outputCols, target)
cleaned_tweets = cleaner.transform(data)

In [21]:
cleaned_tweets.collect()[120]

Row(tweet_id=2600, entity='Borderlands', sentiment='Positive', content='want thank #SSKYWILDKATSSS letting run new Borderlands 3 DLC last night. lot fun!', cleaned_content='want thank sskywildkatsss letting run new borderlands 3 dlc last night lot fun')

In [22]:
tweets_test = transformer.transform(cleaned_tweets)
print(tweets_test.printSchema())

+---------------+-----+
|sentiment_label|count|
+---------------+-----+
|            0.0|  188|
|            1.0|  172|
|            3.0|  110|
|            2.0|  168|
+---------------+-----+

None
root
 |-- tweet_id: integer (nullable = true)
 |-- entity: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- content: string (nullable = true)
 |-- cleaned_content: string (nullable = true)
 |-- sentiment_label: double (nullable = false)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- raw_features: vector (nullable = true)
 |-- features: vector (nullable = true)

None


In [23]:
tweets_test = tweets_test.select(['sentiment_label','features'])

In [24]:
predictions = model.transform(tweets_test)
predictions = predictions.select(['sentiment_label','prediction'])

In [25]:
predictions.show(10)

+---------------+----------+
|sentiment_label|prediction|
+---------------+----------+
|            0.0|       0.0|
|            3.0|       0.0|
|            2.0|       0.0|
|            1.0|       1.0|
|            1.0|       0.0|
|            3.0|       0.0|
|            0.0|       0.0|
|            3.0|       0.0|
|            1.0|       1.0|
|            0.0|       2.0|
+---------------+----------+
only showing top 10 rows



In [26]:
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " %(1.0 - accuracy))
print("Test Accuracy = %g " %accuracy)

Test Error = 0.68652 
Test Accuracy = 0.31348 
