In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType 
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, HashingTF, IDF,Tokenizer, VectorIndexer, StopWordsRemover
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from preprocessing import TransformText, CleanText
import pandas as pd

# Import Data

In [2]:
spark = SparkSession.builder.master("local[1]") \
        .appName("Sentiment Spark Training") \
        .getOrCreate()
        
schema = StructType() \
        .add("tweet_id", IntegerType(), False) \
        .add("entity", StringType(), True) \
        .add("sentiment", StringType(), True)  \
        .add("content", StringType(), True)
        
tweets = spark.read.option("header", True) \
        .schema(schema) \
        .csv("./data/twitter_training.csv")
cleaned_tweets = tweets.withColumn('content', when(isnull(col('content')), "") \
                .otherwise(col('content')))

In [3]:
cleaned_tweets.printSchema()
cleaned_tweets.show(5)

root
 |-- tweet_id: integer (nullable = true)
 |-- entity: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- content: string (nullable = true)

+--------+-----------+---------+--------------------+
|tweet_id|     entity|sentiment|             content|
+--------+-----------+---------+--------------------+
|    2401|Borderlands| Positive|I am coming to th...|
|    2401|Borderlands| Positive|im getting on bor...|
|    2401|Borderlands| Positive|im coming on bord...|
|    2401|Borderlands| Positive|im getting on bor...|
|    2401|Borderlands| Positive|im getting into b...|
+--------+-----------+---------+--------------------+
only showing top 5 rows



In [4]:
print("Total of tweets:  ", cleaned_tweets.count())
print("Total of unique entities:  ", cleaned_tweets.select('entity').distinct().count())
print("Total of unique sentiments:  ", cleaned_tweets.select('sentiment').distinct().count())
print(cleaned_tweets.select('sentiment').distinct().show())

Total of tweets:   74681
Total of unique entities:   32
Total of unique sentiments:   4
+----------+
| sentiment|
+----------+
|Irrelevant|
|  Positive|
|   Neutral|
|  Negative|
+----------+

None


# Transform Data

In [5]:
encoder = StringIndexer(inputCol='sentiment', outputCol='sentiment_label').fit(cleaned_tweets)
tweets = encoder.transform(cleaned_tweets)

print(tweets.groupBy('sentiment_label').count().show())

+---------------+-----+
|sentiment_label|count|
+---------------+-----+
|            0.0|22542|
|            1.0|20831|
|            3.0|12990|
|            2.0|18318|
+---------------+-----+

None


In [6]:
stopwords = []
with open(file='./data/stopword_en.txt', mode='r') as file:
    try:
        stopwords = file.readlines()
        for idx, word in enumerate(stopwords):
            stopwords[idx] = word[:len(word)-1:]
    except Exception as ex:
        print(ex)
tweets = tweets.withColumn('tokens', split('content', ' '))
print(tweets.collect()[1])

Row(tweet_id=2401, entity='Borderlands', sentiment='Positive', content='im getting on borderlands and i will kill you all,', sentiment_label=1.0, tokens=['im', 'getting', 'on', 'borderlands', 'and', 'i', 'will', 'kill', 'you', 'all,'])


In [18]:
remover = StopWordsRemover(stopWords=stopwords, inputCol="tokens", outputCol="stop")
outputcols = ['tweet_id','entity','sentiment','sentiment_label']
temp = remover.transform(tweets).select(*outputcols, 
                                        array_join("stop", " ").alias("content"))

In [19]:
temp = temp.withColumn(
    'cleaned_content', 
    translate('content', '!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~', '')
)
temp = temp.withColumn(
    "cleaned_content",
    regexp_replace(regexp_replace("cleaned_content", "[^\x00-\x7F]+", ""), '""', '')
)
cleaned_tweets = temp.drop('content')

In [20]:
cleaned_tweets.collect()[71840]

Row(tweet_id=11106, entity='TomClancysGhostRecon', sentiment='Positive', sentiment_label=1.0, cleaned_content='See  song Im talking smh Tryna play head work ugh gotta wait ')

In [10]:
tokenizer = Tokenizer(inputCol='cleaned_content', outputCol='words')
tokenized_tweets = tokenizer.transform(cleaned_tweets)

hashingTF = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=20)
featurized_data = hashingTF.transform(tokenized_tweets)

idf = IDF(inputCol="raw_features", outputCol="features")
tfidf_tweets = idf.fit(featurized_data).transform(featurized_data).select(['sentiment_label', 'features'])

In [11]:
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexed_features", maxCategories=4).fit(tfidf_tweets)
tweets_train = featureIndexer.transform(tfidf_tweets)
tweets_train = tweets_train.select(['sentiment_label','indexed_features'])

In [12]:
print(tweets_train.printSchema())
print(tweets_train.show(5))

root
 |-- sentiment_label: double (nullable = false)
 |-- indexed_features: vector (nullable = true)

None
+---------------+--------------------+
|sentiment_label|    indexed_features|
+---------------+--------------------+
|            1.0|(20,[1,6,9,12],[1...|
|            1.0|(20,[0,6,12],[0.8...|
|            1.0|(20,[9,11,12],[1....|
|            1.0|(20,[0,11,12],[0....|
|            1.0|(20,[0,11,12],[0....|
+---------------+--------------------+
only showing top 5 rows

None


# Train model

In [13]:
dt = DecisionTreeClassifier(labelCol="sentiment_label", featuresCol="indexed_features")
model = dt.fit(tweets_train)

# Predict and Evaluate

In [14]:
transformer = TransformText()
data = transformer.data_loader(schema, spark, "./data/twitter_validation.csv")
cleaned_tweets = data.withColumn('content', when(isnull(col('content')), "").otherwise(col('content')))

In [15]:
cleaned_tweets.show(5)

+--------+---------------+----------+--------------------+
|tweet_id|         entity| sentiment|             content|
+--------+---------------+----------+--------------------+
|     921| AssassinsCreed|   Neutral|Get ready for the...|
|    4367|          CS-GO|Irrelevant|friendship ended ...|
|    7446|LeagueOfLegends|  Negative|My League of Lege...|
|    5417|    Hearthstone|  Positive|Hey guys, I just ...|
|    4650|         Google|  Positive|It's great that b...|
+--------+---------------+----------+--------------------+
only showing top 5 rows



In [16]:
encoded_tweets_test = encoder.transform(cleaned_tweets)
print(encoded_tweets_test.groupBy('sentiment').count().show())

+----------+-----+
| sentiment|count|
+----------+-----+
|Irrelevant|  110|
|   Neutral|  188|
|  Positive|  172|
|  Negative|  168|
+----------+-----+

None


In [25]:
outputCols = ['tweet_id','entity','sentiment','sentiment_label','content']
cleaner = CleanText(outputCols)
cleaned_tweets = cleaner.transform(encoded_tweets_test)

TypeError: CleanText.__init__() takes 1 positional argument but 2 were given

In [None]:
tweets_test = transformer.transform()

+---------------+-----+
|sentiment_label|count|
+---------------+-----+
|            0.0|  188|
|            1.0|  172|
|            3.0|  110|
|            2.0|  168|
+---------------+-----+

None


In [None]:
tweets_test = tweets_test.select(['sentiment_label','indexed_features'])
tweets_test.show(5)

+---------------+--------------------+
|sentiment_label|    indexed_features|
+---------------+--------------------+
|            0.0|(20,[0,4,5,6,7,8,...|
|            3.0|(20,[2,3,4,5,8,9,...|
|            2.0|(20,[1,3,4,5,7,8,...|
|            1.0|(20,[0,1,2,3,4,5,...|
|            1.0|(20,[0,1,2,3,5,6,...|
+---------------+--------------------+
only showing top 5 rows



In [None]:
predictions = model.transform(tweets_train)
predictions = predictions.select(['sentiment_label','prediction'])

In [None]:
predictions.show(10)

+---------------+----------+
|sentiment_label|prediction|
+---------------+----------+
|            1.0|       0.0|
|            1.0|       1.0|
|            1.0|       1.0|
|            1.0|       2.0|
|            1.0|       1.0|
|            1.0|       0.0|
|            1.0|       0.0|
|            1.0|       0.0|
|            1.0|       0.0|
|            1.0|       0.0|
+---------------+----------+
only showing top 10 rows



In [None]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="sentiment_label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " %(1.0 - accuracy))
print("Test Accuracy = %g " %accuracy)

Test Error = 0.655923 
Test Accuracy = 0.344077 
