In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.functions import col
from pyspark.sql import Row
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import udf, size, col

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.sql.types import (IntegerType, StringType, 
                               TimestampType, StructType,
                               StructField, ArrayType,
                               TimestampType)
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler


import os

conf = pyspark.SparkConf()
conf.set("spark.executor.extraClassPath",  os.path.join(os.getcwd(), 'extras/sqlite-jdbc-3.34.0.jar'))
conf.set("spark.driver.extraClassPath", os.path.join(os.getcwd(), 'extras/sqlite-jdbc-3.34.0.jar'))

sc = pyspark.SparkContext(appName="TwitterStreamApp", conf=conf)

spark = SparkSession(sc)


In [34]:
url = 'jdbc:sqlite:' + os.path.join(os.getcwd(), 'database.sqlite')

#df = spark.read.jdbc(url, 'Tweet')
#targetUDF = udf(lambda x:int(x), IntegerType())

schema = StructType([StructField("sentiment", StringType()),
                   StructField("id", StringType()),
                   StructField("date", StringType()),
                   StructField("flag", StringType()),
                   StructField("user", StringType()),
                   StructField("transtext", StringType())
                  ])



df = spark.read.format('csv').schema(schema).load('datasets/tweets_clean.csv')
#df2 = df.withColumn("sentiment", targetUDF(df["target"]))
#map_sentiment = udf(lambda x : 
  #                  float(x.replace('Positive', '0').replace('Neutral', '1').replace('Negative', '2')))
#df = df.withColumn('sentiment', map_sentiment('sentiment')).select('transtext', 'sentiment')

df.show()

+---------+----------+--------------------+--------+---------------+--------------------+
|sentiment|        id|                date|    flag|           user|           transtext|
+---------+----------+--------------------+--------+---------------+--------------------+
|        0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|        0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|        0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|        0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|        0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|        0|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|        0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|        0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|        0

In [35]:
df.groupBy("sentiment") \
    .count() \
    .orderBy(col("count").desc()) \
    .distinct() \
    .show()

+---------+------+
|sentiment| count|
+---------+------+
|        0|800000|
|        4|800000|
+---------+------+



In [4]:
import re

def removeRegex(tokens: list) -> list:
    """
    Removes hashtags, call outs and web addresses from tokens.
    """
    expr    = '(@[A-Za-z0-a9_]+)|(#[A-Za-z0-9_]+)|'+\
              '(https?://[^\s<>"]+|www\.[^\s<>"]+)'
        
    regex   = re.compile(expr)

    cleaned = [t for t in tokens if not(regex.search(t)) if len(t) > 0]

    return list(filter(None, cleaned))

def normalize(tokens : list) -> list:
    """
    Removes non-english characters and returns lower case versions of words.
    """
    subbed   = [re.sub("[^a-zA-Z]+", "", s).lower() for s in tokens]
    
    filtered = filter(None, subbed)
    
    return list(filtered)


normalizeUDF = udf(normalize, ArrayType(StringType()))

removeWEBUDF = udf(removeRegex, ArrayType(StringType()))

In [46]:
tokenizer = Tokenizer(inputCol  = "transtext",
                      outputCol = "token")

df3 = tokenizer.transform(df)
# remove hashtags, call outs and web addresses
df4 = df3.withColumn("tokens_re", removeWEBUDF(df3["token"]))

# remove non english characters
df4 = df4.withColumn("tokens_clean", normalizeUDF(df4["tokens_re"]))

# rename columns
df5 = df4.drop("token","tokens_re")
df5 = df5.withColumnRenamed("tokens_clean", "tokens")\

# remove tweets where the tokens array is empty, i.e. where it was just
# a hashtag, callout, numbers, web adress etc.
df6 = df5.where(size(col("tokens")) > 0)

In [55]:

# set seed for reproducibility
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))
#evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

Training Dataset Count: 1119936
Test Dataset Count: 480064


In [57]:
tokenizer = Tokenizer(inputCol="transtext", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])
pipelineFit = pipeline.fit(trainingData)
train_df = pipelineFit.transform(trainingData)
val_df = pipelineFit.transform(testData)
train_df.show(5)


+---------+----------+--------------------+--------+---------------+--------------------+--------------------+--------------------+--------------------+-----+
|sentiment|        id|                date|    flag|           user|           transtext|               words|                  tf|            features|label|
+---------+----------+--------------------+--------+---------------+--------------------+--------------------+--------------------+--------------------+-----+
|        0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|[@switchfoot, htt...|(65536,[12429,164...|(65536,[12429,164...|  0.0|
|        0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|[is, upset, that,...|(65536,[1981,3085...|(65536,[1981,3085...|  0.0|
|        0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|[@kenichan, i, di...|(65536,[2888,3924...|(65536,[2888,3924...|  0.0|
|        0|1467811193|Mon Apr 06 22:19:...|NO_

In [58]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

In [59]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.8458981424549241

In [67]:
pipelineFit.save('models/final_idf.model')
lrModel.save('models/final_lr.model')

In [68]:
predictions.show()

+---------+----------+--------------------+--------+---------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|sentiment|        id|                date|    flag|           user|           transtext|               words|                  tf|            features|label|       rawPrediction|         probability|prediction|
+---------+----------+--------------------+--------+---------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|        0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|[my, whole, body,...|(65536,[1880,9243...|(65536,[1880,9243...|  0.0|[1.73435231498434...|[0.84996828283741...|       0.0|
|        0|1467812416|Mon Apr 06 22:20:...|NO_QUERY| erinx3leannexo|spring break in p...|[spring, break, i...|(65536,[5284,3026...|(65536,[5284,3026...|

In [27]:
## regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="transtext", outputCol="words", pattern="\\W")

## stop words
add_stopwords = ["http","https","amp","rt","t","c","the","@"]
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

## bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

## convert string labels to indexes
label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")

rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, seed=100)

## build the pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx, rf])


In [21]:
# create tokens from tweets
tk = Tokenizer(inputCol= "transtext", outputCol = "tokens2")

# create term frequencies for each of the tokens
sw  = StopWordsRemover(inputCol="tokens2", outputCol="filtered")
#tf1 = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1e5)
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")

# create tf-idf for each of the tokens
#idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2.0)

# create basic logistic regression model
#lr = LogisticRegression(maxIter=20)
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, seed=100)

# create entire pipeline
pipeline = Pipeline(stages=[tk, sw, countVectors, label_stringIdx, rf])

In [22]:
model = pipeline.fit(trainingData)

In [23]:
# predict on test set
predictions1 = model.transform(testData)

# get the performance on the test set
score1 = evaluator.evaluate(predictions1)

print("AUC SCORE: {}".format(score1))

AUC SCORE: 0.5045871559633027


In [24]:
predictedAndLabels = predictions1.select(["prediction","sentiment"])\
                                 .rdd.map(lambda r : (float(r[0]), float(r[1])))

from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics(predictedAndLabels)

print("Test Set Accuracy: {}".format(metrics.accuracy))

Test Set Accuracy: 0.5045871559633027


In [41]:
predictions.groupBy("prediction") \
    .count() \
    .orderBy(col("count").desc()) \
    .distinct() \
    .show()

+----------+------+
|prediction| count|
+----------+------+
|       0.0|236162|
|       1.0|243902|
+----------+------+



In [None]:
model.save("models/sentiment3.model")


In [28]:

## Fit the pipeline to training documents.
pipelineFit = pipeline.fit(trainingData)
predictions = pipelineFit.transform(testData)

predictions.select("transtext","sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 100, truncate = 40)
#predictions.filter(predictions['prediction'] == 0.0) \
#    .select("_c5","_c0","probability","label","prediction") \
#    .orderBy("probability", ascending=False) \
#    .show(n = 10, truncate = 30)
#
#predictions.filter(predictions['prediction'] == 1.0) \
#    .select("_c5","_c0","probability","label","prediction") \
#    .orderBy("probability", ascending=False) \
#    .show(n = 10, truncate = 30)
#    
#predictions.filter(predictions['prediction'] == 2.0) \
#    .select("_c5","_c0","probability","label","prediction") \
#    .orderBy("probability", ascending=False) \
#    .show(n = 10, truncate = 30)    

# Evaluate, metricName=[accuracy | f1]default f1 measure
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
print("Accuracy: %g" % (evaluator.evaluate(predictions)))

# save the trained model for future use
pipelineFit.save("models/sentiment_4.model")

# PipelineModel.load("logreg.model")

+----------------------------------------+---------+----------------------------------------+-----+----------+
|                               transtext|sentiment|                             probability|label|prediction|
+----------------------------------------+---------+----------------------------------------+-----+----------+
|RT @Schrodingergata: Look that I thin...|      0.0|[0.612140336685387,0.2414551091608311...|  0.0|       0.0|
|RT @Schrodingergata: Look that I thin...|      0.0|[0.612140336685387,0.2414551091608311...|  0.0|       0.0|
|RT @Sanchezrm_: How proud of Real Mad...|      0.0|[0.5899025841931839,0.228770687339982...|  0.0|       0.0|
|Go crack !!!!!  Congratulations @Turm...|      0.0|[0.5870870398050767,0.273805963827021...|  0.0|       0.0|
|RT @ECobeNefactor: All for concerted ...|      1.0|[0.5677113201471531,0.256731466657365...|  1.0|       0.0|
|@Pecobo_Foto @el_pais Dream daily wit...|      0.0|[0.5672565683512226,0.272343966791321...|  0.0|       0.0|
|