In [None]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.functions import col
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import udf, size, col
from pyspark.sql.types import (IntegerType, StringType, 
                               TimestampType, StructType,
                               StructField, ArrayType,
                               TimestampType)

from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover, RegexTokenizer, \
        CountVectorizer, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline


import os

# Spark configuration for this notebook. The two commented-out lines below
# would add the SQLite JDBC driver jar to the executor and driver classpaths.
conf = pyspark.SparkConf()
#conf.set("spark.executor.extraClassPath",  os.path.join(os.getcwd(), 'extras/sqlite-jdbc-3.34.0.jar'))
#conf.set("spark.driver.extraClassPath", os.path.join(os.getcwd(), 'extras/sqlite-jdbc-3.34.0.jar'))

# Obtain the session through the builder's getOrCreate() instead of calling
# the SparkContext constructor directly: constructing a second SparkContext
# in the same kernel raises an error, which made this cell fail whenever it
# was executed again without restarting the kernel.
spark = SparkSession.builder \
    .appName("TwitterStreamApp") \
    .config(conf=conf) \
    .getOrCreate()

# Keep the `sc` name available for any cell that uses the SparkContext.
sc = spark.sparkContext


In [None]:
#url = 'jdbc:sqlite:' + os.path.join(os.getcwd(), 'database.sqlite')

# Load the training data

# Every column is read as a plain string. The schema is passed explicitly,
# so the reader does not infer column names or types from the file.
column_names = ["sentiment", "id", "date", "flag", "user", "transtext"]
schema = StructType([StructField(name, StringType()) for name in column_names])

df = spark.read.load('datasets/tweets_clean.csv', format='csv', schema=schema)

# Preview the first rows to confirm the file was parsed as expected.
df.show()

In [None]:
# Class balance: number of rows per sentiment value, most frequent first.
# groupBy().count() already yields exactly one row per sentiment, so no
# distinct() is needed; a distinct() placed after orderBy() adds another
# shuffle that does not have to preserve the sort order.
df.groupBy("sentiment") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

In [None]:
# Split the data into training and test sets

# Weights 0.99 / 0.01 with a fixed seed so the split is repeatable.
trainingData, testData = df.randomSplit([0.99, 0.01], seed=100)

train_count = trainingData.count()
test_count = testData.count()
print(f"Training Dataset Count: {train_count}")
print(f"Test Dataset Count: {test_count}")


In [None]:
# Build the feature pipeline and fit it on the training split

# Stage 1: turn the tweet text column into a list of words.
tokenizer = Tokenizer(outputCol="words", inputCol="transtext")
#stopwords = StopWordsRemover(inputCol="words", outputCol='tokens')
# Stage 2: hash the words into a term-frequency vector with 65,536 buckets.
hashtf = HashingTF(inputCol="words", outputCol='tf', numFeatures=1 << 16)
# Stage 3: IDF weighting; minDocFreq=5 filters out very rare terms.
idf = IDF(minDocFreq=5, inputCol='tf', outputCol="features")
# Stage 4: index the sentiment strings into the numeric "label" column.
label_stringIdx = StringIndexer(inputCol="sentiment", outputCol="label")

stages = [tokenizer, hashtf, idf, label_stringIdx]
pipeline = Pipeline(stages=stages)

# Fit on the training split only, then apply the same fitted pipeline to
# both splits so the test rows are transformed with training statistics.
pipelineFit = pipeline.fit(trainingData)
train_df, val_df = (pipelineFit.transform(split) for split in (trainingData, testData))
train_df.show(5)


In [None]:
# Logistic regression to classify the tweets

# LogisticRegression is already imported in the notebook's first cell, so it
# is not re-imported here. The feature and label columns are the ones
# produced by the fitted pipeline ("features" from IDF, "label" from
# StringIndexer); they match the estimator's defaults and are spelled out
# for readability.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=100)
lrModel = lr.fit(train_df)

# Score the held-out split.
predictions = lrModel.transform(val_df)

In [None]:
# Distribution of predicted classes on the test split, most frequent first.
# groupBy().count() already yields one row per prediction value, so the
# trailing distinct() was redundant, and being placed after orderBy() it
# could also discard the requested ordering.
predictions.groupBy("prediction") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

In [None]:
# Evaluate the results

# BinaryClassificationEvaluator is already imported in the notebook's first
# cell, so it is not re-imported here. The label column and metric are the
# evaluator's defaults, written out so the displayed number is
# self-describing: it is the area under the ROC curve on the test split.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator.evaluate(predictions)

In [None]:
# Back up the fitted models

# A plain save() raises when the target path already exists, which made this
# cell fail on every run after the first. write().overwrite() replaces the
# previously saved model instead.
pipelineFit.write().overwrite().save('models/final_idf.model')
lrModel.write().overwrite().save('models/final_lr.model')

In [None]:
# Preview the scored test rows (first 20, long cell values truncated).
predictions.show(n=20, truncate=True)