In [None]:
import os
# Find the latest version of spark 2.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-2.4.6'
spark_version = 'spark-2.4.7'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Waiting for headers] [Co0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Hit:3 http://security.ubuntu.com/ubuntu bionic-security InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189

In [None]:
# Build (or reuse) the Spark session used by every cell below
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("PerceptronModels")
    .config('spark.executor.memory', '8G')
    .config('spark.driver.memory', '8G')
    .config('spark.driver.maxResultSize', '8G')
    .config("spark.memory.offHeap.size", "8g")
    .config("spark.memory.offHeap.enabled", True)
    .getOrCreate()
)

In [None]:
# Fetch the preprocessed headlines JSON from GitHub and load it into a DataFrame
from pyspark import SparkFiles

url ="https://raw.githubusercontent.com/James-Ashley/sentiment-analysis-dashboard/main/sentiment_classification/preprocessed_headlines.json"

# addFile downloads the file; SparkFiles.get resolves where the copy was stored
spark.sparkContext.addFile(url)
local_json_path = SparkFiles.get("preprocessed_headlines.json")
df_git = spark.read.json(local_json_path)

In [None]:
# Preview the raw data: first 20 rows, long cell values truncated (the show() defaults)
df_git.show(n=20, truncate=True)

+--------------------+--------------------+--------------+-----------+--------------+-------------+--------------+--------------------+---------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|              author|   cleaned_headlines|compound_score|    keyword|negative_score|neutral_score|positive_score|           published|sentiment_human|  source|       text_complete|        text_excerpt|               tfidf|               title|              tokens|         tokens_lems|                 url|
+--------------------+--------------------+--------------+-----------+--------------+-------------+--------------+--------------------+---------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           Dip Patel|could deport pare...|        0.1027|immigration|      

### Feature Transformations


In [None]:
# Keep only the two columns used for modelling:
# 'sentiment_human' (exposed as 'label') and 'tokens_lems'
df_git_clean = df_git.select(
    df_git['sentiment_human'].alias('label'),
    df_git['tokens_lems'],
)

In [None]:
from pyspark.sql.functions import when

# Recode label -1 as 2; every other label value is kept unchanged.
# NOTE(review): presumably done because the classifiers below expect
# non-negative class indices - confirm.
label_col = df_git_clean["label"]
recoded_label = when(label_col == -1, 2).otherwise(label_col)
df_git_clean = df_git_clean.withColumn("label", recoded_label)

In [None]:
from pyspark.ml.feature import HashingTF, IDF, StringIndexer

# Feature stages applied to the 'tokens_lems' column.

# Stage 1: hash the tokens into a term-frequency vector with 2**13 buckets
hashingTF = HashingTF(
    inputCol="tokens_lems",
    outputCol='hash_token',
    numFeatures=2**13,
)

# Stage 2: apply inverse-document-frequency weighting to the hashed frequencies
idf = IDF(
    inputCol='hash_token',
    outputCol='idf_token',
)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Assemble the columns used as model input into a single 'features' vector;
# 'idf_token' is currently the only input column.
feature_input_cols = ['idf_token']
clean_up = VectorAssembler(outputCol='features', inputCols=feature_input_cols)

In [None]:
# Chain the feature stages (hashing -> IDF -> vector assembly) into one Pipeline
from pyspark.ml import Pipeline

prep_stages = [hashingTF, idf, clean_up]
data_prep_pipeline = Pipeline(stages=prep_stages)

In [None]:
# Fit the preprocessing pipeline and apply it to the same DataFrame.
# NOTE(review): the IDF stage is fitted on every row, i.e. before the
# train/test split further down, so test rows contribute to the IDF weights.
cleaner = data_prep_pipeline.fit(dataset=df_git_clean)
cleaned = cleaner.transform(dataset=df_git_clean)

In [None]:
# Keep only the label and the assembled feature vector, then preview them
cleaned_final = cleaned.select('label', 'features')
cleaned_final.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    2|(8192,[309,1322,2...|
|    0|(8192,[145,191,38...|
|    0|(8192,[1506,4808,...|
|    0|(8192,[110,655,69...|
|    2|(8192,[33,1002,17...|
|    2|(8192,[1805,2844,...|
|    1|(8192,[1446,1621,...|
|    2|(8192,[159,497,13...|
|    2|(8192,[1575,1836,...|
|    0|(8192,[265,322,14...|
|    1|(8192,[191,537,20...|
|    1|(8192,[697,1598,5...|
|    0|(8192,[1941,5372,...|
|    0|(8192,[368,1002,1...|
|    2|(8192,[1065,3756,...|
|    0|(8192,[110,966,15...|
|    2|(8192,[110,345,92...|
|    0|(8192,[191,4541,4...|
|    0|(8192,[309,2566,4...|
|    1|(8192,[111,223,54...|
+-----+--------------------+
only showing top 20 rows



In [None]:
# Break data down into a training set and a testing set (train with 80%, test with 20%).
# The split is stratified: each label is split separately with the same weights,
# so both sets preserve the class distribution.
# Source: https://stackoverflow.com/questions/47637760/stratified-sampling-with-pyspark
SPLIT_WEIGHTS = [0.8, 0.2]  # [train, test]
SPLIT_SEED = 1234           # fixed seed so the split is the same on every run

# split dataframes between 0s, 1s, and 2s
# NOTE: rows whose label is not 0, 1 or 2 (including nulls) end up in neither set
zeros, ones, twos = (
    cleaned_final.filter(cleaned_final["label"] == label_value)
    for label_value in (0, 1, 2)
)

# split each class into training and testing
(train0, test0), (train1, test1), (train2, test2) = (
    class_rows.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)
    for class_rows in (zeros, ones, twos)
)

# stack datasets back together
training = train0.union(train1).union(train2)
testing = test0.union(test1).union(test2)

In [None]:
from pyspark.ml.classification import NaiveBayes

# Train a Naive Bayes classifier (default parameters) on the training split
nb = NaiveBayes()
predictor = nb.fit(dataset=training)

In [None]:
# Score the testing split with the fitted Naive Bayes model
test_results = predictor.transform(dataset=testing)

In [None]:
# Evaluate the Naive Bayes predictions on the test set
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName has to be set explicitly: the evaluator's default metric is "f1",
# so a bare MulticlassClassificationEvaluator() would print an F1 score under
# the "Accuracy" message below.
acc_eval = MulticlassClassificationEvaluator(metricName="accuracy")
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting sentiment was: %f" % acc)

Accuracy of model at predicting sentiment was: 0.634867


In [None]:
from pyspark.ml.classification import LogisticRegression, OneVsRest

# Logistic regression is the base classifier wrapped by One-vs-Rest
lr = LogisticRegression(
    maxIter=10,
    tol=1E-6,
    fitIntercept=True,
)
ovr = OneVsRest(classifier=lr)

# Fit the multiclass model on the training split ...
ovrModel = ovr.fit(dataset=training)

# ... and score the testing split with it
predictions = ovrModel.transform(dataset=testing)

In [None]:
# Evaluator configured to report accuracy
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Accuracy of the One-vs-Rest predictions on the test data
accuracy = evaluator.evaluate(dataset=predictions)
print("Accuracy of model at predicting sentiment was: %f" % accuracy)

Accuracy of model at predicting sentiment was: 0.639405


In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Layer sizes for the neural network:
#   input  - one node per feature; the count is read from the metadata
#            attached to the 'features' column
#   hidden - two intermediate layers of size 5 and 4
#   output - size 3 (classes)
features_metadata = cleaned_final.schema['features'].metadata
num_features = features_metadata['ml_attr']['num_attrs']
layers = [num_features, 5, 4, 3]

In [None]:
# Configure the multilayer perceptron; the seed is fixed for reproducible training
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)

# Fit it on the training split
model = trainer.fit(dataset=training)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the test set with the trained perceptron
result = model.transform(testing)
# Keep only the two columns the evaluator reads by default ("prediction" and "label")
predictionAndLabels = result.select("prediction", "label")

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Evaluate on the selected columns (previously computed but left unused)
print("Accuracy of model at predicting sentiment was: " + str(evaluator.evaluate(predictionAndLabels)))

Accuracy of model at predicting sentiment was: 0.6282527881040892
