In [None]:
import os
# Find the latest version of spark 2.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-2.4.6'
spark_version = 'spark-2.4.7'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:13 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:14 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Fetched 3,626 B in 2s (1,708 B/s)
Reading package 

In [None]:
# Start Spark session
from pyspark.sql import SparkSession

# Memory settings have to be passed to the builder *before* getOrCreate():
# the SparkSession it returns has no .set() method, and the previous one-liner
# also carried an unbalanced ")" so the cell could not be parsed at all.
# NOTE(review): getOrCreate() reuses an already running session, in which case
# the driver memory setting presumably does not take effect - restart the
# runtime before running this cell if a session was created earlier.
spark = (
    SparkSession.builder
    .appName("PerceptronModels")
    .config('spark.executor.memory', '10G')
    .config('spark.driver.memory', '10G')
    .config('spark.driver.maxResultSize', '10G')
    .getOrCreate()
)

In [None]:
# Load the preprocessed headlines JSON hosted on GitHub into a Spark DataFrame
from pyspark import SparkFiles

url ="https://raw.githubusercontent.com/James-Ashley/sentiment-analysis-dashboard/main/sentiment_classification/preprocessed_headlines.json"

# Register the remote file with the SparkContext, then look up where its
# copy can be read from
spark.sparkContext.addFile(url)
headlines_json_path = SparkFiles.get("preprocessed_headlines.json")

df_git = spark.read.json(headlines_json_path)

In [None]:
# Preview the raw DataFrame (show()'s defaults spelled out: 20 rows, long values truncated)
df_git.show(n=20, truncate=True)

+--------------------+--------------------+--------------+-----------+--------------+-------------+--------------+--------------------+---------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|              author|   cleaned_headlines|compound_score|    keyword|negative_score|neutral_score|positive_score|           published|sentiment_human|  source|       text_complete|        text_excerpt|               tfidf|               title|              tokens|         tokens_lems|                 url|
+--------------------+--------------------+--------------+-----------+--------------+-------------+--------------+--------------------+---------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           Dip Patel|could deport pare...|        0.1027|immigration|      

### Feature Transformations


In [None]:
# Keep only the human-assigned sentiment (exposed as `label`) and the
# `tokens_lems` token lists
df_git_clean = df_git.select(
    df_git['sentiment_human'].alias('label'),
    'tokens_lems',
)

df_git_clean.show(n=20, truncate=True)

+-----+--------------------+
|label|         tokens_lems|
+-----+--------------------+
|   -1|[could, deport, p...|
|    0|[first, latino, t...|
|    0|[tony, pham, inte...|
|    0|[two, third, undo...|
|   -1|[biden, meet, str...|
|   -1|[accuse, hate, gr...|
|    1|[lawyer, say, tru...|
|   -1|[28, migrant, chi...|
|   -1|[visa, delay, lea...|
|    0|[advocate, mental...|
|    1|[u, k, first, non...|
|    1|[12, great, latin...|
|    0|[feinstein, say, ...|
|    0|[judge, throw, tr...|
|   -1|[san, francisco, ...|
|    0|[latino, catholic...|
|   -1|[covid, 19, regul...|
|    0|[pope, installs, ...|
|    0|[trump, immigrati...|
|    1|[trump, administr...|
+-----+--------------------+
only showing top 20 rows



In [None]:
from pyspark.sql.functions import when

# Recode the -1 label as 2 and leave every other label untouched, so the labels
# become 0, 1 and 2 (presumably to serve as class indices for the classifier
# trained further down - confirm).
current_label = df_git_clean["label"]
recoded_label = when(current_label == -1, 2).otherwise(current_label)
df_git_clean = df_git_clean.withColumn("label", recoded_label)

df_git_clean.show(n=20, truncate=True)

+-----+--------------------+
|label|         tokens_lems|
+-----+--------------------+
|    2|[could, deport, p...|
|    0|[first, latino, t...|
|    0|[tony, pham, inte...|
|    0|[two, third, undo...|
|    2|[biden, meet, str...|
|    2|[accuse, hate, gr...|
|    1|[lawyer, say, tru...|
|    2|[28, migrant, chi...|
|    2|[visa, delay, lea...|
|    0|[advocate, mental...|
|    1|[u, k, first, non...|
|    1|[12, great, latin...|
|    0|[feinstein, say, ...|
|    0|[judge, throw, tr...|
|    2|[san, francisco, ...|
|    0|[latino, catholic...|
|    2|[covid, 19, regul...|
|    0|[pope, installs, ...|
|    0|[trump, immigrati...|
|    1|[trump, administr...|
+-----+--------------------+
only showing top 20 rows



In [None]:
from pyspark.ml.feature import HashingTF, IDF, StringIndexer
# Create all the features for the data set

# Size of the hashed term-frequency vector (kept a power of two).
# Left unset, HashingTF produced 262144-dimensional vectors (see the
# "(262144,[...])" values in the pipeline output). That size also becomes the
# input-layer width of the MultilayerPerceptronClassifier, whose fit() then lost
# the JVM with a Py4JNetworkError - most likely from memory pressure, although
# that is not confirmed. A smaller hash space keeps the network trainable.
NUM_HASH_FEATURES = 2 ** 14

# hashing: token list -> sparse term-frequency vector
hashingTF = HashingTF(inputCol="tokens_lems", outputCol='hash_token', numFeatures=NUM_HASH_FEATURES)
# idf: rescale the term frequencies by inverse document frequency
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Build the model input column: every column listed in inputCols is assembled
# into the single 'features' vector (here only the 'idf_token' vector).
clean_up = VectorAssembler(
    outputCol='features',
    inputCols=['idf_token'],
)

In [None]:
# Bundle the feature steps into one data-preparation Pipeline
from pyspark.ml import Pipeline

# Stage order: term hashing, then IDF weighting, then feature-vector assembly
data_prep_pipeline = Pipeline().setStages([hashingTF, idf, clean_up])

In [None]:
# Fit the preparation pipeline on the labelled data, then apply the fitted
# pipeline to that same data to add the hashed, IDF-weighted and assembled columns
cleaner = data_prep_pipeline.fit(df_git_clean)
cleaned = cleaner.transform(df_git_clean)

cleaned.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|         tokens_lems|          hash_token|           idf_token|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|    2|[could, deport, p...|(262144,[9514,813...|(262144,[9514,813...|(262144,[9514,813...|
|    0|[first, latino, t...|(262144,[2437,638...|(262144,[2437,638...|(262144,[2437,638...|
|    0|[tony, pham, inte...|(262144,[13981,31...|(262144,[13981,31...|(262144,[13981,31...|
|    0|[two, third, undo...|(262144,[15664,31...|(262144,[15664,31...|(262144,[15664,31...|
|    2|[biden, meet, str...|(262144,[6355,491...|(262144,[6355,491...|(262144,[6355,491...|
|    2|[accuse, hate, gr...|(262144,[54330,59...|(262144,[54330,59...|(262144,[54330,59...|
|    1|[lawyer, say, tru...|(262144,[7612,138...|(262144,[7612,138...|(262144,[7612,138...|
|    2|[28, migrant, chi...|(262144,[9514,329...|(262144,[9514,329...|(262144,[9

In [None]:
# Keep just the label and the assembled feature vector, then preview them
cleaned_final = cleaned.select('label', 'features')

cleaned_final.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    2|(262144,[9514,813...|
|    0|(262144,[2437,638...|
|    0|(262144,[13981,31...|
|    0|(262144,[15664,31...|
|    2|(262144,[6355,491...|
|    2|(262144,[54330,59...|
|    1|(262144,[7612,138...|
|    2|(262144,[9514,329...|
|    2|(262144,[13471,34...|
|    0|(262144,[34389,49...|
|    1|(262144,[26616,37...|
|    1|(262144,[37834,99...|
|    0|(262144,[46332,62...|
|    0|(262144,[7612,518...|
|    2|(262144,[39964,72...|
|    0|(262144,[36449,45...|
|    2|(262144,[4920,173...|
|    0|(262144,[23456,29...|
|    0|(262144,[7612,556...|
|    1|(262144,[7612,251...|
+-----+--------------------+
only showing top 20 rows



In [None]:
# Break data down into a training set and a testing set (train with 70%, test with 30%)
# Would be better to do a stratified split to preserve class distribution
# A fixed seed makes the split repeatable across runs; without it every run
# trains and evaluates on different rows, so accuracies cannot be compared.
# 1234 is the same seed value the classifier is created with.
training, testing = cleaned_final.randomSplit([0.7, 0.3], seed=1234)

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Network topology:
#   - input layer: as wide as the 'features' vector, read from the column's
#     ML attribute metadata
#   - two intermediate layers of size 5 and 4
#   - output layer of size 3 (one unit per class)
features_metadata = cleaned_final.schema['features'].metadata
layers = [features_metadata['ml_attr']['num_attrs'], 5, 4, 3]

In [None]:
# Configure the multilayer perceptron; `layers` fixes the network topology
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)

# Fit the network on the training split
# (the saved output of this cell shows the fit ending in a Py4JNetworkError)
model = trainer.fit(training)

# potential solution: https://stackoverflow.com/questions/42301111/dimension-mismatch-error-in-spark-ml

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45087)
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-eea6fc5d9add>", line 5, in <module>
    model = trainer.fit(training)
  File "/content/spark-2.4.7-bin-hadoop2.7/python/pyspark/ml/base.py", line 132, in fit
    return self._fit(dataset)
  File "/content/spark-2.4.7-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 295, in _fit
    java_model = self._fit_java(dataset)
  File "/content/spark-2.4.7-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 292, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/content/spark-2.4.7-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/cont

Py4JNetworkError: ignored

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out test split with the trained model
result = model.transform(testing)
predictionAndLabels = result.select("prediction", "label")

# Evaluator configured to report accuracy
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

test_accuracy = evaluator.evaluate(result)
print(f"Test set accuracy = {test_accuracy}")

# TODO: figure out this error (the saved output of this cell is a Py4JJavaError)

Py4JJavaError: ignored