In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
spark_version = "spark-3.2.0"
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Waiting for headers] [1 0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Waiting for headers] [Co                                                                               Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [2 InRelease 20.0 kB/88.70% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:6 https://developer.d

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Hashing").getOrCreate()

In [3]:
# Imports
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [4]:
# Read in data from S3 Buckets

from pyspark import SparkFiles
df = spark.read.csv(SparkFiles.get("/content/IMDB Dataset.csv"),sep=",", escape='"', encoding="utf-8", quote='"',  header=True)

# Show DataFrame
df.show(10, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
# Tokenize DataFrame
# tokened = Tokenizer(inputCol="review", outputCol="words")
# tokened_transformed = tokened.transform(df)
# tokened_transformed.show()

In [8]:
# Remove stop words
# remover = StopWordsRemover(inputCol="words", outputCol="Wordsfiltered")
# removed_frame = remover.transform(tokened_transformed)
# removed_frame.show()

In [5]:
# # Run the hashing term frequency
# hashing = HashingTF(inputCol="Wordsfiltered", outputCol="hashedValues", numFeatures=pow(2,4))

# # Transform into a DF
# hashed_df = hashing.transform(removed_frame)
# hashed_df.show()

In [None]:
# # Fit the IDF on the data set 
# idf = IDF(inputCol="hashedValues", outputCol="features")
# idfModel = idf.fit(hashed_df)
# final_df = idfModel.transform(hashed_df)

In [9]:
# Show the TF-IDF features
# final_df.select("features").collect()

In [6]:
# Train test split
training, testing = df.randomSplit([0.7, 0.3])

In [7]:
# Imports
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [8]:
# Create all the steps for the pipeline
label_indexer = StringIndexer(inputCol='sentiment',outputCol='label')
tokenizer = Tokenizer(inputCol="review", outputCol="Wordsfiltered")
stopremove = StopWordsRemover(inputCol='Wordsfiltered',outputCol='hashedValues')
hashingTF = HashingTF(inputCol="hashedValues", outputCol='features')
lr = LogisticRegression(maxIter=20, regParam=0.001)

# Define pipeline
pipeline = Pipeline(stages=[label_indexer, tokenizer, stopremove, hashingTF, lr])

In [9]:
training.show()

+--------------------+---------+
|              review|sentiment|
+--------------------+---------+
|\b\b\b\bA Turkish...| positive|
|!!!! MILD SPOILER...| negative|
|!!!! MILD SPOILER...| negative|
|" Så som i himmel...| positive|
|" While sporadica...| negative|
|"... the beat is ...| positive|
|"200l: A Space Od...| positive|
|"9/11," hosted by...| positive|
|"A Gentleman's Ga...| negative|
|"A Minute to Pray...| positive|
|"A Mouse in the H...| positive|
|"A Slight Case of...| positive|
|"A Tale of Two Si...| positive|
|"A Thief in the N...| positive|
|"A bored televisi...| negative|
|"A lot of the fil...| negative|
|"A research scien...| negative|
|"A total waste of...| negative|
|"A trio of treasu...| negative|
|"A truly nice sto...| positive|
+--------------------+---------+
only showing top 20 rows



In [10]:
training.dtypes


[('review', 'string'), ('sentiment', 'string')]

In [11]:
# Fit the pipeline to training reviews.
model = pipeline.fit(training)

In [12]:
# Tranform the model with the testing data
test_results = model.transform(testing)
test_results.show(5)

+--------------------+---------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|              review|sentiment|label|       Wordsfiltered|        hashedValues|            features|       rawPrediction|         probability|prediction|
+--------------------+---------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|!!!! POSSIBLE MIL...| negative|  0.0|[!!!!, possible, ...|[!!!!, possible, ...|(262144,[6512,142...|[1.88497059378014...|[0.86818101659489...|       0.0|
|" Now in India's ...| positive|  1.0|[", now, in, indi...|[", india's, sunn...|(262144,[535,1765...|[-11.508502343245...|[1.00442282941249...|       1.0|
|"2001: A Space Od...| positive|  1.0|["2001:, a, space...|["2001:, space, o...|(262144,[1277,342...|[13.5600496343679...|[0.99999870894510...|       0.0|
|"8 SIMPLE RULES.....| positive|  1.0|["8, simple, rule...|["8, simple

In [13]:
# Evaluate model
f1_eval = MulticlassClassificationEvaluator(metricName='f1')
print("F1-score: ", f1_eval.evaluate(test_results))
accuracy_score = MulticlassClassificationEvaluator(metricName='accuracy')
print("ACC: ", accuracy_score.evaluate(test_results))

F1-score:  0.8549894901663047
ACC:  0.8550190852474385


In [14]:
# Tranform the model with the training data
train_results = model.transform(training)
train_results.show(5)


+--------------------+---------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|              review|sentiment|label|       Wordsfiltered|        hashedValues|            features|       rawPrediction|         probability|prediction|
+--------------------+---------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|\b\b\b\bA Turkish...| positive|  1.0|[\b\b\b\ba, turki...|[\b\b\b\ba, turki...|(262144,[1277,201...|[-7.9456250841161...|[3.54082992643884...|       1.0|
|!!!! MILD SPOILER...| negative|  0.0|[!!!!, mild, spoi...|[!!!!, mild, spoi...|(262144,[3524,497...|[7.56773507770490...|[0.99948340523243...|       0.0|
|!!!! MILD SPOILER...| negative|  0.0|[!!!!, mild, spoi...|[!!!!, mild, spoi...|(262144,[4214,497...|[10.8208639721508...|[0.99998002210109...|       0.0|
|" Så som i himmel...| positive|  1.0|[", så, som, i, h...|[", så, som

In [15]:
# Evaluate model
f1_eval = MulticlassClassificationEvaluator(metricName='f1')
print("F1-score: ", f1_eval.evaluate(train_results))
accuracy_score = MulticlassClassificationEvaluator(metricName='accuracy')
print("ACC: ", accuracy_score.evaluate(train_results))

F1-score:  1.0
ACC:  1.0


In [15]:
# Define the pipeline for randomforestclassifier


KeyboardInterrupt: ignored

In [None]:
# Predict with the test dataset
predictions = cvModel.transform(testing)