In [0]:
# Load in one of the tables
df = spark.sql("select * from default.reviews_train").sample(0.001)
#df = df.sample(0.01, seed = 47)
df = df.cache()
print((df.count(), len(df.columns)))

(3151, 11)


In [0]:
# Drop duplicates
print("Before duplication removal: ", df.count())
df_distinct = df.dropDuplicates(['reviewerID', 'asin'])
print("After duplication removal: ", df.count())

Before duplication removal:  3151
After duplication removal:  3151


In [0]:
# Convert Unix timestamp to readable date
from pyspark.sql.functions import from_unixtime, to_date
from pyspark.sql.types import *

df_with_date = df_distinct.withColumn("reviewTime", to_date(from_unixtime(df_distinct.unixReviewTime))) \
                                                .drop("unixReviewTime")

In [0]:
from pyspark.sql.functions import col

# Combine review text and summary
from pyspark.sql.functions import concat, lit
new_df = df_with_date.withColumn("review",concat(col("reviewText"),col("summary"))).drop('reviewText').drop('summary')

In [0]:
from pyspark.sql.functions import dayofweek, month
new_df = new_df.withColumn('dayofweek', dayofweek(col('reviewTime'))).withColumn('month', month(col('reviewTime'))).drop('reviewTime')

In [0]:
from pyspark.sql.functions import col,length,trim
new_df = new_df.withColumn("review_len", length(col("review")))

In [0]:
new_df = new_df.na.drop(subset=["review", "label"])
new_df = new_df.na.fill(value='noinfo',subset=["asin", "reviewerID"])

In [0]:
new_df.printSchema()

root
 |-- reviewID: integer (nullable = true)
 |-- overall: double (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- reviewerID: string (nullable = false)
 |-- asin: string (nullable = false)
 |-- reviewerName: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- review: string (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- review_len: integer (nullable = true)



In [0]:
# one-hot-encoding
from pyspark.ml.feature import *

asin_indexer = StringIndexer(inputCol='asin', outputCol='asinIndex', handleInvalid='keep')
asin_indexer_model = asin_indexer.fit(new_df)
new_df = asin_indexer_model.transform(new_df)

asin_encoder = OneHotEncoder(inputCol='asinIndex', outputCol='asinVec')
asin_encoder_model = asin_encoder.fit(new_df)
new_df = asin_encoder_model.transform(new_df)

id_indexer = StringIndexer(inputCol='reviewerID', outputCol='idIndex', handleInvalid='keep')
id_indexer_model = id_indexer.fit(new_df)
new_df = id_indexer_model.transform(new_df)

id_encoder = OneHotEncoder(inputCol='idIndex', outputCol='idVec')
id_encoder_model = id_encoder.fit(new_df)
new_df = id_encoder_model.transform(new_df)

In [0]:
from sparknlp.base import *
from sparknlp.annotator import *

document_assembler = DocumentAssembler().setInputCol("review").setOutputCol("document")
new_df = document_assembler.transform(new_df)

In [0]:
# get sentences from the documents
sentence = SentenceDetector().setInputCols("document").setOutputCol("sentence").setCustomBounds(["\n\n"])
new_df = sentence.transform(new_df)    

In [0]:
# convert sentences to array of tokens
tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
model_tokenizer = tokenizer.fit(new_df)
new_df = model_tokenizer.transform(new_df)

In [0]:
# clean tokens 
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized").setLowercase(True)
model_normalizer = normalizer.fit(new_df)
new_df = model_normalizer.transform(new_df)

In [0]:
# remove stopwords
stopwords_cleaner = StopWordsCleaner().setInputCols("normalized").setOutputCol("cleanTokens").setCaseSensitive(False) 
new_df = stopwords_cleaner.transform(new_df)

In [0]:
# lemmatization
lemmatizer = LemmatizerModel.pretrained().setInputCols(["cleanTokens"]).setOutputCol("lemma")
new_df = lemmatizer.transform(new_df)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ][ / ][OK!]


In [0]:
# Tag tokens with POS tags
pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'token']).setOutputCol('pos')
new_df = pos_tagger.transform(new_df)

pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[ | ][ / ][ — ][OK!]


In [0]:
# Extract meaningful n-grams
chunker = Chunker().setInputCols(['document', 'pos']).setOutputCol('chunks').setRegexParsers(["<NNP>+", "<NNS>+", "<JJ>+<NN>"])
new_df = chunker.transform(new_df)

In [0]:
# Convert custom document structure to array of tokens.
finisher = Finisher().setInputCols(["lemma", "chunks"]).setOutputCols(["token_features", "chunk_features"]).setOutputAsArray(True).setCleanAnnotations(False)
new_df = finisher.transform(new_df)

In [0]:
# create embeddings
embeddings =Word2VecModel.pretrained().setInputCols("token").setOutputCol("embeddings")
new_df = embeddings.transform(new_df)

word2vec_gigaword_300 download started this may take some time.
Approximate size to download 312.3 MB
[ | ][ / ][ — ][ \ ][OK!]


In [0]:
#combine word embeddings
embeddingsSentence = SentenceEmbeddings().setInputCols(["document", "embeddings"]).setOutputCol("sentence_embeddings").setPoolingStrategy("AVERAGE").setDimension(300)
new_df = embeddingsSentence.transform(new_df)

In [0]:
# convert embeddings into Vectors to add to Assembler

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import explode

toVectorUDF = udf(lambda vs: Vectors.dense(vs), VectorUDT())
new_df = new_df.select(['reviewID', 'overall', 'verified', 'reviewerID', 'asin', 'reviewerName', 'label', 'review', 'dayofweek', 'month', 'review_len', 'asinIndex', 'asinVec', 'idIndex', 'idVec', 'document', 'sentence', 'token', 'normalized', 'cleanTokens', 'lemma', 'pos', 'chunks', 'token_features', 'chunk_features', 'embeddings', explode("sentence_embeddings.embeddings").alias("sentence_embedding")]).withColumn("final_embeddings", toVectorUDF("sentence_embedding"))

In [0]:
# Generate Term Frequency
tf = HashingTF(inputCol="token_features", outputCol="rawFeatures")
new_df = tf.transform(new_df)

tf_chunk = HashingTF(inputCol="chunk_features", outputCol="rawFeaturesChunks")
new_df = tf_chunk.transform(new_df)

In [0]:
# Generate Inverse Document Frequency weighting
idf = IDF(inputCol="rawFeatures", outputCol="idfFeatures", minDocFreq=5)
model_idf = idf.fit(new_df)
new_df = model_idf.transform(new_df)

idf_chunk = IDF(inputCol="rawFeaturesChunks", outputCol="idfFeaturesChunks", minDocFreq=5)
model_idf_chunk = idf_chunk.fit(new_df)
new_df = model_idf_chunk.transform(new_df)

In [0]:
from pyspark.ml import Pipeline

# provide hint about the size of ebeddings to assembler
sizeHint = VectorSizeHint(inputCol="final_embeddings", size=300, handleInvalid="skip")

# Combine all features into one final "features" column
assembler = VectorAssembler(inputCols=["verified", "overall", "dayofweek", "month", "review_len", "asinVec", "idVec", "idfFeatures", "idfFeaturesChunks", "final_embeddings"], outputCol="features", handleInvalid='keep')

# combine above to get final features column
pipeline = Pipeline(stages=[sizeHint, assembler])

pipelineModel = pipeline.fit(new_df)
new_df = pipelineModel.transform(new_df)

In [0]:
# set seed for reproducibility
(trainingData, testingData) = new_df.randomSplit([0.8, 0.2], seed = 47)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testingData.count()))

Training Dataset Count: 2535
Test Dataset Count: 614


In [0]:
from pyspark.ml.classification import LogisticRegression

# More classification docs: https://spark.apache.org/docs/latest/ml-classification-regression.html

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

In [0]:
# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = lrModel.summary

print("Training Accuracy:  " + str(trainingSummary.accuracy))
print("Training Precision: " + str(trainingSummary.precisionByLabel))
print("Training Recall:    " + str(trainingSummary.recallByLabel))
print("Training FMeasure:  " + str(trainingSummary.fMeasureByLabel()))
print("Training AUC:       " + str(trainingSummary.areaUnderROC))

Training Accuracy:  0.9968391939944686
Training Precision: [0.9961704164672092, 1.0]
Training Recall:    [1.0, 0.9822222222222222]
Training FMeasure:  [0.9980815347721822, 0.9910313901345292]
Training AUC:       1.0


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = lrModel.transform(testingData)
predictions.show(5)

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
print('Test Area Under ROC', evaluator.evaluate(predictions))

+--------+-------+--------+--------------+----------+----------------+-----+--------------------+---------+-----+----------+---------+------------------+-------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|reviewID|overall|verified|    reviewerID|      asin|    reviewerName|label|              review|dayofweek|month|review_len|asinIndex|           asinVec|idIndex|              idVec|            document|            sentence|               token|          normalized|         cleanTokens|               lemma|                 pos|              chunks|      token_features|      chunk_featu

In [0]:
# Load in the tables
test_df = spark.sql("select * from default.reviews_test")
test_df.show(5)
print((test_df.count(), len(test_df.columns)))

+--------+-------+--------+-----------+--------------+----------+------------+--------------------+--------------------+--------------+
|reviewID|overall|verified| reviewTime|    reviewerID|      asin|reviewerName|          reviewText|             summary|unixReviewTime|
+--------+-------+--------+-----------+--------------+----------+------------+--------------------+--------------------+--------------+
|80000001|    4.0|   false|07 27, 2015|A1JGAP0185YJI6|0700026657|      travis|I played it a whi...|But in spite of t...|    1437955200|
|80000002|    5.0|    true| 03 3, 2014|A1WK5I4874S3O2|0700026657|  WhiteSkull|I bought this gam...|A very good game ...|    1393804800|
|80000003|    5.0|    true|01 12, 2013|A1YDQQJDRHM0FJ|0001713353|       Leila|I am very happy w...|One of our famili...|    1357948800|
|80000004|    5.0|    true|11 20, 2011|A2E6AHFDJ3JBAZ|0681795107|    robosolo|I purchased two o...|Insulated stainle...|    1321747200|
|80000005|    5.0|   false|06 28, 2011|A38NXTZUF

In [0]:
# Convert Unix timestamp to readable date
test_df_with_date = test_df.withColumn("reviewTime", to_date(from_unixtime(test_df.unixReviewTime))) \
                                                .drop("unixReviewTime")

# Combine review text and summary
new_test_df = test_df_with_date.withColumn("review",concat(col("reviewText"),col("summary"))).drop('reviewText').drop('summary')

new_test_df = new_test_df.withColumn('dayofweek', dayofweek(col('reviewTime'))).withColumn('month', month(col('reviewTime'))).drop('reviewTime')
display(new_test_df)

new_test_df = new_test_df.withColumn("review_len", length(col("review")))

reviewID,overall,verified,reviewerID,asin,reviewerName,review,dayofweek,month
80000001,4.0,False,A1JGAP0185YJI6,0700026657,travis,"I played it a while but it was alright. The steam was a bit of trouble. The more they move these game to steam the more of a hard time I have activating and playing a game. But in spite of that it was fun, I liked it. Now I am looking forward to anno 2205 I really want to play my way to the moon.But in spite of that it was fun, I liked it",2.0,7.0
80000002,5.0,True,A1WK5I4874S3O2,0700026657,WhiteSkull,"I bought this game thinking it would be pretty cool and that i might play it for a week or two and be done. Boy was I wrong! From the moment I finally got the gamed Fired up (the other commentors on this are right, it takes forever and u are forced to create an account) I watched as it booted up I could tell right off the bat that ALOT of thought went into making this game. If you have ever played Sim city, then this game is a must try as you will easily navigate thru it and its multi layers. I have been playing htis now for a month straight, and I am STILL discovering layers of complexity in the game. There are a few things in the game that could used tweaked, but all in all this is a 5 star game.A very good game balance of skill with depth of choices",2.0,3.0
80000003,5.0,True,A1YDQQJDRHM0FJ,0001713353,Leila,I am very happy with the book!!! It is one of my children's favorite books and I was so pleased I could order it here!!!One of our families favorite books!!!,7.0,1.0
80000004,5.0,True,A2E6AHFDJ3JBAZ,0681795107,robosolo,"I purchased two of these cups for camping but I like them so much I use one daily. Unlike other thermos-type cups, this one actually keeps the beverage hot for quite some time, especially if you place the cap on it. The cap itself is really unique in the way it works. After filling with a hot liquid the cap fits rather loosely in place. But, after a few seconds heat expands it and it wedges itself in place very securely. You can still pry it off but not easily when it's hot which is just what you'd want. Also, that cap has a neat little sliding anti-splash tab on the top that actually works. The cup is wide enough to be stable sitting on a flat surface - with or without liquid in it - yet fits easily into a car tray cup holder for travel. After using the cup for a few days I decided that I didn't want the nylon handle. I simply used a heat gun (on low) to gently heat it up and just pulled it down and off of the cup. That handle is secured to the cup with some glue and a slight ridge running around its inner circumference. It was easy to clean off the remaining glue from the stainless steel and I could - if I wanted to - re-glue and reattach the handle. This is a great insulated stainless steel cup with or without a handle and well worth the price.Insulated stainless steel cup that actually works",1.0,11.0
80000005,5.0,False,A38NXTZUFB1O2K,0700099867,FiSH,"I'm not quite finished with the game's DiRT Tour mode, but I believe I've experienced the bulk of what the game has to offer. And I'm happy to say that the game is indeed awesome. Great cars, great tracks and racing modes, excellent gameplay and graphics. The highlight has been racing in the snow on various tracks in Aspen. There are also some really cool obstacle courses on pavement, which are new to the series. At first, I felt that the game was plagued with the same difficulty spike DiRT 2 had, where you can win any race without trying on casual difficulty, then lose big time on medium. Then I found the custom difficulty settings where you can choose from 5 levels of challenge, then pick and choose from all the other assists and handling options. Basically, you can let the game play itself by just holding the throttle and steering, or take gradual steps to reach what you feel works best for you. Love this feature! Sadly, there is no option to increase the 360 controller's dead zone or adjust the sensitivity. I'm doing ok as is, but some tweaking would be ideal. Much like the previous two games, the handling is a bit flighty. But that's speeding on loose ground for you. Still, I feel things have tightened up somewhat since DiRT 2. Still no cigar, but it's a step forward. The game runs as smooth as silk on my 560ti wih all the settings maxed. No issues whatsoever. If you enjoy racing games, this is not one you should ignore. It's the best DiRT so far.Best in the series!",3.0,6.0
80000006,1.0,True,A1INA0F5CWW3J4,0700099867,Amazon Shopper,1st shipment received a book instead of the game. 2nd shipment got a FAKE one. Game arrived with a wrong key inside on sealed box. I got in contact with codemasters and send them pictures of the DVD and the content. They said nothing they can do its a fake DVD. Returned it good bye.!Wrong key,7.0,6.0
80000007,2.0,True,A1BHRNLW2L8KLD,0700099867,T. Hanson,"The game itself is great, but Games for Windows Live is such a frustrating piece of garbage. I wish it would just die and stop bringing down my PC gaming experience.Games for Windows Live...",5.0,7.0
80000008,4.0,False,A3J9C2WMW0TZYB,0700099867,R. Lew,"I really enjoyed the Dirt series of racing games. I actually started with the Colin McRae Rally 3 demo, which came before Dirt. Dirt 1 was a nice game with lots of different types of races in varied environments, including Pike's Peak. You got to race semi trucks, cargo trucks, trophy trucks, SUVs, and rally cars in Dirt 1. The advancement was like a pyramid, unlocking levels as you go. I have a Toyota Tundra, so it is easy to like a game that has a Toyota Tundra to race in it. Dirt 2 made it more radical with nice music and an attitude. The races were longer and became more X-Games like. I enjoyed the Baja races very much. But, I did not like the Microsoft Live part of it where it takes a long time to get initiated, patched, etc before you can actually play for the first time. Once you get past this, things get better. The choice of vehicles were more limited, but the liveries try to make up for it. Dirt 3 keeps the same attitude and cool music, but also has the same Microsoft Live treatment: sign in, patch , etc. before the first game time. Once you get past this, it does get better. At first, I could not get used to the way the game downshifts automatically in the turns to automatically slow you down. I moved the difficulty up a notch to intermediate and it was much better and more of what I was expecting. I use a gamepad to play on the PC. I was pleasantly surprised that my 9800 GTX 512 MB plays this game rather well at 1680x1050 (no AA). The races are largely rally with some interesting winter-time truck races in the snow. The cars are not quite as varied as I would like. I also hate that there is in-game content that can be purchased additionally from Microsoft Live. Come on! I already paid for the game and there are some levels that are pay to play, so you won't really finish the season until you pony up more funds to Microsoft Live. Many of the cars are available to purchase too. That is cheesy and not appreciated. You don't need to purchase these to advance though. The gymkhana events are interesting and challenging. Being a hoonigan takes practice, patience, and many banged up cars. The Ken Block videos on youtube really inspired me to practice as I had no idea what gymkhana was before Dirt 3. I do miss the Baja style of offroading from Dirt 2. Dirt 3 seems to have a focus on other things, but it is still fun overall. For $12, I got my money's worth. Maybe I will get Dirt Showdown for some demolition derby excitement, or maybe not.Dirt, Dirt 2, and Dirt 3",5.0,12.0
80000009,5.0,True,A1FZV5FWLJKYU7,1581174292,L. Stephens,"I originally only knew this story/song in French! Now my baby can also learn it in English. She's only 3 months, but she enjoys us reading it in a sing-song way and doing sound effects for the animals. It's short enough to keep her attention and let us finish this one. Very cute!Fun for my baby girl",6.0,5.0
80000010,4.0,False,A20DRRKAN5Z9Q,0700099867,Moviedude,"In today's game market it's easy to miss racing games that aren't part of the license-heavy Forza or Gran Turismo franchises. Dirt 3 for PC is in an even worse boat than most since it's been a giveaway for a couple years now with every DX11 compliant ATI video card. As a result a lot of people who get it send it straight to eBay and most people who do play it have not bought it new. This is a shame because Dirt 3 is a fun little game. There's a great assortment of multinational tracks, and a wide array of classic and modern rally cars to choose from. As a rally game, Dirt 3 ditches the comfortable confines of tracks and sends you headlong across snow and gravel in a world where keeping control of your car is a greater skill than making it around the turns. Dirt 3 wears a veneer of noisy, fun arcade-style game play, but under the hood it calls for quick reactions, a good understanding of road surfaces, and nerves of steel. Sadly, it also calls for a gamepad or wheel- you will almost certainly find the keyboard too all-or-nothing for serious racing. PROS: -Visually stunning engine that makes full use of DirectX 11 effects -Soundtrack is an appealing mix of classic prog rock and modern music that does a good job of being vaguely European and not too intrusive -Successfully models an arcade-like complexity of play with a surprisingly deep simulation of track conditions and vehicle handling -In-car camera gives a stomach-churning stream of nonstop thrills as you launch over crests and 2-wheel hairpins -Probably the most realistic depiction of drifting/powersliding to appear in a video game (it's frightening, nauseating, and usually a bad idea) -3D support! CONS: -Lower difficulties aren't much fun, highest difficulty is brutally hard -Small fanbase of hardcore racing nuts may make multiplayer a bit hard for newcomers (Dirt 3 players seem pretty nice though) -Essentially requires some sort of analog input -May confuse players who aren't familiar with the rules and conditions of rally-style racing Other thoughts: -You need to plug your youtube credentials into the game to let it export your highlights to YouTube. I don't think CodeMasters wants to rip you off, but if it makes you uncomfortable, create a new Google identity for use with Dirt 3. Easy! -LEARN THE TRACKS. Learn the tracks, learn the tracks, learn the tracks. -If you want to win, employ a Fin!An overlooked gem in the Forza/GT treasure trove",2.0,3.0


In [0]:
new_test_df = asin_indexer_model.transform(new_test_df)
new_test_df = asin_encoder_model.transform(new_test_df)

new_test_df = id_indexer_model.transform(new_test_df)
new_test_df = id_encoder_model.transform(new_test_df)


new_test_df = document_assembler.transform(new_test_df)
new_test_df = sentence.transform(new_test_df) 
new_test_df = model_tokenizer.transform(new_test_df)
new_test_df = model_normalizer.transform(new_test_df)
new_test_df = stopwords_cleaner.transform(new_test_df)
new_test_df = lemmatizer.transform(new_test_df)
new_test_df = pos_tagger.transform(new_test_df)
new_test_df = chunker.transform(new_test_df)
new_test_df = finisher.transform(new_test_df)

new_test_df = embeddings.transform(new_test_df)
new_test_df = embeddingsSentence.transform(new_test_df)


In [0]:
toVectorUDF = udf(lambda vs: Vectors.dense(vs), VectorUDT())
new_test_df = new_test_df.select(['reviewID', 'overall', 'verified', 'reviewerID', 'asin', 'reviewerName', 'review', 'dayofweek', 'month', 'review_len', 'asinIndex', 'asinVec', 'idIndex', 'idVec', 'document', 'sentence', 'token', 'normalized', 'cleanTokens', 'lemma', 'pos', 'chunks', 'token_features', 'chunk_features', 'embeddings', explode("sentence_embeddings.embeddings").alias("sentence_embedding")]).withColumn("final_embeddings", toVectorUDF("sentence_embedding"))

In [0]:
new_test_df = tf.transform(new_test_df)
new_test_df = tf_chunk.transform(new_test_df)

new_test_df = model_idf.transform(new_test_df)
new_test_df = model_idf_chunk.transform(new_test_df)

In [0]:
new_test_df = pipelineModel.transform(new_test_df)

In [0]:
predictions = lrModel.transform(new_test_df)

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

probelement=udf(lambda v:float(v[1]),FloatType())
submission_data = predictions.select('reviewID', probelement('probability')).withColumnRenamed('<lambda>(probability)', 'label')

In [0]:
display(submission_data.select('reviewID', 'label'))

reviewID,label
80000001,0.059105594
80000002,0.1383934
80000003,0.047651574
80000004,0.39824563
80000005,0.73274475
80000006,0.15521604
80000007,0.05318258
80000008,0.37112603
80000009,0.28724226
80000010,0.72233564
