In [0]:
# Read the whole training table and mark it for caching, because the cells
# below derive several columns from it.
# (For faster iteration on a subset, sample first: df.sample(False, 0.3, seed=47).)
df = spark.sql("select * from default.reviews_train").cache()
# df.show(5)  # uncomment for a quick look at the rows

In [0]:
# Convert Unix timestamp to readable date
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Data cleaning, applied as one chained transformation:
#   1. keep a single row per (reviewerID, asin) pair,
#   2. replace the epoch column `unixReviewTime` with a `reviewTime` date,
#   3. cast `verified` to an integer column.
df = (
    df.dropDuplicates(['reviewerID', 'asin'])
    .withColumn("reviewTime", to_date(from_unixtime(col("unixReviewTime"))))
    .drop("unixReviewTime")
    .withColumn("verified", col("verified").cast("int"))
)

In [0]:
import pyspark.sql.functions as f
# Feature engineering: length of the review text and age of the review in days.
# NOTE(review): `days` is measured against current_date(), so its values depend
# on the day the notebook is executed; the test cell must be run on the same
# day for the two frames to agree.
df = (
    df.withColumn("len", f.length("reviewText"))
    .withColumn('days', datediff(current_date(), col("reviewTime")))
)

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml.classification import LogisticRegression

# ---------------------------------------------------------------------------
# Categorical / numeric features
# ---------------------------------------------------------------------------
# Map the raw `asin` and `overall` values to numeric indices.
indexer = StringIndexer(
    inputCols=["asin", "overall"],
    outputCols=["asinIndex", "overallIndex"],
    handleInvalid='keep',
)
# One-hot encode the two indexed columns together with `days`, `len` and
# `verified`.
encoder = OneHotEncoder(
    inputCols=["asinIndex", "overallIndex", "days", "len", "verified"],
    outputCols=["asinVec", "overallVec", "daysVec", "lenVec", "verifiedVec"],
    handleInvalid='keep',
    dropLast=True,
)

# ---------------------------------------------------------------------------
# Text branch 1: `reviewText`
# ---------------------------------------------------------------------------
# Wrap the raw text column in a Spark NLP document annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("reviewText")
    .setOutputCol("document")
)
# Sentence detector.
sentence = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
# Split sentences into lower-cased tokens on runs of whitespace.
tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .setToLowercase(True)
    .setPattern("\\s+")
)
regexTokenizer = tokenizer  # second name bound to the same stage, as in the original cell
# Clean tokens.
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized")
# Remove stopwords (case-insensitive match).
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols("normalized")
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)
# Lemmatize tokens with the default pretrained model (the cell output shows
# `lemma_antbnc` being downloaded).
lemmatizer = LemmatizerModel.pretrained().setInputCols(["cleanTokens"]).setOutputCol("lemma")
# Turn the annotation structure back into a plain array of tokens; the
# annotation columns are kept (cleanAnnotations=False).
finisher = (
    Finisher()
    .setInputCols(["lemma"])
    .setOutputCols(["token_features"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)
# TF-IDF over the lemmatized review tokens.
tf = CountVectorizer(
    inputCol="token_features",
    outputCol="rawFeatures",
    vocabSize=10000,
    minTF=1,
    minDF=50,
    maxDF=0.40,
)
idf = IDF(inputCol="rawFeatures", outputCol="idf")

# ---------------------------------------------------------------------------
# Text branch 2: `summary`
# Same sequence of stages as branch 1, except that it uses the default
# Tokenizer and lower-cases in the Normalizer instead of in the tokenizer.
# ---------------------------------------------------------------------------
document_assembler2 = (
    DocumentAssembler()
    .setInputCol("summary")
    .setOutputCol("document2")
)
sentence2 = SentenceDetector().setInputCols(["document2"]).setOutputCol("sentence2")
tokenizer2 = Tokenizer().setInputCols(["sentence2"]).setOutputCol("token2")
normalizer2 = (
    Normalizer()
    .setInputCols(["token2"])
    .setOutputCol("normalized2")
    .setLowercase(True)
)
stopwords_cleaner2 = (
    StopWordsCleaner()
    .setInputCols("normalized2")
    .setOutputCol("cleanTokens2")
    .setCaseSensitive(False)
)
lemmatizer2 = LemmatizerModel.pretrained().setInputCols(["cleanTokens2"]).setOutputCol("lemma2")
finisher2 = (
    Finisher()
    .setInputCols(["lemma2"])
    .setOutputCols(["token_features2"])
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)
tf2 = CountVectorizer(
    inputCol="token_features2",
    outputCol="rawFeatures2",
    vocabSize=10000,
    minTF=1,
    minDF=50,
    maxDF=0.40,
)
idf2 = IDF(inputCol="rawFeatures2", outputCol="idf2")

# ---------------------------------------------------------------------------
# Assemble and model
# ---------------------------------------------------------------------------
# Concatenate every encoded column and both TF-IDF vectors into "features".
assembler = VectorAssembler(
    inputCols=['overallVec', 'verifiedVec', 'asinVec', 'daysVec', 'lenVec', "idf", "idf2"],
    outputCol="features",
    handleInvalid='keep',
)

# Logistic regression; elasticNetParam=0.0 selects a pure L2 penalty.
lr = LogisticRegression(maxIter=300, regParam=0.01, elasticNetParam=0.0)

# Stage order matters: each stage reads columns produced by earlier ones.
pipeline = Pipeline(stages=[
    # categorical / numeric encoders
    indexer, encoder,
    # reviewText branch
    document_assembler, sentence, tokenizer, normalizer,
    stopwords_cleaner, lemmatizer, finisher, tf, idf,
    # summary branch
    document_assembler2, sentence2, tokenizer2, normalizer2,
    stopwords_cleaner2, lemmatizer2, finisher2, tf2, idf2,
    # final feature vector and classifier
    assembler, lr,
])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ][OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ][OK!]


In [0]:
# # set seed for reproducibility
# (trainingData, testingData) = df.randomSplit([0.9, 0.1], seed = 47)
# # print("Training Dataset Count: " + str(trainingData.count()))
# # print("Test Dataset Count: " + str(testingData.count()))

In [0]:
# Fit the pipeline to training documents.
# All stages are fitted on the full cleaned training frame `df`; the
# randomSplit cell above is commented out, so no rows are held out here.
# The fitted model is reused further down to score the test table.
pipelineFit = pipeline.fit(df)

In [0]:
# from pyspark.ml.evaluation import BinaryClassificationEvaluator

# predictions = pipelineFit.transform(testingData)
# evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
# print('Test Area Under ROC', evaluator.evaluate(predictions))

In [0]:
# Load the test table (the rows that will be scored by the fitted pipeline).
test_df = spark.sql("select * from default.reviews_test")
# Optional sanity checks: peek at a few rows / print (row count, column count).
#test_df.show(5)
#print((test_df.count(), len(test_df.columns)))

In [0]:
# Give the test frame the same derived columns as the training frame:
# `reviewTime` date (replacing `unixReviewTime`), integer `verified`,
# `len` and `days`. Unlike the training frame, no de-duplication is applied.
test_df = (
    test_df
    # Data cleaning
    .withColumn("reviewTime", to_date(from_unixtime(col("unixReviewTime"))))
    .drop("unixReviewTime")
    .withColumn("verified", col("verified").cast("int"))
    # Feature engineering
    .withColumn("len", f.length("reviewText"))
    .withColumn('days', datediff(current_date(), col("reviewTime")))
)

In [0]:
# Run the fitted pipeline over the prepared test frame. The resulting frame
# carries the `probability` column that the next cell reads.
predictions = pipelineFit.transform(test_df)

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# UDF that extracts element 1 of the `probability` vector as a float.
# NOTE(review): element 1 is presumably the probability of the positive class
# (labels 0/1) -- confirm against the training labels.
probelement = udf(lambda v: float(v[1]), FloatType())

# Name the output column explicitly with alias(). The previous version renamed
# the auto-generated column '<lambda>(probability)'; withColumnRenamed is a
# silent no-op when that name does not match, which would leave the frame
# without a `label` column and break the select in the next cell.
submission_data = predictions.select(
    'reviewID',
    probelement('probability').alias('label'),
)

In [0]:
# Show the submission rows: one `reviewID` with its predicted `label` score.
display(submission_data.select('reviewID', 'label'))

reviewID,label
80000001,0.010320797
80000002,0.0502033
80000003,0.01974466
80000004,0.17819633
80000005,0.74771786
80000006,0.41138318
80000007,0.12156868
80000008,0.16151527
80000009,0.09245284
80000010,0.8386177
