In [0]:
# Load in one of the tables
df = spark.sql("select * from default.reviews_train")
df = df.sample(False, 0.3, seed=47)
df = df.cache()

In [0]:
df.printSchema()

root
 |-- reviewID: integer (nullable = true)
 |-- overall: double (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: integer (nullable = true)
 |-- label: integer (nullable = true)



In [0]:
# Convert Unix timestamp to readable date
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Data Cleaning
df = df.dropDuplicates(['reviewerID', 'asin'])
df = df.withColumn("reviewTime", to_date(from_unixtime(df.unixReviewTime))).drop("unixReviewTime")
df = df.withColumn("verified", df.verified.cast("int"))

In [0]:
import pyspark.sql.functions as f
# Feature Engineering 
df = df.withColumn("len", f.length("reviewText"))
df = df.withColumn('days', datediff(current_date(),col("reviewTime")))

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from sparknlp.base import *
from sparknlp.annotator import *

# string to index
indexer = StringIndexer(inputCols=["asin", "overall", "reviewerID"], outputCols=["asinIndex", "overallIndex", "reviewerIDIndex"], handleInvalid='keep')
#encoding
encoder = OneHotEncoder(inputCols=["asinIndex", "reviewerIDIndex"], outputCols=["asinVec", "reviewerIDVec"], handleInvalid='keep', dropLast=True)
# convert text column to spark nlp document
document_assembler = DocumentAssembler().setInputCol("reviewText").setOutputCol("document")
document_assembler2 = DocumentAssembler().setInputCol("summary").setOutputCol("document2")
# convert document to array of tokens
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
tokenizer2 = Tokenizer().setInputCols(["document2"]).setOutputCol("token2")
# clean tokens 
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized").setLowercase(True)
normalizer2 = Normalizer().setInputCols(["token2"]).setOutputCol("normalized2").setLowercase(True)
# remove stopwords
stopwords_cleaner = StopWordsCleaner().setInputCols("normalized").setOutputCol("cleanTokens").setCaseSensitive(False)
stopwords_cleaner2 = StopWordsCleaner().setInputCols("normalized2").setOutputCol("cleanTokens2").setCaseSensitive(False)
# stems tokens to bring it to root form
lemmatizer = LemmatizerModel.pretrained().setInputCols(["cleanTokens"]).setOutputCol("lemma")
lemmatizer2 = LemmatizerModel.pretrained().setInputCols(["cleanTokens2"]).setOutputCol("lemma2")
# Convert custom document structure to array of tokens.
finisher = Finisher() \
    .setInputCols(["lemma", "lemma2"]) \
    .setOutputCols(["token_features", "token_features2"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

# tfidf
tf = CountVectorizer(inputCol="token_features", outputCol="rawFeatures", vocabSize=10000, minTF=1, minDF=50, maxDF=0.40)
idf = IDF(inputCol="rawFeatures", outputCol="idf")
tf2 = CountVectorizer(inputCol="token_features2", outputCol="rawFeatures2", vocabSize=10000, minTF=1, minDF=50, maxDF=0.40)
idf2 = IDF(inputCol="rawFeatures2", outputCol="idf2")

# Combine all features into one final "features" column
assembler = VectorAssembler(inputCols=['overallIndex', 'asinVec', 'reviewerIDVec', 'verified', 'days', 'len', "idf", "idf2"], outputCol="features", handleInvalid='keep')

# Model
#lr = LogisticRegression(maxIter=50, regParam=0.3, elasticNetParam=0.0)

# pipeline
pipeline = Pipeline(stages=[
    indexer, 
    encoder, 
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    lemmatizer,
    document_assembler2,
    tokenizer2,
    normalizer2,
    stopwords_cleaner2,
    lemmatizer2,
    finisher,
    tf,
    idf,
    tf2,
    idf2,
    assembler])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ][OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ][OK!]


In [0]:
# set seed for reproducibility
(trainingData, testingData) = df.randomSplit([0.8, 0.2], seed = 47)
# print("Training Dataset Count: " + str(trainingData.count()))
# print("Test Dataset Count: " + str(testingData.count()))

In [0]:
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(df)
trainingDataTransformed = pipelineFit.transform(trainingData)

In [0]:
trainingDataTransformed.printSchema()

root
 |-- reviewID: integer (nullable = true)
 |-- overall: double (nullable = true)
 |-- verified: integer (nullable = true)
 |-- reviewTime: date (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- len: integer (nullable = true)
 |-- days: integer (nullable = true)
 |-- asinIndex: double (nullable = false)
 |-- overallIndex: double (nullable = false)
 |-- reviewerIDIndex: double (nullable = false)
 |-- asinVec: vector (nullable = true)
 |-- reviewerIDVec: vector (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- 

In [0]:
trainingDataTransformed = trainingDataTransformed.select(['reviewerIDVec', 'overallIndex', 'asinVec', 'verified', 'days', 'len', 'token_features', 'idf', 'idf2', 'features', 'label'])


In [0]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol = 'label', featuresCol = 'features')
rfModel = rf.fit(trainingDataTransformed)

In [0]:
def ExtractFeatureImportance(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [0]:
import pandas as pd

ExtractFeatureImportance(rfModel.featureImportances, trainingDataTransformed, "features").head(10)



Unnamed: 0,idx,name,vals,score
2,623377,len,,0.049395
42,623417,idf_39,,0.037504
148,623523,idf_145,,0.035731
59,623434,idf_56,,0.028796
56,623431,idf_53,,0.027932
23,623398,idf_20,,0.026205
163,623538,idf_160,,0.026181
95,623470,idf_92,,0.024461
41,623416,idf_38,,0.023528
285,623660,idf_282,,0.022428


In [0]:
# Create a map between eachtoken and its index
from pyspark.sql.functions import explode, udf, col
from pyspark.sql.types import *

make_list_udf = udf(lambda col: [col], ArrayType(StringType()))
remove_list_udf = udf(lambda col: col[0], StringType())

def get_index(col):
    if len(col.indices) == 0:
        return -1   # Mark the ngram's index as -1 if it is not the top 2^12 ngrams
    else:
        return int(col.indices[0])
    
get_index_udf = udf(get_index, IntegerType())

token_index = trainingDataTransformed.select(explode(trainingDataTransformed.token_features).alias("token_features")).distinct() \
                             .withColumn("token_features", make_list_udf("token_features"))

tff = CountVectorizer(inputCol="token_features", outputCol="rawF", vocabSize=10000, minTF=1, minDF=50, maxDF=0.40)

trans = tff.fit(trainingDataTransformed)
token_index = trans.transform(token_index)

token_index = token_index.withColumn("token_features", remove_list_udf("token_features")) \
                         .withColumn("index", get_index_udf("rawF")) \
                         .select("token_features", "index")

In [0]:
token_index = trainingDataTransformed.select(explode(trainingDataTransformed.token_features).alias("token_features")).distinct() \
                             .withColumn("token_features", make_list_udf("token_features"))

tff = CountVectorizer(inputCol="token_features", outputCol="rawF", vocabSize=10000, minTF=1, minDF=50, maxDF=0.40)

trans = tff.fit(trainingDataTransformed)
token_index = trans.transform(token_index)

token_index = token_index.withColumn("token_features", remove_list_udf("token_features")) \
                         .withColumn("index", get_index_udf("rawF")) \
                         .select("token_features", "index")

In [0]:
display(token_index.where(token_index.index == 337))

token_features,index
create,337


In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingDataTransformed)

In [0]:
def ExtractFeatureCoeficient(model, dataset, excludedCols = None):
    test = model.transform(dataset)
    weights = model.coefficients
    print('This is model weights: \n', weights)
    weights = [(float(w),) for w in weights]  # convert numpy type to float, and to tuple
    if excludedCols == None:
        feature_col = [f for f in test.schema.names if f not in ['y', 'classWeights', 'features', 'label', 'rawPrediction', 'probability', 'prediction']]
    else:
        feature_col = [f for f in test.schema.names if f not in excludedCols]
    if len(weights) == len(feature_col):
        weightsDF = sqlContext.createDataFrame(zip(weights, feature_col), schema= ["Coeficients", "FeatureName"])
    else:
        print('Coeficients are not matching with remaining Fetures in the model, please check field lists with model.transform(dataset).schema.names')
        weightsDF = sqlContext.createDataFrame(zip(weights, feature_col), schema= ["Coeficients", "FeatureName"])
    
    return weightsDF

In [0]:
results = ExtractFeatureCoeficient(lrModel, trainingDataTransformed, )

#results.show()

This is model weights: 
 [0.06216751647744364,-0.15019692256312153,-0.10734383779652808,-0.1309436593216342,-0.07815847636581028,0.2904329734027064,-0.13108224344847907,-0.17079565723595572,-0.12485187544118323,-0.11122586419105719,-0.13996059755554943,0.16559461048611665,-0.020076976404119193,-0.07900564012267201,-0.1714012273452696,-0.029361379174728707,-0.11772677650638752,-0.12540202426549657,-0.09564482243466162,-0.13702123814166967,-0.08149489307395034,-0.12948201837925524,-0.1596977724697624,0.14892750081747408,0.004093574759687719,-0.14049750777172418,0.0847682068209239,-0.0276406567287083,-0.09320048171500982,-0.11104403185016166,-0.4049099927996285,-0.14422594313615106,-0.15663665808727728,-0.04730808201665858,-0.08458191426359257,-0.09884259381541985,0.01772985700304731,-0.1096453997922272,-0.032915598062815146,-0.017051547516495416,-0.13177487377672048,-0.12816831708908072,-0.10484058405333181,-0.1479770585099694,-0.11355770410391641,-0.11206918640832113,-0.1133143556886473

In [0]:
results.sort(['Coeficients'],
               ascending = False).show()

+--------------------+--------------+
|         Coeficients|   FeatureName|
+--------------------+--------------+
|{0.2904329734027064}|           len|
|{0.06216751647744...| reviewerIDVec|
|{-0.0781584763658...|          days|
|{-0.1073438377965...|       asinVec|
|{-0.1248518754411...|          idf2|
|{-0.1309436593216...|      verified|
|{-0.1310822434484...|token_features|
|{-0.1501969225631...|  overallIndex|
|{-0.1707956572359...|           idf|
+--------------------+--------------+

