<a href="https://colab.research.google.com/github/Komal-londhe/BigDataProcessing_GooglePlayStore/blob/main/603_SparkMLlib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Machine Learning - SparkMLlib - to Predict the Ratings

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

In [None]:
spark = SparkSession.builder.master("local").appName("GooglePlayStoreAnalysis").getOrCreate()

In [None]:
GoogleDf = spark.read.csv("/FileStore/tables/GooglePlayStore.csv", header="true", inferSchema="true")

In [None]:
from pyspark.sql.functions import col
MLdata = GoogleDf.select('App Name','Rating','Rating Count').where((col("Rating") == 1.0) | (col("Rating") == 2.0) | (col("Rating") == 3.0) | (col("Rating") == 4.0) | (col("Rating") == 5.0))
MLdata.show()

+----------------------------------+------+------------+
|                          App Name|Rating|Rating Count|
+----------------------------------+------+------------+
|               Ampere Battery Info|   4.0|        64.0|
|              Smart City Trichy...|   5.0|         5.0|
|              unlimited 4G data...|   4.0|        12.0|
|              The Everyday Cale...|   2.0|        39.0|
|              Neon 3d Iron Tech...|   5.0|       820.0|
|                   Dodge The Cars!|   5.0|        55.0|
|桃園機場捷運時刻表 - 捷運轉乘路...|   4.0|       118.0|
|                 Caliway Conductor|   4.0|      1572.0|
|              Readymade Grocery...|   4.0|        16.0|
|              OTENTIK Discovery FR|   3.0|         5.0|
|              All in one shoppi...|   5.0|         6.0|
|                    REDMOND  Robot|   4.0|       328.0|
|              Block Fill: Puzzl...|   4.0|       211.0|
|              Coloring Book Bar...|   4.0|       736.0|
|              Random Number Gen...|   4.0|  

In [None]:
# now let's see how many categorical and numerical features we have:

cat_cols = [item[0] for item in MLdata.dtypes if item[1].startswith('string')] 
print(str(len(cat_cols)) + '  categorical features')

num_cols = [item[0] for item in MLdata.dtypes if item[1].startswith('int') | item[1].startswith('double')][1:]
print(str(len(num_cols)) + '  numerical features')

3  categorical features
0  numerical features


In [None]:
ML_data = spark.createDataFrame(MLdata.take(1000))
ML_data.show()

+----------------------------------+------+------------+
|                          App Name|Rating|Rating Count|
+----------------------------------+------+------------+
|               Ampere Battery Info|   4.0|        64.0|
|              Smart City Trichy...|   5.0|         5.0|
|              unlimited 4G data...|   4.0|        12.0|
|              The Everyday Cale...|   2.0|        39.0|
|              Neon 3d Iron Tech...|   5.0|       820.0|
|                   Dodge The Cars!|   5.0|        55.0|
|桃園機場捷運時刻表 - 捷運轉乘路...|   4.0|       118.0|
|                 Caliway Conductor|   4.0|      1572.0|
|              Readymade Grocery...|   4.0|        16.0|
|              OTENTIK Discovery FR|   3.0|         5.0|
|              All in one shoppi...|   5.0|         6.0|
|                    REDMOND  Robot|   4.0|       328.0|
|              Block Fill: Puzzl...|   4.0|       211.0|
|              Coloring Book Bar...|   4.0|       736.0|
|              Random Number Gen...|   4.0|  

In [None]:
ML_data.count()

Out[7]: 1000

In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
categoricalColumns = ['App Name', 'Rating', 'Rating Count']
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]
label_stringIdx = StringIndexer(inputCol = 'Rating', outputCol = 'label')
stages += [label_stringIdx]
assemblerInputs = [c + "classVec" for c in categoricalColumns]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [None]:
# we use a pipeline to apply all the stages of tranformation to the data
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = stages)
cols = ML_data.columns
pipelineModel = pipeline.fit(ML_data)
df = pipelineModel.transform(ML_data)
selectedCols = ['label', 'features'] + cols
df = df.select(selectedCols)
df.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- App Name: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Rating Count: string (nullable = true)



In [None]:
train, test = df.randomSplit([0.8, 0.2], seed = 2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 795
Test Dataset Count: 205


#### Logistic Regression Model

In [None]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train)

In [None]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

trainingSummary = lrModel.summary

Coefficients: 
DenseMatrix([[-3.39399601e+00, -1.45999284e+00, -2.05061523e+00, ...,
               0.00000000e+00, -1.45999284e+00,  0.00000000e+00],
             [-3.14239581e+00, -7.54010799e-01, -1.19111310e+00, ...,
               0.00000000e+00, -7.54010799e-01,  0.00000000e+00],
             [-1.68350096e+00,  2.20517428e+00,  3.38771868e+00, ...,
               0.00000000e+00,  2.20517428e+00,  0.00000000e+00],
             [-9.65041885e-01,  2.67073459e-03, -1.22021716e-01, ...,
               0.00000000e+00,  2.67073459e-03,  0.00000000e+00],
             [ 9.18493466e+00,  6.15861883e-03, -2.39686345e-02, ...,
               0.00000000e+00,  6.15861883e-03,  0.00000000e+00]])
Intercept: [2.474377519199906,1.6271717600640112,0.11666501595924342,-1.2009555736635669,-3.0172587215595934]


In [None]:
predictions = lrModel.transform(test)
predictions.select('App Name', 'Rating', 'label','prediction', 'probability').show(10)

+--------------------+------+-----+----------+--------------------+
|            App Name|Rating|label|prediction|         probability|
+--------------------+------+-----+----------+--------------------+
|ACE Auto Club Europa|   4.0|  0.0|       0.0|[0.99515773994608...|
|Cute Theme Fluffy...|   4.0|  0.0|       0.0|[0.99515773994608...|
|Female Red Jungle...|   4.0|  0.0|       0.0|[0.99342216569064...|
|   Festival Art Rock|   4.0|  0.0|       0.0|[0.97804265777917...|
|       GPS Installer|   4.0|  0.0|       0.0|[0.99552908225739...|
|Internet Blocker:...|   4.0|  0.0|       0.0|[0.94949051321979...|
|       Mirror Mirror|   4.0|  0.0|       0.0|[0.99530734432755...|
|Peribahasa Bahasa...|   4.0|  0.0|       0.0|[0.99711665264411...|
|       Photo Collage|   4.0|  0.0|       0.0|[0.99515773994608...|
|   Pocket Bowling 3D|   4.0|  0.0|       0.0|[0.99515773994608...|
+--------------------+------+-----+----------+--------------------+
only showing top 10 rows



In [None]:
predictions.show()

+-----+--------------------+--------------------+------+------------+--------------------+--------------------+----------+
|label|            features|            App Name|Rating|Rating Count|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+------+------------+--------------------+--------------------+----------+
|  0.0|(1360,[14,999,117...|ACE Auto Club Europa|   4.0|      1592.0|[5.68682413714466...|[0.99515773994608...|       0.0|
|  0.0|(1360,[178,999,13...|Cute Theme Fluffy...|   4.0|       473.0|[5.68682413714466...|[0.99515773994608...|       0.0|
|  0.0|(1360,[263,999,10...|Female Red Jungle...|   4.0|        36.0|[5.61654699694366...|[0.99342216569064...|       0.0|
|  0.0|(1360,[264,999,10...|   Festival Art Rock|   4.0|        35.0|[4.73457426517428...|[0.97804265777917...|       0.0|
|  0.0|(1360,[298,999,10...|       GPS Installer|   4.0|         8.0|[5.77672638528512...|[0.99552908225739...|       0.0|
|  0.0|(1360,[36

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))


Test Error = 0.00487805 
