In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg,when
import hashlib
from pyspark.sql.functions import udf

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm 

from pyspark.sql.types import *
from pyspark.sql import Row

from pyspark.ml.feature import StringIndexer

### Tokenizers
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover
## convert to word v review matrix 
from pyspark.ml.feature import HashingTF, IDF
##Naive Bayes
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MultilabelMetrics,MulticlassMetrics

In [2]:
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("best_one").\
        getOrCreate()

In [9]:
# загрузим данные и сразу сделаем разделение на выборки

review = spark.createDataFrame(pd.read_json('data/reviews.json'))

(review,spill) = review.randomSplit([0.40,0.60])

review = review.select('user_id',
                       'stars',
                       'business_id',
                       'text' )

review.show(5)

+--------------------+-----+--------------------+--------------------+
|             user_id|stars|         business_id|                text|
+--------------------+-----+--------------------+--------------------+
|7jc9f2Nn2S--5b-G5...|    4|_jYt69Zx1SUo_V9z0...|If I could give 3...|
|byro3oSQQ1gRESKlf...|    2|ch1ercqwoNLpQLxpT...|Highlights: Recen...|
|Eypq5gLLjCapBVVnM...|    4|hSFmjAeFlsltnIowd...|Strong diner food...|
|I0pfui2bGR9R6aodF...|    4|pc1PkDBggUSNU73jO...|This was truly my...|
|vNH1hOEqSphDpCfBf...|    1|qjI4sbA1xa8B9F81L...|I decided to take...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



In [35]:
# дополнительные функции для обработки данныз

# создание своего hash
def fix_ids(s):
    return int(int(hashlib.sha1(str(s).encode('utf8')).hexdigest(), 16) % 179426083)

fix_ids_udf = udf(fix_ids)

# decimil -> float
def fix_decimal_values(s):
    return round(s,2)

fix_round_udf = udf(fix_decimal_values)

# в int
def conv_to_int(v):
    return int(v)

to_int_udf = udf(conv_to_int)

In [13]:
business = spark.createDataFrame(pd.read_json('data/business.json'))
business = business.select('business_id','name','city','categories')
business.show(5)

+--------------------+--------------------+-----------+--------------------+
|         business_id|                name|       city|          categories|
+--------------------+--------------------+-----------+--------------------+
|6iYb2HFDywm3zjuRg...| Oskar Blues Taproom|    Boulder|Gastropubs, Food,...|
|tCbdrRPZA0oiIYSmH...|Flying Elephants ...|   Portland|Salad, Soup, Sand...|
|bvN78flM8NLprQ1a1...|      The Reclaimory|   Portland|Antiques, Fashion...|
|oaepsyvc0J17qwi8c...|         Great Clips|Orange City|Beauty & Spas, Ha...|
|PE9uqAjdw0E4-8mjG...|   Crossfit Terminus|    Atlanta|Gyms, Active Life...|
+--------------------+--------------------+-----------+--------------------+
only showing top 5 rows



In [48]:
df = review.join(business,'business_id')
df.show(5)

+--------------------+--------------------+-----+--------------------+--------------------+---------+--------------------+
|         business_id|             user_id|stars|                text|                name|     city|          categories|
+--------------------+--------------------+-----+--------------------+--------------------+---------+--------------------+
|8oz6JU_1D8PaLDNvq...|AIuBTJZ0WlpHSnE-T...|    5|This is my second...|Gorin Plastic Sur...| Tualatin|Health & Medical,...|
|Agq4zoNLSIpT1_ZJb...|PR-NRdbE4Fnaq-kUL...|    4|The cost isn't ex...|        Donut Palace| Portland|        Food, Donuts|
|r-hWf-bd9im5rj_l2...|-FZBTkAZEXoP7CYvR...|    4|My buddy was visi...|     Take Five CafГ©|Vancouver|Food, Coffee & Te...|
|6sSXBu_PWwLcw12PY...|IAw4S5vemSLn48f4m...|    5|I wanted to write...|         A & V Nails| Winthrop|Beauty & Spas, Na...|
|BkbqFtF0rK2DS9c5G...|ZChm0C4YeOihxVvy8...|    1|I would not recom...|Boston Lobster Feast|Kissimmee|Restaurants, Seafood|
+---------------

In [49]:
# применим функции
df= df.withColumn('userId',fix_ids_udf(df['user_id']).cast('int'))
df= df.withColumn('businessId',fix_ids_udf(df['business_id']).cast('int'))

df = df.select('userId','businessId',col('stars').alias('rating'),col('text').alias('comment'))

df.show(5)

+---------+----------+------+--------------------+
|   userId|businessId|rating|             comment|
+---------+----------+------+--------------------+
| 93598605|  90943709|     5|This is my second...|
| 60929937|  56296887|     4|The cost isn't ex...|
| 72090464| 101058292|     4|My buddy was visi...|
|118371911| 153166766|     5|I wanted to write...|
|120510584|   4916707|     1|I would not recom...|
+---------+----------+------+--------------------+
only showing top 5 rows



## Tokenizing / Удаление стоп-слов (пример - FLŰGGÅƏNK∂€ČHIŒβØL∫ÊN)

In [60]:
countTokens = udf(lambda words: len(words), IntegerType())

In [None]:
# токенизация текста (предложения в объекты)
tokenizer = Tokenizer(inputCol="comment", outputCol="words")
tokenized = tokenizer.transform(df)

# посмотрим результат
tokenized.select("comment", "words").withColumn("tokens", countTokens(col("words"))).show(5)

In [62]:
# удалить стоп-слова
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
remover.transform(tokenized)\
       .select('comment','words','filtered')\
       .withColumn("tokens", countTokens(col("filtered"))).show(5)

+--------------------+--------------------+--------------------+------+
|             comment|               words|            filtered|tokens|
+--------------------+--------------------+--------------------+------+
|This is my second...|[this, is, my, se...|[second, time, ge...|    36|
|The cost isn't ex...|[the, cost, isn't...|[cost, expensive,...|    22|
|My buddy was visi...|[my, buddy, was, ...|[buddy, visiting,...|    48|
|I wanted to write...|[i, wanted, to, w...|[wanted, write, r...|   136|
|I would not recom...|[i, would, not, r...|[recommend, place...|    73|
+--------------------+--------------------+--------------------+------+
only showing top 5 rows



In [65]:
filtered = remover.transform(tokenized)\
                  .select('userId','businessId','rating','comment','filtered')\
                  .withColumn("tokens", countTokens(col("filtered")))

In [66]:
filtered.show(5)

+---------+----------+------+--------------------+--------------------+------+
|   userId|businessId|rating|             comment|            filtered|tokens|
+---------+----------+------+--------------------+--------------------+------+
| 93598605|  90943709|     5|This is my second...|[second, time, ge...|    36|
| 60929937|  56296887|     4|The cost isn't ex...|[cost, expensive,...|    22|
| 72090464| 101058292|     4|My buddy was visi...|[buddy, visiting,...|    48|
|118371911| 153166766|     5|I wanted to write...|[wanted, write, r...|   136|
|120510584|   4916707|     1|I would not recom...|[recommend, place...|    73|
+---------+----------+------+--------------------+--------------------+------+
only showing top 5 rows



## создаем матрицу слов и описания

In [67]:
# метод для хеширования
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

#hashingTF - создание фичей из слов
tf = hashingTF.transform(filtered)

# можно применить TF-IDF, а не HashingTF
idf = idf.fit(tf)
tfidf = idf.transform(tf)

In [68]:
data = tfidf.select('userId','businessId','comment','filtered','features',col('rating').alias('label'))
data.show(5)

+---------+----------+--------------------+--------------------+--------------------+-----+
|   userId|businessId|             comment|            filtered|            features|label|
+---------+----------+--------------------+--------------------+--------------------+-----+
| 93598605|  90943709|This is my second...|[second, time, ge...|(262144,[1354,225...|    5|
| 60929937|  56296887|The cost isn't ex...|[cost, expensive,...|(262144,[2306,193...|    4|
| 72090464| 101058292|My buddy was visi...|[buddy, visiting,...|(262144,[614,3679...|    4|
|118371911| 153166766|I wanted to write...|[wanted, write, r...|(262144,[991,2437...|    5|
|120510584|   4916707|I would not recom...|[recommend, place...|(262144,[1865,240...|    1|
+---------+----------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [69]:
# проставляем метки для обучения
def c_zero_one(n):
    if n==1:
        return 0
    elif n == 5:
        return 1


conv_zero_one = udf(c_zero_one)

good_bad = data.filter( (col('label') == 1) | (col('label') == 5) ).withColumn('label',conv_zero_one(col('label')).cast('int'))
good_bad.show(5)

+---------+----------+--------------------+--------------------+--------------------+-----+
|   userId|businessId|             comment|            filtered|            features|label|
+---------+----------+--------------------+--------------------+--------------------+-----+
| 93598605|  90943709|This is my second...|[second, time, ge...|(262144,[1354,225...|    1|
|118371911| 153166766|I wanted to write...|[wanted, write, r...|(262144,[991,2437...|    1|
|120510584|   4916707|I would not recom...|[recommend, place...|(262144,[1865,240...|    0|
|  3501315|   6492804|Best breakfast in...|[best, breakfast,...|(262144,[329,5383...|    1|
|113030580|  85111127|Worst food ever ....|[worst, food, eve...|(262144,[6330,133...|    0|
+---------+----------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [70]:
(training,test) = good_bad.randomSplit([0.80,.20])

In [71]:
# применяем модель NvB
# https://ru.wikipedia.org/wiki/%D0%9D%D0%B0%D0%B8%D0%B2%D0%BD%D1%8B%D0%B9_%D0%B1%D0%B0%D0%B9%D0%B5%D1%81%D0%BE%D0%B2%D1%81%D0%BA%D0%B8%D0%B9_%D0%BA%D0%BB%D0%B0%D1%81%D1%81%D0%B8%D1%84%D0%B8%D0%BA%D0%B0%D1%82%D0%BE%D1%80
nb = NaiveBayes(smoothing=1)
model = nb.fit(training)

# Pred

## Сделаем предикт по рейтингам

In [72]:
# predict
predictions = model.transform(test)

In [73]:
# обзор класса - 0
predictions.filter(predictions['prediction'] == 0).select('comment','filtered','label','prediction').show(10)

+-------+--------+-----+----------+
|comment|filtered|label|prediction|
+-------+--------+-----+----------+
+-------+--------+-----+----------+



In [74]:
# обзор класса - 1
predictions.filter(predictions['prediction'] == 1).select('comment','filtered','label','prediction').show(10)

+--------------------+--------------------+-----+----------+
|             comment|            filtered|label|prediction|
+--------------------+--------------------+-----+----------+
|I would not recom...|[recommend, place...|    0|       1.0|
|I was in the area...|[area, decided, c...|    0|       1.0|
|Zero Stars.

Just...|[zero, stars., , ...|    0|       1.0|
|The worst Chinese...|[worst, chinese, ...|    0|       1.0|
|This is a five-st...|[five-star, resta...|    1|       1.0|
|I have never writ...|[never, written, ...|    0|       1.0|
|Wish I could give...|[wish, give, zero...|    0|       1.0|
|John was quick to...|[john, quick, ret...|    1|       1.0|
|Since discovering...|[since, discoveri...|    1|       1.0|
|I haven't made it...|[made, yoga, clas...|    1|       1.0|
+--------------------+--------------------+-----+----------+
only showing top 10 rows



In [75]:
# класс 1, только комментарий 
predictions.filter(predictions['prediction'] == 1).select('comment').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [76]:
# оценка качества
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.18796992481203006

### precision, recall, f1score

In [77]:
metrics_rdd = predictions.select(col('label').cast('float'),col('prediction').cast('float')).rdd

In [78]:
metrics = MulticlassMetrics(metrics_rdd)

#### precision, recall, f1score для класса 1

In [84]:
metrics.precision(1.0)

1.0

In [85]:
metrics.recall(1.0)

0.35714285714285715

In [86]:
metrics.fMeasure(1.0)

0.5263157894736842

### Pred на всех данных с установкой порогового значения (threshold)

In [87]:
(sample,spill) = data.randomSplit([0.20,.80])

In [88]:
sample.show(5)

+---------+----------+--------------------+--------------------+--------------------+-----+
|   userId|businessId|             comment|            filtered|            features|label|
+---------+----------+--------------------+--------------------+--------------------+-----+
|120510584|   4916707|I would not recom...|[recommend, place...|(262144,[1865,240...|    1|
|124636930| 147898287|The first time I ...|[first, time, san...|(262144,[3928,504...|    1|
| 81155808|  84538770|I enjoyed the pho...|[enjoyed, pho,, s...|(262144,[5537,912...|    3|
| 90261353| 111957329|First, working wi...|[first,, working,...|(262144,[2437,576...|    5|
| 69138410|  44510514|Delicious ice cre...|[delicious, ice, ...|(262144,[8618,153...|    5|
+---------+----------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [89]:
avg_rating = sample.groupBy('userId').agg(avg(col('label')).alias('avg-user-rating'))
avg_rating.show(5)

+---------+---------------+
|   userId|avg-user-rating|
+---------+---------------+
| 16372514|            1.0|
| 90261353|            5.0|
|120510584|            1.0|
|148775291|            5.0|
|  1870041|            5.0|
+---------+---------------+
only showing top 5 rows



In [90]:
sample_test = sample.join(avg_rating,'userId')
sample_test.show(5)

+---------+----------+--------------------+--------------------+--------------------+-----+---------------+
|   userId|businessId|             comment|            filtered|            features|label|avg-user-rating|
+---------+----------+--------------------+--------------------+--------------------+-----+---------------+
| 16372514|  98125878|We asked them abo...|[asked, adding, g...|(262144,[2437,361...|    1|            1.0|
| 90261353| 111957329|First, working wi...|[first,, working,...|(262144,[2437,576...|    5|            5.0|
|120510584|   4916707|I would not recom...|[recommend, place...|(262144,[1865,240...|    1|            1.0|
|148775291|  20722863|It's a pleasure d...|[pleasure, busine...|(262144,[43756,92...|    5|            5.0|
|  1870041|  42372853|I loved this plac...|[loved, place!, f...|(262144,[3228,136...|    5|            5.0|
+---------+----------+--------------------+--------------------+--------------------+-----+---------------+
only showing top 5 rows



In [33]:
sample_test.show(50)

+-------+----------+--------------------+--------------------+--------------------+-----+------------------+
| userId|businessId|             comment|            filtered|            features|label|   avg-user-rating|
+-------+----------+--------------------+--------------------+--------------------+-----+------------------+
|  26583| 123465947|Airy and colorful...|[airy, colorful, ...|(262144,[10879,21...|    4|               4.0|
| 109068| 130676348|Want a loud, obno...|[want, loud,, obn...|(262144,[28172,50...|    2|               2.0|
| 687716|  45848465|Great food at gre...|[great, food, gre...|(262144,[6979,223...|    5|               5.0|
| 694746| 102984270|Do the price sear...|[price, search, b...|(262144,[40861,43...|    1|               1.0|
| 922409|  12790888|Sherri Did a real...|[sherri, really, ...|(262144,[14,54961...|    5|               4.0|
| 922409|  66199679|Very good filet m...|[good, filet, mig...|(262144,[1536,420...|    3|               4.0|
|1001043|  39216299

In [91]:
sample_labeled = sample_test.withColumn('label', when(col('label') < col('avg-user-rating') , 0 ).otherwise(1)  )

In [92]:
sample_labeled.show(5)

+---------+----------+--------------------+--------------------+--------------------+-----+---------------+
|   userId|businessId|             comment|            filtered|            features|label|avg-user-rating|
+---------+----------+--------------------+--------------------+--------------------+-----+---------------+
| 16372514|  98125878|We asked them abo...|[asked, adding, g...|(262144,[2437,361...|    1|            1.0|
| 90261353| 111957329|First, working wi...|[first,, working,...|(262144,[2437,576...|    1|            5.0|
|120510584|   4916707|I would not recom...|[recommend, place...|(262144,[1865,240...|    1|            1.0|
|148775291|  20722863|It's a pleasure d...|[pleasure, busine...|(262144,[43756,92...|    1|            5.0|
|  1870041|  42372853|I loved this plac...|[loved, place!, f...|(262144,[3228,136...|    1|            5.0|
+---------+----------+--------------------+--------------------+--------------------+-----+---------------+
only showing top 5 rows



In [93]:
predictions_sample = model.transform(sample_labeled)

In [94]:
predictions_sample.show(5)

+---------+----------+--------------------+--------------------+--------------------+-----+---------------+--------------------+--------------------+----------+
|   userId|businessId|             comment|            filtered|            features|label|avg-user-rating|       rawPrediction|         probability|prediction|
+---------+----------+--------------------+--------------------+--------------------+-----+---------------+--------------------+--------------------+----------+
| 16372514|  98125878|We asked them abo...|[asked, adding, g...|(262144,[2437,361...|    1|            1.0|[-1553.2947899965...|[1.82960707547790...|       1.0|
| 90261353| 111957329|First, working wi...|[first,, working,...|(262144,[2437,576...|    1|            5.0|[-3429.3826304091...|[3.28520852870944...|       1.0|
|120510584|   4916707|I would not recom...|[recommend, place...|(262144,[1865,240...|    1|            1.0|[-2514.6877449072...|[1.34650103563151...|       1.0|
|148775291|  20722863|It's a pleas

In [95]:
predictions_sample.filter(predictions_sample['prediction'] == 0).select('comment','filtered','label','prediction').show(10)

+--------------------+--------------------+-----+----------+
|             comment|            filtered|label|prediction|
+--------------------+--------------------+-----+----------+
|The first time I ...|[first, time, san...|    1|       0.0|
+--------------------+--------------------+-----+----------+



In [96]:
evaluator.evaluate(predictions_sample)

0.975609756097561

### precision, recall, f1score

In [97]:
metrics_rdd1 = predictions_sample.select(col('label').cast('float'),col('prediction').cast('float')).rdd

In [98]:
metrics1 = MulticlassMetrics(metrics_rdd1)

#### precision, recall, f1score для класса 1

In [99]:
metrics1.precision(1.0)

0.9523809523809523

In [100]:
metrics1.recall(1.0)

1.0

In [101]:
metrics1.fMeasure(1.0)

0.975609756097561

### Pred рейтингов с порогом 2.5 порог

In [102]:
sample_labeled2 = sample_test.withColumn('label', when(col('label') < 2.5 , 0 ).otherwise(1)  )

In [103]:
predictions_sample2 = model.transform(sample_labeled2)

In [104]:
evaluator.evaluate(predictions_sample2)

0.7566137566137567

In [105]:
predictions_sample2.filter(predictions_sample2['prediction'] == 1).select('comment','filtered','label','prediction').show(50)

+--------------------+--------------------+-----+----------+
|             comment|            filtered|label|prediction|
+--------------------+--------------------+-----+----------+
|We asked them abo...|[asked, adding, g...|    0|       1.0|
|First, working wi...|[first,, working,...|    1|       1.0|
|I would not recom...|[recommend, place...|    0|       1.0|
|It's a pleasure d...|[pleasure, busine...|    1|       1.0|
|I loved this plac...|[loved, place!, f...|    1|       1.0|
|Never met a Cactu...|[never, met, cact...|    1|       1.0|
|Hate to say this ...|[hate, say, leave...|    0|       1.0|
|The food was good...|[food, good., sea...|    0|       1.0|
|Since discovering...|[since, discoveri...|    1|       1.0|
|Literally not a t...|[literally, thing...|    1|       1.0|
|I enjoyed the pho...|[enjoyed, pho,, s...|    1|       1.0|
|Love Jimmy John's...|[love, jimmy, joh...|    1|       1.0|
|This is my favori...|[favorite, mexica...|    1|       1.0|
|Really really goo...|[r

In [106]:
predictions.filter(predictions['prediction'] == 0).select('comment','filtered','label','prediction').show(50)

+-------+--------+-----+----------+
|comment|filtered|label|prediction|
+-------+--------+-----+----------+
+-------+--------+-----+----------+



### precision, recall, f1score

In [107]:
metrics_rdd2 = predictions_sample2.select(col('label').cast('float'),col('prediction').cast('float')).rdd

In [108]:
metrics2 = MulticlassMetrics(metrics_rdd2)

#### precision, recall, f1score для класса 1

In [109]:
metrics2.precision(1.0)

1.0

In [110]:
metrics2.recall(1.0)

0.8

In [111]:
metrics2.fMeasure(1.0)

0.888888888888889