In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/LDS9_K265_TranHoangBach_Cuoi_ky'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/LDS9_K265_TranHoangBach_Cuoi_ky


In [None]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pandas as pd

%matplotlib inline

In [None]:
spark = SparkSession \
            .builder \
            .master("local[*]")\
            .appName("New-Spark") \
            .config("spark.memory.fraction", 0.8) \
            .config("spark.executor.memory", "10g") \
            .config("spark.driver.memory", "10g")\
            .config("spark.sql.shuffle.partitions" , "800") \
            .config("spark.memory.offHeap.enabled",'true')\
            .config("spark.memory.offHeap.size","10g")\
            .getOrCreate()
spark

- Thông thường có 2 cách để run file excel cho pyspark là: **pandas** và **com.crealytics.spark.excel**
- Cả 2 cách đều sử dụng paralell computation nên tốc độ không khác nhau đáng kể.

In [None]:
file_name = "data/womens-ecommerce-clothing-reviews/Womens_Clothing_E_Commerce_Reviews.xlsx"
data = pd.read_excel(file_name, sheet_name='Reviews', index_col=0, engine='openpyxl')
data.head(5)

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23481 entries, 0 to 23480
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Clothing ID              23481 non-null  int64 
 1   Age                      23481 non-null  int64 
 2   Title                    19671 non-null  object
 3   Review Text              22636 non-null  object
 4   Rating                   23481 non-null  int64 
 5   Recommended IND          23481 non-null  int64 
 6   Positive Feedback Count  23481 non-null  int64 
 7   Division Name            23467 non-null  object
 8   Department Name          23467 non-null  object
 9   Class Name               23467 non-null  object
dtypes: int64(5), object(5)
memory usage: 2.0+ MB


- Các biến NLP (text) có null, tuy nhiên có thể giữ lại vì đặc tính thật sự là người dùng có thể rating mà có thể không cần viết reiview, title
- Các biến Division Name, Department Name, Class Name có các null value có thể bỏ đi vì số lượng là khá ít so với tổng 23.481 mẫu.

In [None]:
data = data.dropna(subset=['Division Name', 'Department Name', 'Class Name'])

In [None]:
# Thông qua data.info và dtype ta tạo schema cho pyspark dataframe
schema = StructType([StructField("Clothing ID", IntegerType(), False), 
                     StructField("Age", IntegerType(), False),
                     StructField("Title", StringType(), True),
                     StructField("Review Text", StringType(), True),
                     StructField("Rating", IntegerType(), False),
                     StructField("Recommended IND", IntegerType(), False),
                     StructField("Positive Feedback Count", IntegerType(), False),
                     StructField("Division Name", StringType(), False),
                     StructField("Department Name", StringType(), False),
                     StructField("Class Name", StringType(), False),
])
df = spark.createDataFrame(data, schema=schema)
df.show(5)

+-----------+---+--------------------+--------------------+------+---------------+-----------------------+--------------+---------------+----------+
|Clothing ID|Age|               Title|         Review Text|Rating|Recommended IND|Positive Feedback Count| Division Name|Department Name|Class Name|
+-----------+---+--------------------+--------------------+------+---------------+-----------------------+--------------+---------------+----------+
|        767| 33|                 NaN|Absolutely wonder...|     4|              1|                      0|     Initmates|       Intimate| Intimates|
|       1080| 34|                 NaN|Love this dress! ...|     5|              1|                      4|       General|        Dresses|   Dresses|
|       1077| 60|Some major design...|I had such high h...|     3|              0|                      0|       General|        Dresses|   Dresses|
|       1049| 50|    My favorite buy!|I love, love, lov...|     5|              1|                      0|

Kiểm tra số lượng mẫu, các biến categoric

In [None]:
df.printSchema()

root
 |-- Clothing ID: integer (nullable = false)
 |-- Age: integer (nullable = false)
 |-- Title: string (nullable = true)
 |-- Review Text: string (nullable = true)
 |-- Rating: integer (nullable = false)
 |-- Recommended IND: integer (nullable = false)
 |-- Positive Feedback Count: integer (nullable = false)
 |-- Division Name: string (nullable = false)
 |-- Department Name: string (nullable = false)
 |-- Class Name: string (nullable = false)



In [None]:
df.count()

23467

In [None]:
df.select('Clothing ID').distinct().count()

1199

In [None]:
df.select('Recommended IND').distinct().show()

+---------------+
|Recommended IND|
+---------------+
|              0|
|              1|
+---------------+



In [None]:
df.select('Division Name').distinct().show()

+--------------+
| Division Name|
+--------------+
|       General|
|     Initmates|
|General Petite|
+--------------+



In [None]:
df.select('Department Name').distinct().show()

+---------------+
|Department Name|
+---------------+
|        Jackets|
|          Trend|
|       Intimate|
|        Dresses|
|           Tops|
|        Bottoms|
+---------------+



In [None]:
df.select('Class Name').distinct().count()

20

Kiểm tra biến label (rating), xem có imbalance hay không?

In [None]:
df.groupby('Rating').count().show()

+------+-----+
|Rating|count|
+------+-----+
|     2| 1564|
|     3| 2870|
|     5|13116|
|     1|  841|
|     4| 5076|
+------+-----+



Nhận xét:
- Bài toán đặt ra thuộc nhóm Classification - Rating Prediction.
- Có imbalanced trong biến rating, nên cần tạo thêm "weight" để cân bằng cho bài toán.
- Các biến numeric: Age, Positive Feedback Count.
- Các biến categoric: Clothing ID, Recommended IND, Division Name, Department Name, Class Name.
- Các biến NLP: Title, Review Text
- Các biến input khá phong phú, đồng thời số lượng category trong các biến categoric là khá nhiều, thêm vào đó có 2 biến NLP mà nếu convert thành vector thì số lượng features sẽ khá lớn nếu Convert tất cả thành one-hot encoder. 
- Khi sử dụng tokenizer để xử lý text, thì function đã tự convert lowercase, chỉ cần pre-processing bỏ bớt các ký tự đặc biệt là đủ.

In [None]:
total_count = df.count()
weight_df = df.groupby('Rating').count()
weight_df = weight_df.withColumn('current_weight', col('count')/total_count)
weight_df = weight_df.withColumn('weight', 0.2/col('current_weight'))
weight_df.show()

+------+-----+-------------------+-------------------+
|Rating|count|     current_weight|             weight|
+------+-----+-------------------+-------------------+
|     2| 1564|0.06664678058550305| 3.0008951406649618|
|     3| 2870|0.12229939915626198|  1.635331010452962|
|     5|13116| 0.5589125154472238|0.35783775541323576|
|     1|  841| 0.0358375591255806|  5.580737217598099|
|     4| 5076| 0.2163037456854306| 0.9246256895193067|
+------+-----+-------------------+-------------------+



In [None]:
df = df.join(weight_df, on='Rating', how='left_outer')

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, \
                                Tokenizer, StopWordsRemover, CountVectorizer, IDF, \
                                StandardScaler
from pyspark.ml.pipeline import Pipeline

df = df.withColumnRenamed('Rating', 'label')
df = df.withColumn('len_title', length(col('Title')))
df = df.withColumn('len_review', length(col('Review Text')))
df = df.withColumn('text', concat(col('Title'), lit(' '), col('Review Text')))

indexer_cloth_id = StringIndexer(inputCol='Clothing ID', outputCol='cloth_idx')
indexer_division_id = StringIndexer(inputCol='Division Name', outputCol='division_idx')
indexer_department_id = StringIndexer(inputCol='Department Name', outputCol='department_idx')
indexer_class_id = StringIndexer(inputCol='Class Name', outputCol='class_idx')

onehot = OneHotEncoder(inputCols=['cloth_idx', 'division_idx', 'department_idx', 'class_idx'],\
                       outputCols=['cloth_dummy', 'division_dummy', 'department_dummy', 'class_dummy'])

tokenizer_title = Tokenizer(inputCol='Title', outputCol='title_words')
tokenizer_review = Tokenizer(inputCol='Review Text', outputCol='review_words')
tokenizer_text = Tokenizer(inputCol='text', outputCol='text_words')

remover_title = StopWordsRemover(inputCol='title_words', outputCol='title_filtered')
remover_review = StopWordsRemover(inputCol='review_words', outputCol='review_filtered')
remover_text = StopWordsRemover(inputCol='text_words', outputCol='text_filtered')

count_vec_title = CountVectorizer(inputCol='title_filtered', outputCol='title_count', vocabSize=1000)
count_vec_review = CountVectorizer(inputCol='review_filtered', outputCol='review_count', vocabSize=1000)
count_vec_text = CountVectorizer(inputCol='text_filtered', outputCol='text_count', vocabSize=1000)

idf_title = IDF(inputCol='title_count', outputCol='title_idf')
idf_review = IDF(inputCol='review_count', outputCol='review_idf')
idf_text = IDF(inputCol='text_count', outputCol='text_idf')

vector_assembler = VectorAssembler(inputCols=['Age', 'Positive Feedback Count',\
                                              'cloth_dummy', 'Recommended IND', 'division_dummy',\
                                              'department_dummy', 'class_dummy', \
                                              'len_title', 'len_review', \
                                              'title_idf', 'review_idf', \
                                              'text_idf'], outputCol='non_scale_features')

scaler = StandardScaler(inputCol="non_scale_features", outputCol="features")

pre_process_pipeline = Pipeline(stages=[indexer_cloth_id, \
                                        indexer_division_id, indexer_department_id, \
                                        indexer_class_id, onehot, \
                                        tokenizer_title, tokenizer_review, \
                                        remover_title, remover_review, \
                                        count_vec_title, count_vec_review, \
                                        idf_title, idf_review, \
                                        tokenizer_text, remover_text, count_vec_text, idf_text, \
                                        vector_assembler, scaler])

pre_process_pipeline_fit = pre_process_pipeline.fit(df)

final_df = pre_process_pipeline_fit.transform(df)
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)
final_df.select('label', 'features').show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    2|(4229,[0,1,19,120...|
|    2|(4229,[0,1,16,120...|
|    2|(4229,[0,74,1201,...|
|    2|(4229,[0,74,1201,...|
|    2|(4229,[0,1,474,12...|
+-----+--------------------+
only showing top 5 rows



In [None]:
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, DecisionTreeClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

list_model = [\
              ('Logistic Regression', LogisticRegression(weightCol='weight')),
            #   ('Decision Tree', DecisionTreeClassifier(seed=1, weightCol='weight')), 
              ('Random Forest', RandomForestClassifier(seed=1, weightCol='weight')), 
]

for model_name, model in list_model:
    trained_model = model.fit(train_df)
    predictions = trained_model.transform(test_df)
    predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))

    prediction_and_label = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(prediction_and_label.rdd)
    evaluator = MulticlassClassificationEvaluator()
    print('-'*30)
    print("\033[1m" + model_name + "\033[0m")
    print('')
    print('  Accuracy : {:.4f}'.format(metrics.accuracy))
    print('  AUC      : {:.4f}'.format(evaluator.evaluate(predictions)))
    print('')

------------------------------
[1mLogistic Regression[0m

  Accuracy : 0.5706
  AUC      : 0.5861

------------------------------
[1mRandom Forest[0m

  Accuracy : 0.5209
  AUC      : 0.5298



In [None]:
def tunning_model(model, model_name, param, metric):
    import time
    from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    tic = time.time()
    tvs = TrainValidationSplit(estimator=model, 
                            estimatorParamMaps=param,
                            evaluator = MulticlassClassificationEvaluator(metricName=metric), 
                            trainRatio=0.8,
    )

    best_model = tvs.fit(train_df)
    predictions = best_model.transform(test_df)
    toc = time.time()
    predictions = predictions.withColumn('label', predictions.label.cast(DoubleType()))
    prediction_and_label = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(prediction_and_label.rdd)

    print("\033[1m" + model_name + "\033[0m")
    print('')
    print('  Accuracy \t\t: {:.4f}'.format(metrics.accuracy))
    print('  Total time tunning model: {:.2f} seconds'.format(toc-tic))
    
    return best_model.getEstimator()

In [None]:
model_logis = LogisticRegression(weightCol='weight')

param_logis = ParamGridBuilder()\
            .addGrid(model_logis.maxIter, [100])\
            .addGrid(model_logis.regParam, [0, 0.1, 0.01])\
            .addGrid(model_logis.elasticNetParam, [0, 0.1, 0.01])\
            .build()

best_model_logis = tunning_model(model_logis, 'Logistic Regression', param_logis, 'accuracy')

[1mLogistic Regression[0m

  Accuracy 		: 0.6163
  Total time tunning model: 18893.50 seconds


In [None]:
model_rdn_forest = RandomForestClassifier(seed=1, weightCol='weight')

param_rdn_forest = ParamGridBuilder()\
            .addGrid(model_rdn_forest.numTrees, [100])\
            .addGrid(model_rdn_forest.maxDepth, [2, 3, 6, 7])\
            .addGrid(model_rdn_forest.featureSubsetStrategy, ['sqrt', 'log2', 'auto'])\
            .build()

best_model_rdn_forest = tunning_model(model_rdn_forest, 'Random Forest', param_rdn_forest, 'accuracy')

Test model trên sheet new_reviews, xem thử model có predict hợp lý hay không.

In [None]:
data_test = pd.read_excel(file_name, sheet_name='new_reviews', index_col=0)

schema_test = StructType([StructField("Clothing ID", IntegerType(), False), 
                     StructField("Age", IntegerType(), False),
                     StructField("Title", StringType(), False),
                     StructField("Review Text", StringType(), False),
                     StructField("Recommended IND", IntegerType(), False),
                     StructField("Positive Feedback Count", IntegerType(), False),
                     StructField("Division Name", StringType(), False),
                     StructField("Department Name", StringType(), False),
                     StructField("Class Name", StringType(), False),
])
df_to_test = spark.createDataFrame(data_test, schema=schema_test)
df_to_test.show(5)

+-----------+---+--------------------+--------------------+---------------+-----------------------+-------------+---------------+----------+
|Clothing ID|Age|               Title|         Review Text|Recommended IND|Positive Feedback Count|Division Name|Department Name|Class Name|
+-----------+---+--------------------+--------------------+---------------+-----------------------+-------------+---------------+----------+
|       1077| 53|Dress looks like ...|Dress runs small ...|              0|                     14|      General|        Dresses|   Dresses|
|        862| 66|            Cute top|Nice top. armhole...|              1|                      2|      General|           Tops|     Knits|
|       1080| 31|        Underwhelmed|Was really excite...|              0|                      1|      General|        Dresses|   Dresses|
|        936| 35|  Absolutely perfect|If you are going ...|              0|                      9|      General|           Tops|  Sweaters|
|        872|

In [None]:
df_to_test = df_to_test.withColumn('len_title', length(col('Title')))
df_to_test = df_to_test.withColumn('len_review', length(col('Review Text')))
df_to_test = df_to_test.withColumn('text', concat(col('Title'), lit(' '), col('Review Text')))

final_df_to_test = pre_process_pipeline_fit.transform(df_to_test)

In [None]:
best_model_logis_fit = best_model_logis.fit(final_df)
predictions_on_test = best_model_logis_fit.transform(final_df_to_test)
predictions_on_test.select('Clothing ID', 'Title', 'Review Text', 'Recommended IND', 'Positive Feedback Count', 'prediction').show(truncate=False)

+-----------+--------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+-----------------------+----------+
|Clothing ID|Title                                       |Review Text                                                                                                                                                                                                                                                                                                                                                                                    

Nhận xét:
- Có thể thấy được model dự đoán rating khá chính xác (theo quan điểm người viết)
    - SP 1: hầu như review là chê sản phẩm rất nhiều, không recommend -> rating 1 là hợp lý.
    - SP 2: review khen khá nhiều và cuối cùng recommend sản phẩm -> rating 4 là hợp lý.
    - SP 3: review chê khá nhiều, chất liệu rẻ tiền cuối cùng 'very disappointed and will be returning', không recommend -> rating 2 là hợp lý.
    - SP 4: title là "Absolutely perfect" nhưng bình luận lại khá trung tính, cuối cùng không thích sản phẩm "this one did not do it for me", "look cheap rather than trendy", không recommend sản phẩm -> rating 3 cũng hợp lý.
    - SP 5: review khen sản phẩm rất tốt, "i can't wait to pair it with my new white jeans for summer", cuối cùng recommend SP -> rating 5 là hợp lý.
- Model test khá tốt với data mới, mặc dù accuracy không cao lắm.
