In [None]:
# Mount Google Drive so the notebook can reach files under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Install the headless OpenJDK 8 JDK with apt; -qq and the redirect to
# /dev/null keep the installer output out of the notebook.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
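#Download the Apache Spark 3.2.0 archive (built for Hadoop 3.2) into the Drive Resources folder.
#Left commented out: the next cell unpacks the archive from that same folder, so presumably this only has to be run once.
#!wget -q https://dlcdn.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz -P drive/MyDrive/Colab_Notebooks/Amazon_reviews/Resources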


In [None]:
# Unpack the Spark archive stored on Drive into the current working directory
# (SPARK_HOME is later set to /content/spark-3.2.0-bin-hadoop3.2, so this
# assumes the notebook runs from /content).
!tar xf drive/MyDrive/Colab_Notebooks/Amazon_reviews/Resources/spark-3.2.0-bin-hadoop3.2.tgz

In [None]:
!pip install -q findspark

In [None]:
# Set the PySpark environment: point JAVA_HOME at the OpenJDK 8 install
# and SPARK_HOME at the unpacked Spark distribution.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.2.0-bin-hadoop3.2",
})

In [None]:
# Initialise PySpark: run findspark.init() before pyspark is imported in the
# next cell (presumably it locates the install through the SPARK_HOME set above).
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session named "Colab" with its UI on port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# Leave the session as the last expression so the notebook displays it.
spark

In [None]:
#import
from pyspark.sql.functions import input_file_name
from pyspark.sql.functions import lit
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, DateType, FloatType

In [None]:
# Smoke test: load one category file without a schema and print the
# schema Spark infers for it.
path = 'drive/MyDrive/Colab_Notebooks/Amazon_reviews/data/AMAZON_FASHION_5.json.gz'
df_test = spark.read.format('json').load(path)
df_test.printSchema()

root
 |-- asin: string (nullable = true)
 |-- image: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- style: struct (nullable = true)
 |    |-- Color:: string (nullable = true)
 |    |-- Size Name:: string (nullable = true)
 |    |-- Size:: string (nullable = true)
 |    |-- Style:: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- vote: string (nullable = true)



In [None]:
# Preview the first five rows of the sample file.
df_test.show(n=5)

+----------+-----+-------+--------------------+----------+-------------+------------+--------------------+----------+--------------+--------+----+
|      asin|image|overall|          reviewText|reviewTime|   reviewerID|reviewerName|               style|   summary|unixReviewTime|verified|vote|
+----------+-----+-------+--------------------+----------+-------------+------------+--------------------+----------+--------------+--------+----+
|B000K2PJ4K| null|    5.0|Great product and...|09 4, 2015|ALJ66O1Y6SLHA|    Tonya B.|{ Blue/Orange, nu...|Five Stars|    1441324800|    true|null|
|B000K2PJ4K| null|    5.0|Great product and...|09 4, 2015|ALJ66O1Y6SLHA|    Tonya B.|{ Black (37467610...|Five Stars|    1441324800|    true|null|
|B000K2PJ4K| null|    5.0|Great product and...|09 4, 2015|ALJ66O1Y6SLHA|    Tonya B.|{ Blue/Gray Logo,...|Five Stars|    1441324800|    true|null|
|B000K2PJ4K| null|    5.0|Great product and...|09 4, 2015|ALJ66O1Y6SLHA|    Tonya B.|{ Blue (37867638-...|Five Stars| 

In [None]:
# Summary statistics (count, mean, stddev, ...) for the sample file's columns.
df_test.describe().show()

+-------+----------+-----------------+--------------------+----------+--------------+--------------------+--------------------+--------------------+-----------------+
|summary|      asin|          overall|          reviewText|reviewTime|    reviewerID|        reviewerName|             summary|      unixReviewTime|             vote|
+-------+----------+-----------------+--------------------+----------+--------------+--------------------+--------------------+--------------------+-----------------+
|  count|      3176|             3176|                3160|      3176|          3176|                3176|                3176|                3176|              297|
|   mean|      null|4.404282115869018|                null|      null|          null|                null|                null|  1.49221080906801E9|5.858585858585859|
| stddev|      null|1.034963845574042|                null|      null|          null|                null|                null|2.3967880580666605E7|6.317623517776833

In [None]:
# Names of every entry in the raw-data folder; the load cell reads each one.
file_name = os.listdir('drive/MyDrive/Colab_Notebooks/Amazon_reviews/data/')

# Explicit schema used when reading the review files.
# Each spec is (column name, Spark type, nullable); only 'categories' is non-nullable.
schema = StructType([
    StructField(column, dtype, nullable)
    for column, dtype, nullable in (
        ('overall', FloatType(), True),
        ('reviewText', StringType(), True),
        ('reviewerName', StringType(), True),
        ('summary', StringType(), True),
        ('verified', BooleanType(), True),
        ('vote', StringType(), True),
        ('categories', StringType(), False),
    )
])

In [None]:
#load all data
# Start from an empty frame that already has the target schema, then append
# one frame per file. Every file is read with the shared schema and its rows
# are tagged with the category taken from the file name.
df = spark.createDataFrame([], schema)
for name in file_name:
  path = 'drive/MyDrive/Colab_Notebooks/Amazon_reviews/data/' + name
  df_temp = spark.read.json(path, schema)
  # name[:-10] drops the last 10 characters of the file name
  # (e.g. 'AMAZON_FASHION_5.json.gz' -> 'AMAZON_FASHION').
  df_temp = df_temp.withColumn('categories', lit(name[:-10]))
  df = df.union(df_temp)

# vote is read as text and may contain thousands separators (e.g. "1,234");
# casting such a value directly yields null, so strip the commas before
# converting to float. Null votes stay null.
df = df.withColumn("vote", regexp_replace(df["vote"], ',', '').cast('float'))

In [None]:
# Preview the first five rows of the combined review data.
df.show(n=5)

+-------+--------------------+------------+--------------------+--------+----+----------+
|overall|          reviewText|reviewerName|             summary|verified|vote|categories|
+-------+--------------------+------------+--------------------+--------+----+----------+
|    4.0|I've been using D...|   WB Halper|A solid overview ...|   false|null|  Software|
|    4.0|The demo is done ...|      Grimmy|        A good value|   false|null|  Software|
|    5.0|If you've been wa...|   D. Fowler|This is excellent...|   false| 3.0|  Software|
|    5.0|I've been creatin...|Bryan Newman|A Fantastic Overv...|   false|null|  Software|
|    5.0|I decided (after ...|  Al Swanson|Excellent Tutorials!|   false|null|  Software|
+-------+--------------------+------------+--------------------+--------+----+----------+
only showing top 5 rows



In [None]:
# Check the column types of the combined frame (vote has been cast to float above).
df.printSchema()

root
 |-- overall: float (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- vote: float (nullable = true)
 |-- categories: string (nullable = false)



In [None]:
# Keep a single copy of rows that are identical across every column.
df = df.distinct()

In [None]:
# Summary statistics for the de-duplicated data.
df.describe().show()

+-------+------------------+--------------------+-------------+----------+------------------+--------------+
|summary|           overall|          reviewText| reviewerName|   summary|              vote|    categories|
+-------+------------------+--------------------+-------------+----------+------------------+--------------+
|  count|          67722108|            67709455|     67717462|  67703543|          11328418|      67722108|
|   mean| 4.328317910600184|            Infinity|          NaN|       NaN| 8.487419867451925|          null|
| stddev|1.0879490068567435|                 NaN|          NaN|       NaN|23.471578550596945|          null|
|    min|               0.0|            \n\n\n\n|       Rae |          |               2.0|AMAZON_FASHION|
|    max|               5.0|When I pick up...|~~~~~~~~~~~~~|Quiet Fan|             999.0|   Video_Games|
+-------+------------------+--------------------+-------------+----------+------------------+--------------+



In [None]:
#df.select("categories").distinct().show()

In [None]:
# This is an NLP project, so rows missing reviewText or summary have no
# usable text; drop them.
df = df.na.drop(subset=['reviewText', 'summary'])

#Save Data

In [None]:
# Write the cleaned reviews to Drive as JSON, replacing any previous export.
df.write.json("drive/MyDrive/Colab_Notebooks/Amazon_reviews/preprocess_data", mode='overwrite')

In [None]:
# Show the schema of the frame that was just written out.
df.printSchema()

root
 |-- overall: float (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- vote: float (nullable = true)
 |-- categories: string (nullable = false)



In [None]:
# Preview five rows of the cleaned data.
df.show(n=5)

+-------+--------------------+-----------------+--------------------+--------+----+----------+
|overall|          reviewText|     reviewerName|             summary|verified|vote|categories|
+-------+--------------------+-----------------+--------------------+--------+----+----------+
|    5.0|As someone who ha...|Loves Books in MD|Learn Adobe Photo...|   false|null|  Software|
|    4.0|I've been running...|        Mindcrime|Great product, bu...|   false|14.0|  Software|
|    1.0|December 13, 2008...|      James Smith|Amazon, PBJWORLD ...|   false| 5.0|  Software|
|    2.0|I have been a Qui...|  Lance_big_daddy|Intuit has lost i...|   false|31.0|  Software|
|    1.0|This is by far th...|           Deimos|        Garbage.....|   false|null|  Software|
+-------+--------------------+-----------------+--------------------+--------+----+----------+
only showing top 5 rows

