### Step one: imports and reading in the data
    

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Set JAVA_HOME for this process to a specific local JDK 1.8 install.
# NOTE(review): presumably this pins the JDK that Spark starts with; the
# absolute path is machine-specific, so adjust it when running elsewhere.
os.environ.update(
    JAVA_HOME='/Library/Java/JavaVirtualMachines/jdk1.8.0_241.jdk/Contents/Home'
)



In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

# Load the tab-separated reviews file: the first line supplies the column
# names and Spark infers each column's type from the data.
amazon = (
    spark.read
    .options(sep='\t', header=True, inferSchema=True)
    .csv('../amazon_reviews_multilingual_US_v1_00.tsv')
)

### Step two: explore your data

In [3]:
# Look at the data type of each column and whether it is nullable.
# The frame was read with inferSchema=True, so these are the types Spark inferred.
amazon.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: timestamp (nullable = true)



In [4]:
# What are my columns? Displays the column names as a Python list.
amazon.columns

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [5]:
# Preview the dataset: print the first 20 rows as a text table, with long
# cell values cut short. These are show()'s defaults, spelled out explicitly.
amazon.show(n=20, truncate=True)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|   53096384| R63J84G1LOX6R|1563890119|     763187671|The Sandman Vol. ...|           Books|          4|            0|          1|   N|                N|ignore the review...|this is the first...|1995-08-13 00:00:00|
|         US|   53096399|R1BALOA11Z06MT|1559947608|     381720534|The 22 Immutable ...| 

In [None]:
# What happens if we don't include header?
# ("zonder" is Dutch for "without".) With header=False the file's first line
# is read as an ordinary data row and Spark assigns the default column names
# _c0, _c1, ...
# inferSchema is deliberately left off: because the header text ends up as a
# value in every column, inference could only ever produce string columns,
# and it would cost an extra full pass over the file to find that out.
amazon_zonder_header = spark.read.csv(
    '../amazon_reviews_multilingual_US_v1_00.tsv',
    sep='\t',
    header=False,
)
amazon_zonder_header.show()

In [6]:
# The project description only calls for four columns: customer_id,
# review_date, product_category and star_rating. Keep those and discard
# everything else, then preview the narrowed frame.
amazon = amazon.select([
    'customer_id',
    'review_date',
    'product_category',
    'star_rating',
])
amazon.show(n=20, truncate=True)

+-----------+-------------------+----------------+-----------+
|customer_id|        review_date|product_category|star_rating|
+-----------+-------------------+----------------+-----------+
|   53096384|1995-08-13 00:00:00|           Books|          4|
|   53096399|1995-08-17 00:00:00|           Books|          4|
|   53096332|1995-08-30 00:00:00|           Books|          5|
|   53096335|1995-09-11 00:00:00|           Books|          5|
|   51747709|1995-10-17 00:00:00|           Books|          5|
|   53095881|1995-11-02 00:00:00|           Books|          5|
|   53096485|1995-11-11 00:00:00|           Music|          5|
|   53096485|1995-11-11 00:00:00|           Video|          5|
|   53095748|1995-11-15 00:00:00|           Books|          5|
|   53071277|1996-05-12 00:00:00|           Books|          5|
|   53092169|1996-05-15 00:00:00|           Books|          5|
|   53091057|1996-05-17 00:00:00|           Books|          5|
|   53092180|1996-05-17 00:00:00|           Books|     

In [7]:
# Quick summary statistics per column: count, mean, stddev, min and max.
# Note that the timestamp column review_date does not appear in the output,
# and that the non-null count of customer_id is one higher than the others.
amazon.describe().show()


+-------+--------------------+----------------+------------------+
|summary|         customer_id|product_category|       star_rating|
+-------+--------------------+----------------+------------------+
|  count|             6931166|         6931165|           6931165|
|   mean|2.9174252598883938E7|            null| 4.306755934969085|
| stddev|1.5654668366694963E7|            null|1.1461060185900125|
|    min|               10001|         Apparel|                 1|
|    max|            53096589|        Wireless|                 5|
+-------+--------------------+----------------+------------------+



In [8]:
# How many rows does our DataFrame actually contain?
amazon.count()

6931166

In [9]:
# How many null values do we have per column?
# All counts come from one aggregation, so the data is scanned once instead of
# once per column. when() without otherwise() yields null for rows where the
# condition is false, and count() skips nulls, so each aggregate counts
# exactly the rows in which that column is null.
# The loop variable is called `name` rather than `col` so it cannot shadow the
# commonly imported pyspark.sql.functions.col.
null_counts = amazon.select(
    [
        F.count(F.when(amazon[name].isNull(), 1)).alias(name)
        for name in amazon.columns
    ]
).first()
for name in amazon.columns:
    print(name, "with null values: ", null_counts[name])

customer_id with null values:  0
review_date with null values:  615
product_category with null values:  1
star_rating with null values:  1
