## Necessary imports

In [1]:
import os
import sys
import numpy as np
import scipy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.feature import StringIndexer

from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
! pwd
! ls -la
! head -n 3 amazon_item_ratings.csv
! tail -n 3 amazon_item_ratings.csv

/home/big/Projeto/ABD-Projeto
total 673860
drwxrwxr-x 8 big big      4096 mai 23 00:42  .
drwxrwxr-x 3 big big      4096 mai 18 20:44  ..
drwxr-xr-x 2 big big      4096 mai 22 20:55  ALSmodel
-rw-rw-r-- 1 big big 689932433 mai 18 20:43  amazon_item_ratings.csv
-rw-rw-r-- 1 big big     17582 mai 23 00:42 'Data Preparation.ipynb'
-rw-rw-r-- 1 big big       680 mai 23 00:20  derby.log
drwxrwxr-x 8 big big      4096 mai 18 20:39  .git
drwxrwxr-x 2 big big      4096 mai 23 00:00  .ipynb_checkpoints
drwxr-xr-x 2 big big      4096 mai 23 00:18  items.parquet
drwxrwxr-x 5 big big      4096 mai 23 00:20  metastore_db
-rw-rw-r-- 1 big big     18564 mai 18 22:55  Pre-Computing.ipynb
-rw-rw-r-- 1 big big        12 mai 17 20:11  README.md
-rw-rw-r-- 1 big big      9741 mai 19 00:22  Results.ipynb
drwxrwxr-x 4 big big      4096 mai 23 00:20  spark-warehouse
A3AF8FFZAZYNE5,0000000078,5.0,1092182400
A2X4DOBWXXTX4A,1615725415,5.0,1327536000
A202HM75ZHSEGJ,B0059XTUB8,5.0,1335830400
A2V1XSFJL9BI3,0687038

## Reading from file

In [4]:
df_items = spark.read.csv("amazon_item_ratings.csv", header=False, inferSchema=True, sep=",") 

Get a fraction of the sample

In [5]:
df_items = df_items.sample(fraction=0.05) #0.05 works

## Multiple checks on structure

In [6]:
df_items.printSchema()
df_items.count()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: integer (nullable = true)



826913

In [7]:
#Check null values
df_items.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_items.columns]).show()

+---+---+---+---+
|_c0|_c1|_c2|_c3|
+---+---+---+---+
|  0|  0|  0|  0|
+---+---+---+---+



In [8]:
df_items.show(10, truncate=False)

+--------------+----------+---+----------+
|_c0           |_c1       |_c2|_c3       |
+--------------+----------+---+----------+
|A3AF8FFZAZYNE5|0000000078|5.0|1092182400|
|AQS1YFK3BU5B0 |B007PQTYIG|5.0|1394236800|
|A18ZANDD9X72AU|0735623872|5.0|1205539200|
|A2BNNT9DX50HF8|030740515X|4.0|1231977600|
|ACYLYFJIKE501 |B003HAL5ZO|1.0|1291766400|
|A2WZL2M468O251|0399536957|3.0|1318636800|
|A22UXNXX4MX0MD|B004HO58UW|2.0|1362009600|
|A1HPDC0DRN1KYU|B003JQLG4Q|4.0|1326758400|
|A1S63VUDE2YJVU|B0041MUB52|4.0|1388102400|
|A1AS5N0JB24OIX|B009XZ9Q1C|5.0|1356825600|
+--------------+----------+---+----------+
only showing top 10 rows



In [9]:
df_items.describe("_c2").show()


+-------+------------------+
|summary|               _c2|
+-------+------------------+
|  count|            826913|
|   mean| 4.162625330597052|
| stddev|1.2614705476479164|
|    min|               1.0|
|    max|               5.0|
+-------+------------------+



# Indexing

In [10]:
indexerItem = StringIndexer(inputCol="_c1", outputCol="Item_Index")
df_items_index_users = indexerItem.fit(df_items).transform(df_items)
df_items_index_users.select("_c1","Item_Index").show(truncate=False)

del df_items

+----------+----------+
|_c1       |Item_Index|
+----------+----------+
|0000000078|109757.0  |
|B007PQTYIG|442985.0  |
|0735623872|145094.0  |
|030740515X|7460.0    |
|B003HAL5ZO|360858.0  |
|0399536957|129965.0  |
|B004HO58UW|41982.0   |
|B003JQLG4Q|362137.0  |
|B0041MUB52|373762.0  |
|B009XZ9Q1C|46018.0   |
|B007ZJ1M9C|447558.0  |
|B003A845OQ|356169.0  |
|0972973052|167851.0  |
|B002P3YRAY|84633.0   |
|0888550081|162628.0  |
|0521697522|136584.0  |
|B0049LUI9O|523.0     |
|B00ATSSQT0|483915.0  |
|B000HDK0DC|8264.0    |
|B00DNUF7KW|5786.0    |
+----------+----------+
only showing top 20 rows



In [11]:
indexerUsers = StringIndexer(inputCol="_c0", outputCol="User_Index")
df_items_indexed = indexerUsers.fit(df_items_index_users).transform(df_items_index_users)
df_items_indexed.select("_c0","User_Index").show(truncate=False)

del df_items_index_users

+--------------+----------+
|_c0           |User_Index|
+--------------+----------+
|A3AF8FFZAZYNE5|464847.0  |
|AQS1YFK3BU5B0 |683735.0  |
|A18ZANDD9X72AU|104574.0  |
|A2BNNT9DX50HF8|294217.0  |
|ACYLYFJIKE501 |615874.0  |
|A2WZL2M468O251|398660.0  |
|A22UXNXX4MX0MD|250799.0  |
|A1HPDC0DRN1KYU|147285.0  |
|A1S63VUDE2YJVU|198773.0  |
|A1AS5N0JB24OIX|113523.0  |
|A10QUCL9MVZKFQ|15119.0   |
|A2AMXP8LOGKR11|289138.0  |
|AAA3K49BH2WZC |602545.0  |
|A1IBLRMHYAIHC0|150247.0  |
|ABSL9JJZHACH7 |610088.0  |
|A1TTCRUKCSIJ68|206748.0  |
|ALXBWHZKR64XU |659803.0  |
|A2QR25WYO60HA6|368289.0  |
|ATN0EAYQ78LUK |697793.0  |
|A2IN9EXJXRR1DA|328534.0  |
+--------------+----------+
only showing top 20 rows



## Cleaning

In [12]:
df_items_indexed.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- Item_Index: double (nullable = false)
 |-- User_Index: double (nullable = false)



In [13]:
df_items_indexed = df_items_indexed.drop("_c0","_c1") 

In [14]:
df_items_indexed.show(10, truncate=False)

+---+----------+----------+----------+
|_c2|_c3       |Item_Index|User_Index|
+---+----------+----------+----------+
|5.0|1092182400|109757.0  |464847.0  |
|5.0|1394236800|442985.0  |683735.0  |
|5.0|1205539200|145094.0  |104574.0  |
|4.0|1231977600|7460.0    |294217.0  |
|1.0|1291766400|360858.0  |615874.0  |
|3.0|1318636800|129965.0  |398660.0  |
|2.0|1362009600|41982.0   |250799.0  |
|4.0|1326758400|362137.0  |147285.0  |
|4.0|1388102400|373762.0  |198773.0  |
|5.0|1356825600|46018.0   |113523.0  |
+---+----------+----------+----------+
only showing top 10 rows



## Store data

In [15]:
output_items = "items.parquet"
df_items_indexed.write.mode("overwrite").parquet(output_items)

In [18]:

df_items_indexed.write.mode("overwrite").saveAsTable("ItemsTable")

AnalysisException: Can not create the managed table('`ItemsTable`'). The associated location('file:/home/big/Desktop/Aulas/Projeto/spark-warehouse/itemstable') already exists.;