In [None]:
import os
import sys
import numpy as np
import scipy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Blanket filter: suppresses every Python warning for the rest of the session
# (no category or module is specified, so nothing is exempt).
warnings.filterwarnings("ignore")

In [6]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

In [7]:
# Memory budget applied to both the executor and the driver settings below.
MAX_MEMORY = "10g"

# Build the session (or reuse one that already exists) with the memory
# settings above.
spark = (
    SparkSession.builder
    .appName("Foo")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

In [8]:
! pwd
! ls -la
! head -n 3 amazon_item_ratings.csv
! tail -n 3 amazon_item_ratings.csv

/home/big/Desktop/Aulas/Projeto
total 677344
drwxrwxr-x 7 big big      4096 mai 18 18:46  .
drwxrwxr-x 3 big big      4096 mai 18 15:59  ..
drwxr-xr-x 5 big big      4096 mai 18 17:31  ALSmodel
-rwxrw-rw- 1 big big 689932433 mai 18 15:22  amazon_item_ratings.csv
-rw-rw-r-- 1 big big     35678 mai 18 18:45 'Data Preparation.ipynb'
-rw-rw-r-- 1 big big   3568168 mai 18 18:45  derby.log
drwxrwxr-x 2 big big      4096 mai 18 17:51  .ipynb_checkpoints
drwxr-xr-x 2 big big      4096 mai 18 17:31  items.parquet
drwxrwxr-x 5 big big      4096 mai 18 18:45  metastore_db
-rw-rw-r-- 1 big big     22995 mai 18 18:46  Pre-Computing.ipynb
drwxr-xr-x 5 big big      4096 mai 18 17:32  spark-warehouse
A3AF8FFZAZYNE5,0000000078,5.0,1092182400
A2X4DOBWXXTX4A,1615725415,5.0,1327536000
A202HM75ZHSEGJ,B0059XTUB8,5.0,1335830400
A2V1XSFJL9BI3,0687038014,4.0,1312502400
A10O53IQXHUTMK,B0069QFDRO,5.0,1382054400
A124STU3GXMCGV,0131985701,5.0,1316390400


In [9]:
# Explicit schema instead of inferSchema=True: schema inference needs an
# extra pass over the whole input file just to guess the types, and the
# result depends on the data. The types below are the ones inference
# previously produced (string, string, double, integer). Column names keep
# Spark's default _cN form because later cells refer to them by those names.
ratings_schema = StructType([
    StructField("_c0", StringType(), True),   # indexed later as User_Index
    StructField("_c1", StringType(), True),   # indexed later as Item_Index
    StructField("_c2", DoubleType(), True),   # presumably the rating (1.0-5.0) -- TODO confirm
    StructField("_c3", IntegerType(), True),  # looks like a Unix timestamp in seconds -- TODO confirm
])

df_items = spark.read.csv(
    "amazon_item_ratings.csv",
    header=False,
    schema=ratings_schema,
    sep=",",
)

In [10]:
# Keep ~0.1% of the rows to make the rest of the notebook cheap to run.
# A fixed seed makes the subsample (and therefore the row count, the index
# values and the saved outputs) reproducible across kernel restarts.
# NOTE: this rebinds df_items, so re-running this cell without re-running
# the load cell samples the already-sampled frame again.
df_items = df_items.sample(withReplacement=False, fraction=0.001, seed=42)

In [11]:
# Print the column names/types of the sampled frame, then display its
# row count (the last expression is rendered as the cell output).
df_items.printSchema()
df_items.count()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: integer (nullable = true)



16304

In [12]:
# For every column, count the rows whose value is NULL or NaN and show the
# totals as a single-row table (one output column per input column).
df_items.select(
    *(
        count(when(col(name).isNull() | isnan(name), name)).alias(name)
        for name in df_items.columns
    )
).show()

+---+---+---+---+
|_c0|_c1|_c2|_c3|
+---+---+---+---+
|  0|  0|  0|  0|
+---+---+---+---+



In [13]:
# Preview the first 10 rows without truncating long string values.
df_items.show(n=10, truncate=False)

+--------------+----------+---+----------+
|_c0           |_c1       |_c2|_c3       |
+--------------+----------+---+----------+
|AO6UTGZMS4FJ0 |B000F9YN22|5.0|1353888000|
|A2428WP7FV83IV|0870003844|5.0|1295049600|
|A2P1XEYTORCBFK|B00BI4J0S0|2.0|1362441600|
|A2UXRIBP5ZPEM4|B000W275JK|5.0|1352332800|
|ARMVAHWQQ9S8A |0026009102|5.0|1270166400|
|A24VXHFVPBO0UJ|B000GYSZ8Y|5.0|1293753600|
|A3TC3BCKUYYIKX|0192880039|5.0|1190505600|
|A2S6XTE1VWOJKP|B009SD9IB2|4.0|1395273600|
|A1F9QCTB7SAOR |B001USNEDQ|5.0|1351123200|
|A3G6JEPMHP8H1W|B0069SQSQ2|5.0|1355961600|
+--------------+----------+---+----------+
only showing top 10 rows



In [14]:
# Summary statistics (count, mean, stddev, min, max) for column _c2.
df_items.describe("_c2").show()


+-------+------------------+
|summary|               _c2|
+-------+------------------+
|  count|             16304|
|   mean| 4.170571638861629|
| stddev|1.2523336560631257|
|    min|               1.0|
|    max|               5.0|
+-------+------------------+



In [15]:
# ALS works on numeric ids, so map the string identifiers to numeric
# indices: _c1 -> Item_Index and _c0 -> User_Index.
item_indexer = StringIndexer(inputCol="_c1", outputCol="Item_Index")
user_indexer = StringIndexer(inputCol="_c0", outputCol="User_Index")
indexers = [item_indexer, user_indexer]

# Fit both indexers in one pipeline, then append the two index columns.
pipeline = Pipeline(stages=indexers)
indexing_model = pipeline.fit(df_items)
df_items_indexed = indexing_model.transform(df_items)

df_items_indexed.show()

+--------------+----------+---+----------+----------+----------+
|           _c0|       _c1|_c2|       _c3|Item_Index|User_Index|
+--------------+----------+---+----------+----------+----------+
| AO6UTGZMS4FJ0|B000F9YN22|5.0|1353888000|    6028.0|   14777.0|
|A2428WP7FV83IV|0870003844|5.0|1295049600|    2433.0|    4873.0|
|A2P1XEYTORCBFK|B00BI4J0S0|2.0|1362441600|   14287.0|    7321.0|
|A2UXRIBP5ZPEM4|B000W275JK|5.0|1352332800|    6979.0|    8001.0|
| ARMVAHWQQ9S8A|0026009102|5.0|1270166400|      81.0|   15175.0|
|A24VXHFVPBO0UJ|B000GYSZ8Y|5.0|1293753600|    6194.0|    4962.0|
|A3TC3BCKUYYIKX|0192880039|5.0|1190505600|     775.0|   11981.0|
|A2S6XTE1VWOJKP|B009SD9IB2|4.0|1395273600|   13697.0|    7703.0|
| A1F9QCTB7SAOR|B001USNEDQ|5.0|1351123200|    8271.0|    1875.0|
|A3G6JEPMHP8H1W|B0069SQSQ2|5.0|1355961600|   11937.0|   10479.0|
|A2V3WNZ06H31II|0446678805|5.0|1367366400|    1482.0|    8025.0|
| ASYKUKD5MW46V|B004RJQ9X2|3.0|1370649600|   10756.0|      77.0|
| A4G1D7DU005YO|B008JGSM6

In [None]:
# NOTE: disabled earlier attempt that applied the two StringIndexers one at a
# time; kept for reference only.
#indexerItem = StringIndexer(inputCol="_c1", outputCol="Item_Index")
#df_items_index_users = indexerItem.fit(df_items).transform(df_items)
#df_items_index_users.select("_c1","Item_Index").show(truncate=False)

#indexerUsers = StringIndexer(inputCol="_c0", outputCol="User-Index")
#df_items_indexed = indexerUsers.fit(df_items_index_users).transform(df_items_index_users)
#df_items_indexed.select("_c0","User-Index").show(truncate=False)

In [13]:
# Unbind the name df_items now that df_items_indexed exists.
# NOTE: after this runs, any earlier cell that references df_items raises
# NameError until the load cell is executed again.
del df_items

In [12]:
# Print the schema of the indexed frame (original columns plus the two
# index columns added by the StringIndexer pipeline).
df_items_indexed.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- Item_Index: double (nullable = false)
 |-- User_Index: double (nullable = false)



In [14]:
# The raw string identifiers are no longer needed once their numeric
# indices exist, so remove them from the frame.
raw_id_columns = ("_c0", "_c1")
df_items_indexed = df_items_indexed.drop(*raw_id_columns)

In [15]:
# Preview the first 10 rows of the indexed frame without truncation.
df_items_indexed.show(n=10, truncate=False)

+---+----------+----------+----------+
|_c2|_c3       |Item_Index|User_Index|
+---+----------+----------+----------+
|1.0|1383177600|13641.0   |2468.0    |
|5.0|1399766400|87.0      |7173.0    |
|3.0|1362355200|2141.0    |5774.0    |
|5.0|1362355200|12924.0   |13106.0   |
|4.0|1388102400|6212.0    |6630.0    |
|5.0|1356220800|4334.0    |4996.0    |
|4.0|1403049600|8442.0    |9019.0    |
|5.0|1388966400|5493.0    |8828.0    |
|1.0|1337817600|12333.0   |9136.0    |
|4.0|1228521600|940.0     |15225.0   |
+---+----------+----------+----------+
only showing top 10 rows



In [69]:
# Print the schema of the frame that is about to be written out.
df_items_indexed.printSchema()

root
 |-- _c2: double (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- Item_Index: double (nullable = false)
 |-- User_Index: double (nullable = false)



In [70]:
# Persist the indexed ratings as Parquet, replacing any previous export
# at the same path.
output_items = "items.parquet"
df_items_indexed.write.parquet(output_items, mode="overwrite")

In [71]:
# Also save the indexed ratings as the table "ItemsTable", replacing any
# existing table of that name.
df_items_indexed.write.saveAsTable("ItemsTable", mode="overwrite")