In [1]:
# Report the GPU model, driver version and CUDA version visible to this runtime.
!nvidia-smi

Tue May 11 09:57:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# getting JDK because Spark is developed using Scala which requires Java runtime environment
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# downloading and unzipping the Spark
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz

# findspark is a utility that automatically set all the os path and initialize the spark context
!pip install -q findspark

# if you want to use cuda. make sure the version is correct as above
!wget https://repo1.maven.org/maven2/ai/rapids/cudf/0.14/cudf-0.14-cuda10-1.jar

# rapids for spark is nvidia's framework to train ml models on gpu
!wget http://insecure.repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/0.1.0/rapids-4-spark_2.12-0.1.0.jar


In [3]:
# downloading xgboost
# !wget https://repo1.maven.org/maven2/com/nvidia/xgboost4j_3.0/1.0.0-0.1.0/xgboost4j_3.0-1.0.0-0.1.0.jar   
# !wget https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.0.0-0.1.0/xgboost4j-spark_3.0-1.0.0-0.1.0.jar  

--2021-05-11 09:47:44--  https://repo1.maven.org/maven2/com/nvidia/xgboost4j_3.0/1.0.0-0.1.0/xgboost4j_3.0-1.0.0-0.1.0.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 231556205 (221M) [application/java-archive]
Saving to: ‘xgboost4j_3.0-1.0.0-0.1.0.jar’


2021-05-11 09:47:46 (224 MB/s) - ‘xgboost4j_3.0-1.0.0-0.1.0.jar’ saved [231556205/231556205]

--2021-05-11 09:47:46--  https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.0.0-0.1.0/xgboost4j-spark_3.0-1.0.0-0.1.0.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2040779 (1.9M) [application/java-archive]
Saving to: ‘xgboost4j-spark_3.0-1.0.0-0.1.0.jar’


2021-05-11 09:47:46 (61.8 MB/s

In [11]:
# Environment variables that must be in place before findspark / the JVM start.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",    # JDK installed via apt-get
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop2.7",  # unpacked Spark distribution
})

# Jars to ship with the session. With spark-submit these would be passed on the
# command line; in an interactive session they go through PYSPARK_SUBMIT_ARGS.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars {} pyspark-shell'.format(','.join([
    '/content/cudf-0.14-cuda10-1.jar',
    '/content/xgboost4j_3.0-1.0.0-0.1.0.jar',
    '/content/xgboost4j-spark_3.0-1.0.0-0.1.0.jar',
    '/content/rapids-4-spark_2.12-0.1.0.jar',
]))

In [6]:
import findspark 
# Initialise pyspark from the environment; JAVA_HOME and SPARK_HOME must already be set.
findspark.init() # go to the spark and java home to initiate the environment

In [7]:
from google.colab import drive
# Mount Google Drive under /content/gdrive (force_remount=True re-mounts if it is already mounted).
drive.mount("/content/gdrive", force_remount=True)
# Switch the working directory to the dataset folder so data files can be read by relative path.
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7


In [8]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pandas as pd

%matplotlib inline

In [15]:
# Build (or reuse) a local Spark session that uses every available core and has
# the RAPIDS SQL plugin enabled, with GPU memory pooling switched off.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
    .config("spark.rapids.memory.gpu.pooling.enabled", False)
    .getOrCreate()
)

In [16]:
# Load the header-less ratings file, name the first three columns and keep only
# (product, user, rating); the rating column is called `label`.
df = (
    spark.read.csv("ratings_Beauty.csv", header=False, inferSchema=True)
    .withColumnRenamed('_c1', 'product_id')
    .withColumnRenamed('_c0', 'user_id')
    .withColumnRenamed('_c2', 'label')
    .select('product_id', 'user_id', 'label')
)
df.show(5)

+----------+--------------+-----+
|product_id|       user_id|label|
+----------+--------------+-----+
|0205616461|A39HTATAQ9V7YF|  5.0|
|0558925278|A3JM6GV9MNOF9X|  3.0|
|0558925278|A1Z513UWSAAO0F|  5.0|
|0733001998|A1WMRR494NWEWV|  4.0|
|0737104473|A3IAAVS479H7M7|  1.0|
+----------+--------------+-----+
only showing top 5 rows



In [17]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.pipeline import Pipeline

# Map the string product / user ids to numeric index columns
# (`product_idx`, `user_idx`), which the ALS cell uses as item / user columns.
indexer_product = StringIndexer(
    inputCol='product_id',
    outputCol='product_idx',
)
indexer_user = StringIndexer(
    inputCol='user_id',
    outputCol='user_idx',
)

# Fit both indexers on the full ratings frame, then append the index columns.
pre_pipeline = Pipeline(stages=[indexer_product, indexer_user])
pre_pipeline_fitted = pre_pipeline.fit(df)
final_df = pre_pipeline_fitted.transform(df)

final_df.show(5)

+----------+--------------+-----+-----------+--------+
|product_id|       user_id|label|product_idx|user_idx|
+----------+--------------+-----+-----------+--------+
|0205616461|A39HTATAQ9V7YF|  5.0|   145790.0| 70392.0|
|0558925278|A3JM6GV9MNOF9X|  3.0|   103581.0|265306.0|
|0558925278|A1Z513UWSAAO0F|  5.0|   103581.0|552933.0|
|0733001998|A1WMRR494NWEWV|  4.0|   145791.0|536779.0|
|0737104473|A3IAAVS479H7M7|  1.0|   145792.0| 14679.0|
+----------+--------------+-----+-----------+--------+
only showing top 5 rows



In [18]:
# Random 90% / 10% train/test split of the indexed ratings; the seed is fixed at 42.
train_df, test_df = final_df.randomSplit([0.9, 0.1], seed=42)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Explicit-feedback ALS matrix factorisation on the indexed ratings.
als = ALS(maxIter=10,               # Number of ALS iterations
          regParam=0.1,             # Regularization strength
          rank=20,                  # Number of latent factors
          numItemBlocks=10,         # Number of blocks the items are partitioned into
          alpha=0.001,              # Implicit-feedback confidence; unused while implicitPrefs is False
          userCol='user_idx',
          itemCol='product_idx',
          ratingCol='label',
          # A random split leaves test users/items that never appear in training.
          # The default strategy ('nan') gives them NaN predictions, which makes the
          # RMSE NaN; 'drop' removes those rows before evaluation.
          coldStartStrategy='drop',
          seed=42)                  # Reproducible factor initialisation (same seed as the split)
model = als.fit(train_df)

# NOTE: the timer below covers scoring and evaluation only, not the training above.
import time
tic = time.time()
predictions = model.transform(test_df)
predictions.show(5)
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='label',
                                predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('RMSE: {:.4f}'.format(rmse))

toc = time.time()
print('Total time: {:.2f} seconds'.format(toc-tic))