## Зададим начальные настройки

In [1]:
#pers

## Создадим контекст

In [2]:
import os
import sys

# Environment consumed by the PySpark launcher: submit arguments, the Python
# interpreter to use, and the Spark installation to start.
os.environ["PYSPARK_SUBMIT_ARGS"]='pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j bridge importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run pyspark's interactive shell bootstrap in this namespace; the banner printed
# below the cell reports that it makes `spark` available.
# The file is opened in a `with` block so the handle is closed deterministically,
# and compiled with its real path so tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
# Tunables for this session, named so they are easy to spot and adjust.
SHUFFLE_PARTITIONS = 50
EXECUTOR_CORES = 4
CHECKPOINT_DIR = 'checkpoint/'

spark.conf.set("spark.sql.shuffle.partitions", SHUFFLE_PARTITIONS)
# NOTE(review): executor cores are normally fixed when executors are launched;
# confirm that setting this on an already-running session has any effect.
spark.conf.set('spark.executor.cores', EXECUTOR_CORES)
# Directory used by Spark when RDD/DataFrame lineage is checkpointed.
sc.setCheckpointDir(CHECKPOINT_DIR)

In [4]:
# Display the SparkContext configuration as a list of (key, value) pairs.
sc.getConf().getAll()

[('spark.history.kerberos.keytab', 'none'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.dynamicAllocation.maxExecutors', '14'),
 ('spark.history.ui.port', '18081'),
 ('spark.driver.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.history.fs.cleaner.interval', '7d'),
 ('spark.shuffle.io.serverThreads', '128'),
 ('spark.executor.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
 ('spark.driver.port', '39985'),
 ('spark.executorEnv.PYTHONPATH',
  '{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-0.10.7-src.zip'),
 ('spark.shuffle.file.buffer', '1m'),
 ('spark.sql.hive.convertMetastoreOrc', 'true'),
 ('spark.yarn.historyServer.address', 'master.cluster-lab.com:18081'),
 ('spark.sql.autoBroadcastJoinThreshold', '26214400'),
 ('spark.ui.filters',
  'org.apache.hadoop.yarn.server

## Импортируем библиотеки

In [5]:
import pandas as pd
import os
import numpy as np
import json,codecs
import math
from pyspark.sql import functions as f
from pyspark.sql.functions import col, expr, when, desc,sum,count,udf,countDistinct
from pyspark.sql.types import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## Загружаем в DataFrame и подписываем данные

In [6]:
lab_data = '/labs/lab09data/'

# train.csv and test.csv are read with the same three-column layout, so the schema
# is declared once, explicitly, and shared by both reads.
ratings_schema = StructType([
    StructField('userId', IntegerType(), True),
    StructField('movieId', IntegerType(), True),
    StructField('rating', DoubleType(), True),
])
# Kept under the original name for any cell that still refers to `schema`.
schema = ratings_schema


def read_ratings(file_name, data_dir=lab_data, file_schema=ratings_schema):
    """Read a comma-separated ratings file with a header row into a DataFrame.

    Args:
        file_name: File name relative to ``data_dir`` (e.g. ``'train.csv'``).
        data_dir: Directory that holds the lab data files.
        file_schema: Schema applied to the file; defaults to the ratings schema.

    Returns:
        A Spark DataFrame with columns userId, movieId and rating.
    """
    return spark.read.csv(os.path.join(data_dir, file_name),
                          schema=file_schema, sep=",", header=True)


train = read_ratings('train.csv')

# Disabled loaders for the auxiliary files; they are not used by the ALS model below.
#schema = StructType([StructField(str(i), j, True) for i,j in zip(('userId','movieId','tags'),(IntegerType(),IntegerType(),StringType()))])
#tags = spark.read.csv(os.path.join(lab_data,'tags.csv'),schema=schema,sep=",",header = True)

#schema = StructType([StructField(str(i), j, True) for i,j in zip(('movieId','title','genres'),(IntegerType(),StringType(),StringType()))])
#movies = spark.read.csv(os.path.join(lab_data,'movies.csv'),schema=schema,sep=",",header = True)

#schema = StructType([StructField(str(i), j, True) for i,j in zip(('movieId','imdbId','tmdbId'),(IntegerType(),StringType(),StringType()))])
#links = spark.read.csv(os.path.join(lab_data,'links.csv'),schema=schema,sep=",",header = True)

test = read_ratings('test.csv')

## Перераспределяем и кэшируем DataFrame'ы

In [7]:
# Redistribute both rating sets over a fixed number of partitions and mark them
# for caching so later cells reuse them instead of re-reading the CSV files.
N_PARTITIONS = 4

train = train.repartition(N_PARTITIONS).cache()
test = test.repartition(N_PARTITIONS).cache()

# The auxiliary frames (tags, movies, links) are not loaded, so the same step
# stays disabled for them:
#tags = tags.repartition(4).cache()
#movies = movies.repartition(4).cache()
#links = links.repartition(4).cache()

In [8]:
# Preview five training rows as a pandas DataFrame.
train.limit(5).toPandas()

Unnamed: 0,userId,movieId,rating
0,127374,11181,4.0
1,197777,13429,4.5
2,122271,19058,5.0
3,142930,11854,3.0
4,206377,21728,3.5


In [9]:
#tags.limit(5).toPandas()

In [10]:
#movies.limit(5).toPandas()

In [11]:
#links.limit(5).toPandas()

In [12]:
# Preview five test rows as a pandas DataFrame.
# NOTE(review): the sampled ratings are all 0.0, which looks like a placeholder
# column to be filled with predictions - confirm against the lab description.
test.limit(5).toPandas()

Unnamed: 0,userId,movieId,rating
0,172982,19123,0.0
1,185782,5639,0.0
2,175982,24293,0.0
3,168451,6203,0.0
4,221456,25894,0.0


## Обучим модель

In [13]:
# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" ensures we don't get NaN evaluation metrics.
# NOTE(review): "drop" presumably also removes unseen users/movies from the
# transform() output, which the later left join compensates for - confirm.
# A fixed seed makes the stochastic fit reproducible across kernel restarts.
als = ALS(maxIter=40, regParam=0.07, rank=50 , userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(train)

# Score the test set with the baseline model. No RMSE is computed in this cell.
# NOTE(review): the previewed test ratings are all 0.0, so an RMSE against `test`
# would not be meaningful - confirm before adding one.
predictions = model.transform(test)

# RMSE evaluator comparing `prediction` with `rating`; it is handed to the
# CrossValidator further down to score each parameter combination.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

In [14]:
# Hyper-parameter search space for ALS: 3 x 3 x 3 = 27 combinations.
# Parameters are registered in the same order as before (maxIter, rank, regParam).
search_space = [
    (als.maxIter, [20, 30, 40]),
    (als.rank, [50, 75, 100]),
    (als.regParam, [0.03, 0.05, 0.07]),
]

grid_builder = ParamGridBuilder()
for grid_param, candidate_values in search_space:
    grid_builder.addGrid(grid_param, candidate_values)

paramGrid = grid_builder.build()

In [15]:
# 3-fold cross-validation of ALS over every entry of paramGrid, scored by `evaluator`.
# parallelism is the number of models fitted concurrently. The previous value (99)
# exceeded the size of the grid and could launch every ALS fit at once; it is now
# capped at 10, never above the grid size. This affects resource usage only, not
# the metrics.
# A fixed seed makes the fold assignment reproducible between runs.
crossval = CrossValidator(estimator=als, estimatorParamMaps=paramGrid,
                          evaluator=evaluator, numFolds=3,
                          parallelism=min(len(paramGrid), 10), seed=42)

In [None]:
# Run the cross-validated grid search on the training data. This is the expensive
# step of the notebook: every parameter combination is fitted once per fold.
cv_model = crossval.fit(train)

In [17]:
# Average cross-validation metric (RMSE, per the evaluator) - one value per
# parameter combination in paramGrid; lower is better.
cv_model.avgMetrics

[0.9216894199694399,
 0.8651745398143187,
 0.8435073569347568,
 0.9264123317440429,
 0.866988759794578,
 0.8442870062876049,
 0.9265173101969766,
 0.867807067527653,
 0.8450076412030472,
 0.9053741122592653,
 0.8554760904046129,
 0.8381140200343042,
 0.9075050084825871,
 0.8559874304346491,
 0.8385056773940537,
 0.9062908877472091,
 0.8559157266918556,
 0.8387939616954758,
 0.8968536754413168,
 0.851383794898473,
 0.8362002519212042,
 0.8975834928749953,
 0.8513727711530845,
 0.8364791793543016,
 0.8956230639743243,
 0.850916346500108,
 0.8366150334789586]

In [18]:
# Rank (number of latent factors) of the model selected by cross-validation.
cv_model.bestModel.rank

50

In [19]:
# Reach through the wrapped JVM object to its parent (the ALS estimator that
# produced the best model) and read the maxIter it was trained with.
best_estimator_jvm = cv_model.bestModel._java_obj.parent()
best_estimator_jvm.getMaxIter()

40

In [20]:
# Reach through the wrapped JVM object to its parent (the ALS estimator that
# produced the best model) and read the regParam it was trained with.
best_estimator_jvm = cv_model.bestModel._java_obj.parent()
best_estimator_jvm.getRegParam()

0.07

In [21]:
# Score the test set with the best cross-validated model; this replaces the
# `predictions` produced earlier by the baseline model.
# NOTE(review): with coldStartStrategy="drop" some test rows are presumably
# missing from the result - the later left join appears to handle that; confirm.
predictions = cv_model.bestModel.transform(test)

## Извлечение результатов

In [22]:
# First compute the mean prediction; it is the value substituted for ratings
# the model did not produce.
predictions.createOrReplaceTempView('predictions')

mean_prediction_sql = "select avg(prediction) as rating from predictions where prediction !='NaN'"
avg = spark.sql(mean_prediction_sql)
avg.toPandas()

Unnamed: 0,rating
0,3.422784


In [23]:
# Fill the gaps: every test row gets a rating, using the model prediction where one
# exists and the mean prediction otherwise.
predictions.createOrReplaceTempView('predictions')
test.createOrReplaceTempView('test')

# Derive the fallback from the current predictions instead of hard-coding a number
# copied from an earlier output, so it stays correct when the model changes.
# NaN predictions are excluded so they cannot poison the mean.
FALLBACK_RATING = 3.4  # used only if no prediction is available at all
mean_prediction = (predictions
                   .where(~f.isnan('prediction'))
                   .agg(f.avg('prediction'))
                   .first()[0])
if mean_prediction is None:
    mean_prediction = FALLBACK_RATING

# The left join keeps every test row; results are rounded to one decimal and sorted
# by user and movie. Only a float is formatted into the query text.
output = spark.sql('select t.userId,t.movieId, round (nvl(p.prediction,{fallback}),1) as rating \
                    from test t left join predictions p on p.userId=t.userId and p.movieId=t.movieId\
                    order by t.userId, t.movieId'.format(fallback=float(mean_prediction))).coalesce(1)

In [24]:
# Preview the first five rows of the final result.
output.limit(5).toPandas()

Unnamed: 0,userId,movieId,rating
0,1,1414,3.8
1,1,2346,3.7
2,1,5278,3.0
3,1,9303,4.1
4,1,11817,4.5


In [25]:
 # Collect the result to the driver as a pandas DataFrame and write it to CSV
 # without the index column.
 # NOTE(review): this line carries a stray leading space; IPython tolerates it,
 # but it would be an IndentationError in a plain Python script.
 # NOTE(review): the destination is an absolute, user-specific path.
 output.toPandas().to_csv('/data/home/sergey.antonov/lab09.csv',index=False)

In [26]:
# Write the same result to a second CSV file (lab09s.csv), again without the index.
# `output` is collected to pandas once more here, which re-runs the Spark query.
submission_pdf = output.toPandas()
submission_pdf.to_csv('/data/home/sergey.antonov/lab09s.csv', index=False)

In [62]:
# Shut down the SparkContext once the results have been written.
sc.stop()