## Зададим начальные настройки

In [1]:
# Tunable constants for the rating formulas computed further down:
#   k - weight given to the global mean rating in the corrected rating (rate_corr)
#   z - multiplier used in the lower interval bound (int_low)
k, z = 20, 1.96

## Создадим контекст

In [2]:
import os
import sys

# Point this kernel at the cluster's Spark 2 client and Python interpreter.
os.environ["PYSPARK_SUBMIT_ARGS"] = 'pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

# SPARK_HOME was assigned just above, so an "is it set" test can never fail;
# verify instead that the directory really exists before touching sys.path.
spark_home = os.environ["SPARK_HOME"]
if not os.path.isdir(spark_home):
    raise ValueError('SPARK_HOME does not point to an existing directory: ' + spark_home)

# Make the bundled pyspark and py4j packages importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive shell bootstrap in this namespace; the later cells
# use the `sc` and `sqlContext` names it provides. The file is opened in a
# `with` block so the handle is closed, and compiled with its real path so
# tracebacks point at shell.py.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(compile(shell_file.read(), shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
# Display every (key, value) pair of the running SparkContext's configuration.
sc.getConf().getAll()

[('spark.history.kerberos.keytab', 'none'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.dynamicAllocation.maxExecutors', '14'),
 ('spark.history.ui.port', '18081'),
 ('spark.driver.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.history.fs.cleaner.interval', '7d'),
 ('spark.shuffle.io.serverThreads', '128'),
 ('spark.executor.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
 ('spark.executorEnv.PYTHONPATH',
  '{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-0.10.7-src.zip'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',
  'http://master.cluster-lab.com:8088/proxy/application_1558598905289_0058'),
 ('spark.shuffle.file.buffer', '1m'),
 ('spark.sql.hive.convertMetastoreOrc', 'true'),
 ('spark.driver.appUIAddress', 'http://mast

## Импортируем библиотеки

In [None]:
import pandas as pd
import os
import numpy as np
import json,codecs
import math
from pyspark.sql import functions as F
from pyspark.sql.functions import col, expr, when, desc,sum,count,udf

from pyspark.sql import SQLContext


## Загружаем в RDD

In [5]:
# Directory that holds the lab's input files (u.data, u.item).
lab_data = '/labs/lab06data/ml-100k'


def _read_delimited(file_name, delimiter):
    """Load ``file_name`` from ``lab_data`` as an RDD of lines split on ``delimiter``."""
    raw_lines = sc.textFile(os.path.join(lab_data, file_name))
    return raw_lines.map(lambda line: line.split(delimiter))


data = _read_delimited('u.data', "\t")   # tab-separated fields
item = _read_delimited('u.item', "|")    # pipe-separated fields


## Преобразуем в DataFrame и подписываем данные

In [6]:
from pyspark.sql.types import StringType, StructField, StructType


def _all_strings_schema(column_names):
    """Build a schema of nullable string columns, one per name, in the given order."""
    return StructType([StructField(name, StringType(), True) for name in column_names])


# Column names applied to the split fields of u.data, in field order.
data_columns = ('user id', 'item id', 'rating', 'timestamp')

# Column names applied to the split fields of u.item, in field order.
item_columns = (
    'movie id', 'movie title', 'release date', 'video release date',
    'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
    "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western',
)

# Every field is kept as a string at this stage.
schema = _all_strings_schema(data_columns)
data = sqlContext.createDataFrame(data, schema)

schema = _all_strings_schema(item_columns)
item = sqlContext.createDataFrame(item, schema)

## Создаем временные таблицы для обращения через SQL

In [7]:
# Register both DataFrames as temp views so SQL queries can refer to them by name.
for view_name, frame in (("data", data), ("item", item)):
    frame.createOrReplaceTempView(view_name)

In [8]:
# Preview five rows of the ratings DataFrame (limit(5) already caps the row count).
data.limit(5).toPandas()

Unnamed: 0,user id,item id,rating,timestamp
0,42,523,5,881107375
1,286,741,4,876521887
2,436,1058,4,887770547
3,514,48,4,875318114
4,621,584,5,874965094


In [9]:
# Preview five rows of the movie DataFrame (limit(5) already caps the row count).
item.limit(5).toPandas()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Считаем формулы

In [10]:
# Global mean rating: sum of all ratings divided by the number of rating rows.
# Both aggregates are evaluated in a single Spark job (the original ran two
# separate select/collect round-trips), and they are taken from the `F`
# namespace so this cell does not depend on `sum`/`count` being imported over
# the Python built-ins.
rate_avg = data.agg(F.sum('rating') / F.count('user id')).first()[0]
rate_avg

3.52986

In [11]:
# Per-movie statistics computed in Spark SQL over the `data` and `item` temp views:
#   count      - number of ratings of the movie
#   avg_rating - plain mean rating
#   pos_cnt    - number of ratings >= 4 (counted as "positive")
#   pos_perc   - share of positive ratings
#   rate_corr  - mean rating after adding k pseudo-ratings equal to the global mean
#   int_low    - lower bound of the Wilson score interval for pos_perc, multiplier z
#
# The Python-side constants are injected through *named* placeholders, so the
# argument order of format() can no longer drift out of sync with the SQL text.
# The query runs on the same `sqlContext` that created the DataFrames, so the
# temp views are visible without constructing a second SQLContext.
ratings_sql = """
    with src as (
        select `user id`, `item id`, `rating`,
               IF(rating >= 4, 1, 0) as positive,
               {k} as k, {rate_avg} as rate_avg, {z} as z
        from data
    ),
    agg as (
        select `item id`,
               count(`user id`) as count,
               sum(`rating`) / count(`user id`) as avg_rating,
               sum(positive) as pos_cnt,
               sum(positive) / count(`user id`) as pos_perc,
               (sum(`rating`) + k * rate_avg) / (count(`user id`) + k) as rate_corr,
               k,
               rate_avg,
               z
        from src
        group by `item id`, k, rate_avg, z
    )
    select `movie title`,
           int(`item id`) as `item id`,
           count, avg_rating, pos_cnt, pos_perc, rate_corr,
           (pos_perc + z * z / (2 * count)
            - z * sqrt(pos_perc * (1 - pos_perc) / count + z * z / (4 * count * count)))
           / (1 + z * z / count) as int_low
    from agg join item on agg.`item id` = item.`movie id`
""".format(k=k, rate_avg=rate_avg, z=z)

ratings = sqlContext.sql(ratings_sql)
ratings.limit(5).toPandas()

Unnamed: 0,movie title,item id,count,avg_rating,pos_cnt,pos_perc,rate_corr,int_low
0,Top Hat (1935),1203,21,4.047619,15,0.714286,3.795054,0.500432
1,"Last Klezmer: Leopold Kozlowski, His Life and ...",1331,4,3.0,2,0.5,3.44155,0.150036
2,Nightwatch (1997),1625,1,4.0,1,1.0,3.552248,0.206543
3,Little City (1998),1656,2,3.5,1,0.5,3.527145,0.094529
4,Wag the Dog (1997),347,137,3.510949,78,0.569343,3.513358,0.485651


## Проверяем результат

In [12]:
# Ten most-rated movies; ties are broken alphabetically by title.
ratings.orderBy(F.desc('count'), 'movie title').limit(10).toPandas()

Unnamed: 0,movie title,item id,count,avg_rating,pos_cnt,pos_perc,rate_corr,int_low
0,Star Wars (1977),50,583,4.358491,501,0.859348,4.331007,0.828769
1,Contact (1997),258,509,3.803536,344,0.675835,3.793189,0.633986
2,Fargo (1996),100,508,4.155512,406,0.799213,4.131813,0.76219
3,Return of the Jedi (1983),181,507,4.00789,379,0.747535,3.989748,0.707954
4,Liar Liar (1997),294,485,3.156701,197,0.406186,3.17148,0.363379
5,"English Patient, The (1996)",286,481,3.656965,298,0.619543,3.651891,0.575369
6,Scream (1996),288,478,3.441423,246,0.514644,3.444974,0.469901
7,Toy Story (1995),1,452,3.878319,321,0.710177,3.863553,0.66672
8,Air Force One (1997),300,431,3.63109,252,0.584687,3.626601,0.537616
9,Independence Day (ID4) (1996),121,429,3.438228,228,0.531469,3.44231,0.484177


In [13]:
# Ten movies with the highest plain mean rating; ties are broken alphabetically by title.
ratings.orderBy(F.desc('avg_rating'), 'movie title').limit(10).toPandas()

Unnamed: 0,movie title,item id,count,avg_rating,pos_cnt,pos_perc,rate_corr,int_low
0,Aiqing wansui (1994),1536,1,5.0,1,1.0,3.599867,0.206543
1,Entertaining Angels: The Dorothy Day Story (1996),1653,1,5.0,1,1.0,3.599867,0.206543
2,"Great Day in Harlem, A (1994)",814,1,5.0,1,1.0,3.599867,0.206543
3,Marlene Dietrich: Shadow and Light (1996),1201,1,5.0,1,1.0,3.599867,0.206543
4,Prefontaine (1997),1189,3,5.0,3,1.0,3.721617,0.438494
5,"Saint of Fort Washington, The (1993)",1467,2,5.0,2,1.0,3.663509,0.342372
6,Santa with Muscles (1996),1500,2,5.0,2,1.0,3.663509,0.342372
7,Someone Else's America (1995),1599,1,5.0,1,1.0,3.599867,0.206543
8,Star Kid (1997),1293,3,5.0,3,1.0,3.721617,0.438494
9,They Made Me a Criminal (1939),1122,1,5.0,1,1.0,3.599867,0.206543


In [14]:
# Ten movies with the highest corrected rating; ties are broken alphabetically by title.
ratings.orderBy(F.desc('rate_corr'), 'movie title').limit(10).toPandas()

Unnamed: 0,movie title,item id,count,avg_rating,pos_cnt,pos_perc,rate_corr,int_low
0,Schindler's List (1993),318,298,4.466443,265,0.889262,4.407538,0.84856
1,Casablanca (1942),483,243,4.45679,216,0.888889,4.386301,0.843166
2,"Shawshank Redemption, The (1994)",64,283,4.44523,255,0.90106,4.384809,0.86072
3,"Close Shave, A (1995)",408,112,4.491071,100,0.892857,4.345433,0.822018
4,Star Wars (1977),50,583,4.358491,501,0.859348,4.331007,0.828769
5,"Wrong Trousers, The (1993)",169,118,4.466102,105,0.889831,4.330414,0.820601
6,"Usual Suspects, The (1995)",12,267,4.385768,232,0.868914,4.326123,0.823148
7,Rear Window (1954),603,209,4.38756,185,0.885167,4.312652,0.834823
8,"Silence of the Lambs, The (1991)",98,390,4.289744,344,0.882051,4.252676,0.846252
9,"Godfather, The (1972)",127,413,4.283293,351,0.849879,4.248492,0.812213


In [15]:
# Ten movies with the highest lower interval bound; ties are broken alphabetically by title.
ratings.orderBy(F.desc('int_low'), 'movie title').limit(10).toPandas()

Unnamed: 0,movie title,item id,count,avg_rating,pos_cnt,pos_perc,rate_corr,int_low
0,"Shawshank Redemption, The (1994)",64,283,4.44523,255,0.90106,4.384809,0.86072
1,Vertigo (1958),479,179,4.251397,162,0.905028,4.17888,0.853178
2,Schindler's List (1993),318,298,4.466443,265,0.889262,4.407538,0.84856
3,"Silence of the Lambs, The (1991)",98,390,4.289744,344,0.882051,4.252676,0.846252
4,Casablanca (1942),483,243,4.45679,216,0.888889,4.386301,0.843166
5,Rear Window (1954),603,209,4.38756,185,0.885167,4.312652,0.834823
6,To Kill a Mockingbird (1962),427,219,4.292237,193,0.881279,4.22844,0.83173
7,Star Wars (1977),50,583,4.358491,501,0.859348,4.331007,0.828769
8,One Flew Over the Cuckoo's Nest (1975),357,264,4.291667,230,0.871212,4.238018,0.82542
9,"Usual Suspects, The (1995)",12,267,4.385768,232,0.868914,4.326123,0.823148


## Извлечение результатов

In [16]:
def _top10_item_ids(metric):
    """Return the item ids of the 10 movies ranked highest by ``metric``.

    Rows are ordered by ``metric`` descending, then by movie title, exactly as
    in the preview cells above the extraction step.
    """
    ranked = ratings.sort(desc(metric), 'movie title').limit(10).select('item id')
    return ranked.rdd.flatMap(lambda row: row).collect()


# One top-10 list of item ids per ranking criterion.
output = {
    "top10_rates": _top10_item_ids('count'),
    "top10_average": _top10_item_ids('avg_rating'),
    "top10_rating": _top10_item_ids('rate_corr'),
    "top10_lower": _top10_item_ids('int_low'),
}
output

{'top10_rates': [50, 258, 100, 181, 294, 286, 288, 1, 300, 121],
 'top10_average': [1536, 1653, 814, 1201, 1189, 1467, 1500, 1599, 1293, 1122],
 'top10_rating': [318, 483, 64, 408, 50, 169, 12, 603, 98, 127],
 'top10_lower': [64, 479, 318, 98, 483, 603, 427, 50, 357, 12]}

In [17]:
# Serialize the `output` dict as JSON into the result file.
result_path = '/data/home/sergey.antonov/lab06s.json'
with open(result_path, 'w', encoding='UTF-8') as data_file:
    json.dump(output, data_file)

In [18]:
# Shut down the SparkContext at the end of the notebook.
sc.stop()