In [None]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
spark

In [4]:
! hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


In [5]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType
from pyspark.sql import functions as F

In [6]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, HashingTF
from pyspark.ml import Pipeline

# train

In [7]:
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("purchase", IntegerType(), True),
])    

In [8]:
df_user = spark.read.csv("/labs/slaba03/laba03_train.csv", schema=schema, header=True, multiLine=True, escape='"')

In [9]:
df_user.show()

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
|   1654|  66187|       0|
|   1654|  84350|       0|
|   1654|  92854|       0|
|   1654|  72811|       0|
|   1654|  86876|       0|
|   1654| 102657|       0|
|   1654| 100482|       0|
|   1654|  89677|       0|
|   1654|  99419|       0|
|   1654|  66603|       0|
|   1654|   7363|       0|
|   1654|   1320|       0|
|   1654|  88892|       0|
|   1654|  66671|       0|
|   1654|  75925|       0|
+-------+-------+--------+
only showing top 20 rows



# test

In [10]:
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
]) 

In [11]:
df_user_test = spark.read.csv("/labs/slaba03/laba03_test.csv", schema=schema, header=True, multiLine=True, escape='"')

In [12]:
df_user_test.show()

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
|   1654|  88896|
|   1654|  67740|
|   1654|  74271|
|   1654|  99871|
|   1654|  78570|
|   1654|  71942|
|   1654|  74367|
|   1654|  98628|
|   1654|  95887|
|   1654|  77795|
|   1654|  75152|
|   1654|  74905|
|   1654|   9068|
|   1654|  72954|
|   1654| 102431|
+-------+-------+
only showing top 20 rows



# items

In [13]:
items_schema = StructType([
    StructField("item_id", IntegerType()),
    StructField("channel_id", IntegerType()),
    StructField("datetime_availability_start", StringType()),
    StructField("datetime_availability_stop", StringType()),
    StructField("datetime_show_start", StringType()),
    StructField("datetime_show_stop", StringType()),
    StructField("content_type", IntegerType()),
    StructField("title", StringType(), nullable=True),
    StructField("year", FloatType(), nullable=True),
    StructField("genres", StringType()),
    StructField("region_id", IntegerType()),
])

In [14]:
df_items = spark.read.option('delimiter', '\t').csv("/labs/slaba03/laba03_items.csv", header=True, schema=items_schema, multiLine=True, escape='"')

In [15]:
df_items.show()

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+------+--------------------+---------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|               title|  year|              genres|region_id|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+--------------------+------+--------------------+---------+
|  65667|      null|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|на пробах только ...|2013.0|             Эротика|     null|
|  65669|      null|       1970-01-01T00:00:00Z|      2018-01-01T00:00:00Z|               null|              null|           1|скуби ду: эротиче...|2011.0|             Эротика|     null|
|  65668|      null|       1970-01-01T00:00:00Z|      2018-01-01T

In [16]:
df_items = df_items.filter(F.col("content_type") == 1)
df_items = df_items.select(F.regexp_replace('title', r',|\.|&|\\|\||–|_|:|-|\!|\?', '').alias('title'), 'item_id', 'year', 'genres')


In [17]:
from pyspark.sql.functions import split, when, array
df_items2 = df_items.withColumn("genres_word", split("genres", ","))
df_items2  = df_items2.withColumn('genres_word', when(df_items2['genres_word'].isNull(), array().cast("array<integer>")).otherwise(df_items2['genres_word']))

In [18]:
df_items2.show()

+--------------------+-------+------+--------------------+--------------------+
|               title|item_id|  year|              genres|         genres_word|
+--------------------+-------+------+--------------------+--------------------+
|на пробах только ...|  65667|2013.0|             Эротика|           [Эротика]|
|скуби ду эротичес...|  65669|2011.0|             Эротика|           [Эротика]|
|горячие девочки д...|  65668|2011.0|             Эротика|           [Эротика]|
|соблазнительницы ...|  65671|2011.0|             Эротика|           [Эротика]|
|секретные сексмат...|  65670|2010.0|             Эротика|           [Эротика]|
|      все о мужчинах|  65809|2016.0|             Комедии|           [Комедии]|
|8 лучших свиданий...|  65810|2016.0|   Комедии,Мелодрамы|[Комедии, Мелодрамы]|
|            византия|    326|2012.0|Ужасы,Триллеры,Др...|[Ужасы, Триллеры,...|
|девственники бере...|    336|2012.0|Ужасы,Комедии,Фан...|[Ужасы, Комедии, ...|
|   8 первых свиданий|    357|2012.0|Ком

In [19]:
stop_words = StopWordsRemover.loadDefaultStopWords("russian")  
tokenizer = Tokenizer(inputCol="title", outputCol="title_words")
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="title_words_filtered", stopWords=stop_words)
#count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", vocabSize=10000)
#tfidf = IDF(inputCol="word_vector", outputCol="tfidf")    

hasher = HashingTF(numFeatures=200, binary=True, inputCol=swr.getOutputCol(), outputCol="word_vector")


preprocessing = Pipeline(stages=[
        tokenizer,
        swr,
        #count_vectorizer,
        #tfidf
        hasher
    ])
    
preprocessing_model = preprocessing.fit(df_items2)
df_items3 = preprocessing_model.transform(df_items2)

In [20]:
df_items3.show(2, vertical=True)

-RECORD 0------------------------------------
 title                | на пробах только ... 
 item_id              | 65667                
 year                 | 2013.0               
 genres               | Эротика              
 genres_word          | [Эротика]            
 title_words          | [на, пробах, толь... 
 title_words_filtered | [пробах, девушки,... 
 word_vector          | (200,[0,4,23,29,8... 
-RECORD 1------------------------------------
 title                | скуби ду эротичес... 
 item_id              | 65669                
 year                 | 2011.0               
 genres               | Эротика              
 genres_word          | [Эротика]            
 title_words          | [скуби, ду, эроти... 
 title_words_filtered | [скуби, ду, эроти... 
 word_vector          | (200,[17,33,99,10... 
only showing top 2 rows



In [30]:
count_vectorizer = CountVectorizer(inputCol='genres_word', outputCol='genre_vector')
items = count_vectorizer.fit(df_items3).transform(df_items3)

In [31]:
items.show()

+--------------------+-------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               title|item_id|  year|              genres|         genres_word|         title_words|title_words_filtered|         word_vector|        genre_vector|
+--------------------+-------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|на пробах только ...|  65667|2013.0|             Эротика|           [Эротика]|[на, пробах, толь...|[пробах, девушки,...|(200,[0,4,23,29,8...|     (83,[20],[1.0])|
|скуби ду эротичес...|  65669|2011.0|             Эротика|           [Эротика]|[скуби, ду, эроти...|[скуби, ду, эроти...|(200,[17,33,99,10...|     (83,[20],[1.0])|
|горячие девочки д...|  65668|2011.0|             Эротика|           [Эротика]|[горячие, девочки...|[горячие, девочки...|(200,[31,51,67,93...|     (83,[20],[1.0])|
|соблазнительниц

# обьединение фичей

In [46]:
from pyspark.ml.feature import VectorAssembler

In [50]:
assembler = VectorAssembler(inputCols=['word_vector', 'genre_vector'], outputCol="features")

In [51]:
df_train = assembler.transform(df_train)

In [52]:
df_train.show(vertical=True)

-RECORD 0------------------------------------
 item_id              | 8389                 
 user_id              | 520446               
 purchase             | 0                    
 title                | пес в сапогах (су... 
 year                 | 1981.0               
 genres               | Мультфильмы,Детск... 
 genres_word          | [Мультфильмы, Дет... 
 title_words          | [пес, в, сапогах,... 
 title_words_filtered | [пес, сапогах, (с... 
 word_vector          | (200,[58,84,187],... 
 genre_vector         | (83,[6,14,19,23],... 
 features             | (283,[58,84,187,2... 
-RECORD 1------------------------------------
 item_id              | 8389                 
 user_id              | 556825               
 purchase             | 0                    
 title                | пес в сапогах (су... 
 year                 | 1981.0               
 genres               | Мультфильмы,Детск... 
 genres_word          | [Мультфильмы, Дет... 
 title_words          | [пес, в, с

In [None]:
# from pyspark.ml.feature import MinMaxScaler
# scaler = MinMaxScaler(inputCol='features', outputCol='features_minmax')
# scaler_model = scaler.fit(items)
# items1 = scaler_model.transform(items)

# обьединяем train и items

In [48]:
df_train = df_user.join(items, on="item_id", how="inner")

In [49]:
df_train.show(2,vertical=True)

-RECORD 0------------------------------------
 item_id              | 8389                 
 user_id              | 520446               
 purchase             | 0                    
 title                | пес в сапогах (су... 
 year                 | 1981.0               
 genres               | Мультфильмы,Детск... 
 genres_word          | [Мультфильмы, Дет... 
 title_words          | [пес, в, сапогах,... 
 title_words_filtered | [пес, сапогах, (с... 
 word_vector          | (200,[58,84,187],... 
 genre_vector         | (83,[6,14,19,23],... 
-RECORD 1------------------------------------
 item_id              | 8389                 
 user_id              | 556825               
 purchase             | 0                    
 title                | пес в сапогах (су... 
 year                 | 1981.0               
 genres               | Мультфильмы,Детск... 
 genres_word          | [Мультфильмы, Дет... 
 title_words          | [пес, в, сапогах,... 
 title_words_filtered | [пес, сапо

# разбиваем на train и val

In [53]:
train = df_train.sampleBy("purchase", fractions={0: 0.8, 1: 0.8}, seed=5757)

In [54]:
val = df_train.join(train, on="user_id", how="leftanti")

In [55]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(featuresCol='features', labelCol="purchase")

In [56]:
pipeline_model = gbt.fit(train)

Py4JJavaError: An error occurred while calling o1275.fit.
: org.apache.spark.SparkException: Job 105 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:954)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:952)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:952)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:2164)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2077)
	at org.apache.spark.SparkContext$$anonfun$stop$6.apply$mcV$sp(SparkContext.scala:1949)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1340)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:1948)
	at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:121)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:567)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:201)
	at org.apache.spark.ml.regression.DecisionTreeRegressor$$anonfun$train$2.apply(DecisionTreeRegressor.scala:129)
	at org.apache.spark.ml.regression.DecisionTreeRegressor$$anonfun$train$2.apply(DecisionTreeRegressor.scala:124)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegressor.scala:124)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.boost(GradientBoostedTrees.scala:330)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.run(GradientBoostedTrees.scala:55)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1.apply(GBTClassifier.scala:206)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$train$1.apply(GBTClassifier.scala:156)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:156)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:58)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
predictions = pipeline_model.transform(val)

In [None]:
predictions.show(4)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="purchase", metricName='areaUnderROC')

In [None]:
evaluator.evaluate(predictions)

In [None]:
sc.stop()