In [47]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--packages com.databricks:spark-csv_2.10:1.2.0 pyspark-shell'
spark_home = os.environ.get('SPARK_HOME', None)
sys.path.insert(0, spark_home + "/python")
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.1.1
      /_/

Using Python version 2.7.6 (default, Oct 26 2016 20:30:19)
SparkSession available as 'spark'.


In [205]:
from pyspark.sql.functions import when, col, coalesce, array, udf

In [309]:
df_train = spark.read.format("csv").option("header","true").load("/labs/lab10data/lab10_train.csv")

In [310]:
df_test = spark.read.format("csv").option("header","true").load("/labs/lab10data/lab10_test.csv")

In [311]:
df_items = spark.read.format("csv") \
                .option("delimiter","\t") \
                .option("header","true") \
                .load("/labs/lab10data/lab10_items.csv")

In [312]:
df_train = df_train.join(df_items, df_train.item_id == df_items.item_id, 'inner').drop(df_items.item_id)
df_test = df_test.join(df_items, df_test.item_id == df_items.item_id, 'inner').drop(df_items.item_id)

In [313]:
df_train = df_train.filter(col("genres").isNotNull())
df_test = df_test.filter(col("genres").isNotNull())

In [314]:
from pyspark.sql.types import StringType

concat_udf = F.udf(lambda cols: ",".join([x if x is not None else "*" for x in cols]), StringType())
df_train = df_train.withColumn("concat_featurs",\
                concat_udf(array('user_id','genres')))
df_test = df_test.withColumn("concat_featurs",\
                concat_udf(array('user_id','genres')))

In [315]:
from pyspark.ml.linalg import Vectors

split_col = pyspark.sql.functions.split(df_train['concat_featurs'], ',')
df_train = df_train.withColumn("featurs_split", split_col)
split_col = pyspark.sql.functions.split(df_test['concat_featurs'], ',')
df_test = df_test.withColumn("featurs_split", split_col)

In [316]:
for value in df_train.select("featurs_split").distinct().head(1)[0][0]:
    print value

916157
Драмы
Криминал


In [317]:
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="featurs_split", outputCol="featurs_vec")

model = cv.fit(df_train)
df_train = model.transform(df_train)

model = cv.fit(df_test)
df_test = model.transform(df_test)

In [318]:
df_train = df_train.select('user_id','item_id','purchase','featurs_vec')#,'channel_id','content_type','year','region_id')
df_test = df_test.select('user_id','item_id','featurs_vec')#,'channel_id','content_type','year','region_id')

In [319]:
df_train = df_train.withColumn('purchase', col('purchase').cast('integer'))

In [320]:
df_train.show(5)

+-------+-------+--------+--------------------+
|user_id|item_id|purchase|         featurs_vec|
+-------+-------+--------+--------------------+
| 885512| 100140|       0|(2024,[2,1010],[1...|
| 885581| 100140|       0|(2024,[2,1519],[1...|
| 885864| 100140|       0|(2024,[2,1983],[1...|
| 886006| 100140|       0|(2024,[2,1689],[1...|
| 886038| 100140|       0|(2024,[2,1766],[1...|
+-------+-------+--------+--------------------+
only showing top 5 rows



In [321]:
df_train.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- featurs_vec: vector (nullable = true)



In [322]:
train, test = df_train.randomSplit([0.8, 0.2], seed=12345)
train.cache()

DataFrame[user_id: string, item_id: string, purchase: int, featurs_vec: vector]

In [323]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

rf = RandomForestClassifier(numTrees=3, maxDepth=4, labelCol="purchase", featuresCol="featurs_vec", seed=42)
model = rf.fit(train)
model.featureImportances

SparseVector(2024, {4: 0.0119, 7: 0.0205, 10: 0.0006, 14: 0.0001, 16: 0.0008, 19: 0.0986, 24: 0.0, 25: 0.0093, 26: 0.0022, 30: 0.014, 37: 0.0004, 38: 0.0002, 42: 0.0005, 54: 0.0006, 86: 0.0305, 111: 0.0707, 239: 0.0032, 621: 0.0096, 718: 0.0676, 782: 0.023, 818: 0.0793, 866: 0.0698, 910: 0.0102, 1300: 0.0011, 1762: 0.4137, 1813: 0.0156, 1974: 0.0459, 2017: 0.0})

In [324]:
predictions = model.transform(test)
predictions.select("prediction", "purchase")
# Select (prediction, true label) and compute test error
evaluator = BinaryClassificationEvaluator(labelCol="purchase", rawPredictionCol="prediction", metricName="areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print accuracy

0.5


In [325]:
predictions.select("prediction").distinct().collect()

[Row(prediction=0.0)]

In [184]:
from pyspark.sql.types import FloatType
split_udf = udf(lambda value: value[1].item(), FloatType())
result = model.transform(df_test).withColumn('purchase', split_udf('probability'))

In [185]:
result = result.sort(result.user_id,result.item_id)
result = result.select('user_id', 'item_id','purchase')

In [186]:
result.show(5)

+-------+-------+------------+
|user_id|item_id|    purchase|
+-------+-------+------------+
|   1654|    336|0.0021516618|
|   1654|    678|0.0021516618|
|   1654|    691|0.0021516618|
|   1654|    696|0.0021516618|
|   1654|    763|0.0021516618|
+-------+-------+------------+
only showing top 5 rows



In [331]:
result.coalesce(1).write.mode("overwrite").option("header", "true").csv("lab10.csv")
!hadoop fs -getmerge lab10.csv ~/lab10.csv

In [188]:
cat ~/lab10.csv | head -n10

user_id,item_id,purchase
1654,336,0.0021516618
1654,678,0.0021516618
1654,691,0.0021516618
1654,696,0.0021516618
1654,763,0.0021516618
1654,795,0.0021516618
1654,861,0.0021516618
1654,1137,0.0021516618
1654,1159,0.0021516618
cat: write error: Broken pipe


In [192]:
import pandas as pd
pd.read_csv('~/lab10.csv').purchase.unique()

array([ 0.00215166,  0.00193721,  0.00168565,  0.0016492 ,  0.0019519 ,
        0.00153423,  0.00148588,  0.00312257,  0.00332233,  0.00944003,
        0.00106822,  0.00512769])

In [326]:
cp /tmp/uaa/lab10.csv ~/lab10.csv_1

In [327]:
ls ..

[0m[01;34mBankScoring[0m/          lab07s.json            part-00000
[01;34mClassRecomend[0m/        lab08.json             project01_gender-age.csv
hadoop-streaming.jar  lab08s.json            [01;34mRaiting[0m/
[01;34mHiveUserProfile[0m/      lab09.csv              [01;34mRecSysMovieALS[0m/
lab03s_domains.txt    lab09s.csv             [01;34mRecSysMovies[0m/
lab03_users.txt       lab10.csv              [01;34mRecSysProj[0m/
lab04.csv             lab10.csv_1            [01;34mRecSysPurchesFilm[0m/
lab04s.csv            Lesson_1.ipynb         [01;34mSpark[0m/
lab05.json            [01;34mMapReducePredAge[0m/      stdout
lab06.json            [01;34mMapReduceTop350[0m/       [01;34mTwoTextCompare_Sentiment[0m/
lab06s.json           [01;34mMapReduceWordCount[0m/
lab07.json            [01;34mMapReduseLoadToHBase[0m/


In [328]:
cat ~/lab10.csv_1 | head -n10

user_id,item_id,purchase
1654,336,0.0102475
1654,678,0.00931158
1654,691,0.00962465
1654,696,0.00986544
1654,763,0.0109576
1654,795,0.0144923
1654,861,0.0107935
1654,1137,0.011556
1654,1159,0.00992513
cat: write error: Broken pipe


In [2]:
cp ~/lab10.csv_1 ~/lab10s.csv 

In [333]:
cat ~/lab10.csv | head -n10

user_id,item_id,purchase
1654,336,0.0021516618
1654,678,0.0021516618
1654,691,0.0021516618
1654,696,0.0021516618
1654,763,0.0021516618
1654,795,0.0021516618
1654,861,0.0021516618
1654,1137,0.0021516618
1654,1159,0.0021516618
cat: write error: Broken pipe
