In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# context

In [2]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 4 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("lab_asd")
         .getOrCreate())

# lab02

In [4]:
courses = [23126, 21617, 16627, 11556, 16704, 13702]

In [14]:
df = spark.read.json("/labs/slaba02/DO_record_per_line.json").select("id", "lang", "name", F.lower(F.regexp_replace('desc', r'[^\pL{0-9}\p{Space}]', '')).alias('description'))

In [7]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml import Pipeline

In [5]:
@F.udf(FloatType())
def cos_sim(v, u):
    return float(v.dot(u) / (v.norm(2) * u.norm(2)))

In [15]:
result_dict = {}
for course in courses:
    course_lang = df.filter(F.col("id") == course).collect()[0]["lang"]
    if course_lang == "en":
        stop_words = StopWordsRemover.loadDefaultStopWords("english")        
    elif course_lang == "es":
        stop_words = StopWordsRemover.loadDefaultStopWords("spanish")
    elif course_lang == "ru":
        stop_words = StopWordsRemover.loadDefaultStopWords("russian")   
    
    lang_df = df.filter(df.lang == course_lang)
    tokenizer = Tokenizer(inputCol="description", outputCol="words")
    swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", stopWords=stop_words)
    count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", vocabSize=10000)
    tfidf = IDF(inputCol="word_vector", outputCol="tfidf")
    
    preprocessing = Pipeline(stages=[
        tokenizer,
        swr,
        count_vectorizer,
        tfidf
    ])
    
    preprocessing_model = preprocessing.fit(lang_df)
    preprocessed_dataset = preprocessing_model.transform(lang_df)
    
    result = (
        preprocessed_dataset
        .crossJoin(preprocessed_dataset.filter(F.col("id") == course).select(F.col("tfidf").alias("vector")))
        .select('*', cos_sim('tfidf', 'vector').alias('cosine'))
        .filter("cosine <> 'NaN'")
        .select('*', F.row_number().over(Window.partitionBy("vector").orderBy(F.col('cosine').desc(), 
                                                                              F.col('name'),
                                                                              F.col('id'))).alias('rn'))
        .filter("rn <= 11 AND rn > 1")
        .orderBy("rn")
    )
    
    result_array = result.select("id").rdd.flatMap(lambda x: x).collect()
    result_dict[course] = result_array

In [16]:
result_dict

{23126: [14760, 13665, 13782, 15909, 19270, 25782, 13348, 17499, 25071, 7153],
 21617: [21609, 21608, 21616, 21492, 21624, 21623, 21630, 21628, 21700, 21508],
 16627: [11431, 5687, 12247, 17964, 5558, 16694, 17961, 12660, 9598, 11575],
 11556: [16488, 13461, 11523, 468, 22710, 10447, 23357, 11529, 19330, 9465],
 16704: [1228, 1327, 20362, 18331, 26980, 1365, 1247, 8186, 1236, 20645],
 13702: [13702, 21079, 8123, 1041, 1396, 22053, 17076, 8082, 1033, 1052]}

In [17]:
with open('lab02.json', 'w') as f:
    json.dump(result_dict, f)

In [19]:
spark.stop()