In [1]:
import os
import sys
import pandas as pd
import re

import pymystem3
import operator
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
os.environ["PYSPARK_SUBMIT_ARGS"]='--conf spark.sql.catalogImplementation=in-memory pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())


from pyspark.sql.types import *
import pyspark.sql.functions as f

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.3
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [7]:
spark.stop()

# READ

In [3]:
path = '/labs/lab07data/DO_record_per_line.json'

In [4]:
#!hdfs dfs -cat /labs/lab07data/DO_record_per_line.json | head -n 1

In [5]:
schema = StructType(fields=[
    StructField("cat", StringType()),
    StructField("desc", StringType()),
    StructField("id", IntegerType()),
    StructField("lang", StringType()),
    StructField("name", StringType()),
    StructField("provider", StringType())
])

In [6]:
df = spark.read.json(path=path, schema=schema)

In [7]:
df.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [8]:
@f.pandas_udf(StringType())
def norm_desc(desc):
    return desc.apply(lambda x: re.sub('[^a-zа-я0-9 ]', '', x.lower()))

In [9]:
norm_text_df = df.withColumn('norm_desc', norm_desc("desc")).toPandas()

In [10]:
norm_text_df.head()

Unnamed: 0,cat,desc,id,lang,name,provider,norm_desc
0,3/business_management|6/economics_finance,This course introduces the basic financial sta...,4,en,Accounting Cycle: The Foundation of Business M...,Canvas Network,this course introduces the basic financial sta...
1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network,this online course will introduce you to ameri...
2,5/computer_science|15/mathematics_statistics_a...,This course is taught in French Vous voulez co...,6,fr,Arithmétique: en route pour la cryptographie,Canvas Network,this course is taught in french vous voulez co...
3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network,we live in a digitally connected world the way...
4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network,this selfpaced course is designed to show that...


In [11]:
def text_to_wordlist(text):
    words = text.lower().split()
    return words

In [12]:
norm_text_df['norm_desc_v1'] = norm_text_df['norm_desc'].apply(lambda x: text_to_wordlist(x))

In [13]:
#в этот словарь записываются результаты
sim = {}

In [15]:
#из личтного кабинета
ids = [21058, 25420, 20332, 889, 6061, 27687]

In [17]:
for idx in en_ids:
    curr_id = idx
    #определяем язык
    curr_lang = norm_text_df[norm_text_df['id']==curr_id]['lang'].values[0]
    #берем из основного датафрейма только курсы с языком curr_lang
    tmp_df = norm_text_df[norm_text_df['lang']==curr_lang][['id', 'lang', 'norm_desc_v1', 'name']]
    tmp_df['bag_of_words_text'] = tmp_df['norm_desc_v1'].apply(lambda x: str(' '.join(x)).strip())

    v = TfidfVectorizer(max_features=10000)
    x = v.fit_transform(tmp_df['bag_of_words_text'])
    matrix_counts = x.toarray()
    words = [x[0] for x in sorted(v.vocabulary_.items(), key=lambda x: x[1])] 
    tfidf_df = pd.DataFrame(matrix_counts, columns=words)
    
    cs = cosine_similarity(tfidf_df)
    cos_matr = pd.DataFrame(cs)
    cos_matr['id'] = tmp_df['id'].values
    new_cos_matr = cos_matr.set_index('id').T
    new_cos_matr['id']  = tmp_df['id'].values
    new_cos_matr = new_cos_matr.set_index('id')
    new_cos_matr['name'] = tmp_df['name'].values
    
    top = new_cos_matr[['name', curr_id]].sort_values(by=[curr_id, 'name', 'id'], ascending=False)[1:11]
    
    #записали результат в словарь
    sim['{}'.format(curr_id)] = list(top[curr_id].index)
    print('finish {}'.format(idx))

finish 21058
finish 25420


In [None]:
#не забыть потом название файла поменять на lab07s.json, чтобы сдать суперачивку

import json
with open('lab07.json', 'w') as outfile:
    json.dump(fin_json, outfile)