In [1]:
import nltk, string, json

import pyspark as ps

def tokenize(text):
    """Split ``text`` into word tokens, dropping stopwords and punctuation.

    Args:
        text: The string to tokenize; it is passed to ``nltk.word_tokenize``.

    Returns:
        A list with the tokens, in their original order, that are not in
        NLTK's English stopword list, are not found inside
        ``string.punctuation`` and are not the two-backtick token.

    Notes:
        * The comparison is case-sensitive: tokens are not lower-cased before
          they are checked against the stopword list.
        * ``word not in string.punctuation`` is a substring test against the
          punctuation string, so a multi-character token is only dropped when
          it happens to occur verbatim inside that string.
    """
    # Build the stopword lookup once per call; evaluating the corpus accessor
    # for every token repeats the same work and searches a list linearly.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    return [
        word
        for word in nltk.word_tokenize(text)
        if word not in stop_words
        and word not in string.punctuation
        and word != '``'
    ]

In [3]:
from pyspark.sql import SparkSession
from collections import Counter

from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF

# Build (or reuse) the Spark session used by every later cell: application
# name "best_one", the given standalone master URL, 512m of executor memory.
spark = (
    SparkSession.builder
    .appName("best_one")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

In [5]:
# Read the file as an RDD of text lines and parse every line as one JSON record.
essay_rdd = spark.sparkContext.textFile('essay_1000.json')
row_rdd = essay_rdd.map(json.loads)

# Tokenize documents: keep records whose essay is non-empty, strip the literal
# backslash-n sequences and the carriage returns, then split into tokens.
tokenized_rdd = (
    row_rdd
    .filter(lambda row: bool(row['essay']))
    .map(lambda row: row['essay'].replace('\\n', '').replace('\r', ''))
    .map(tokenize)
)

essay_rdd.take(1)

['{"essay":"\\"I am currently a Special Education Math teacher in a high needs middle school. My students are eager to learn but lack proper classroom resources. I am requesting necessary classroom basics in order to create a supply center. \\r\\\\n\\r\\\\nMy students are very motivated and try their best despite the academic disabilities. In addition to learning the language, my students come from low income homes. All of the students in my school are eligible for free or reduced lunch due to the household incomes. \\r\\\\n\\r\\\\nMy students are classified as emotionally disturbed, academically delayed or learning disabled. They need all the help they can get to get them to the same level as their middle school peers. Most don\'t have the supplies due to the financial strain buying school supplies puts on the family. Having a large supply of pens, pencils, glue sticks, stapler, and other classroom basics, would be a blessing each day. Most of these materials are needed for everyday l

In [None]:
# Apply TF-IDF: hash every token list into a 50000-feature term-frequency
# vector. The result is cached because it is used twice below (fit, transform).
hashingTF = HashingTF(numFeatures=50000)
tf = hashingTF.transform(tokenized_rdd).cache()

# Fit the IDF model (minDocFreq=2) on the cached vectors and reweight them.
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

## Word2Vec

In [8]:
from pyspark.mllib.feature import Word2Vec
import numpy as np

In [None]:
# Fit a Word2Vec model, created with its default settings, on the tokenized essays.
word2vec = Word2Vec()
model = word2vec.fit(tokenized_rdd)

In [None]:
# Persist the fitted model under the path 'word2vec_train.model'.
# NOTE(review): saving presumably fails when that path already exists, which
# would break a second "Run All" — confirm and clean up the path if needed.
model.save(spark.sparkContext, 'word2vec_train.model')

In [None]:
# Get the vectors for the words: a lookup that is indexed by word further down.
word_vecs = model.getVectors()

In [None]:
# Look up the vector stored for the word 'school' as a sample.
word_v = word_vecs['school']

In [None]:
# Show which Python type the looked-up vector object has.
type(word_v)

In [None]:
# doc2vec function

def doc2vec(document_tup, vector_size=100):
    """Build one document vector as the TF-IDF-weighted mean of word vectors.

    Relies on the notebook globals ``hashingTF`` (maps a word to its feature
    index) and ``word_vecs`` (maps a word to its embedding).

    Args:
        document_tup: A pair ``(tokens, weights)``. ``tokens`` is an iterable
            of words; ``weights`` is indexable by the feature index returned
            by ``hashingTF.indexOf(word)``.
        vector_size: Length of the word vectors and of the result. Defaults to
            100, the size this function previously hard-coded.

    Returns:
        A NumPy array of length ``vector_size``: the sum of ``weight * vector``
        over all tokens that could be resolved, divided by the number of such
        tokens. If no token could be resolved, the zero vector is returned
        instead of dividing by zero (which would produce NaNs).
    """
    doc_vec = np.zeros(vector_size)
    tot_words = 0

    for word in document_tup[0]:
        try:
            weight = document_tup[1][hashingTF.indexOf(word)]
            vec = np.array(list(word_vecs[word]), dtype=float)
        except Exception:
            # Best effort: a word without a usable weight or embedding is
            # skipped. Unlike a bare ``except``, this lets KeyboardInterrupt
            # and SystemExit propagate.
            continue

        doc_vec += weight * vec
        tot_words += 1

    if tot_words == 0:
        # Nothing to average; avoid 0/0.
        return doc_vec

    return doc_vec / float(tot_words)

In [None]:
# Data preparation: pair each token list with its TF-IDF vector and take one
# such pair as a sample input.
ex = tokenized_rdd.zip(tfidf).take(1)

In [None]:
# Usage: compute the document vector for the sample pair.
doc2vec(ex[0])

In [None]:
# Load the whole document set: collect every (tokens, TF-IDF vector) pair to
# the driver, then turn each pair into a document vector with doc2vec.
document_vectors = tokenized_rdd.zip(tfidf).collect()
d2v = list(map(doc2vec, document_vectors))

In [13]:
from scipy.spatial import distance

def query(q, docs, top_n=3):
    '''
    Similarity function (cosine distance): find the documents closest to a
    free-text query.

    The query is tokenized with ``tokenize``, weighted with the notebook's
    ``hashingTF`` / ``idf`` models and turned into a vector with ``doc2vec``;
    that vector is then compared with every row of ``docs``.

    Args:
        q: The query text.
        docs: Sequence of document vectors, one row per document.
        top_n: How many of the closest documents to return. Defaults to 3,
            the count this function previously hard-coded.

    Returns:
        A NumPy array with the positions in ``docs`` of the ``top_n`` rows
        with the smallest cosine distance to the query, nearest first.
    '''
    # Tokenize once and reuse the tokens for both the TF-IDF weights and doc2vec.
    tokens = tokenize(q)
    tf_q = idf.transform(hashingTF.transform(tokens))
    q_vec = doc2vec((tokens, tf_q))

    # cdist returns one column (the single query); sort its distances ascending.
    similarity = distance.cdist(docs, np.array([q_vec]), 'cosine')
    return np.argsort(similarity[:, 0])[:top_n]

In [None]:
# Example: positions of the document vectors closest to a sample query.
query('field trip to aquarium', d2v)

In [None]:
# Peek at one element of the raw lines after pairing each with its index.
essay_rdd.zipWithIndex().take(1)

In [None]:
# Assemble the matching projects
def find_projects(indeces, num):
    """Fetch the raw JSON lines of the projects at the given positions.

    The positions are expected to refer to the document vectors built from
    ``tokenized_rdd``, which only contains records with a non-empty essay.
    The same filter is therefore applied to ``essay_rdd`` before numbering
    its lines; numbering the unfiltered lines would shift every position
    after the first skipped record and return the wrong projects.

    Args:
        indeces: Sequence of document positions (e.g. the result of
            ``query``), best match first.
        num: How many leading entries of ``indeces`` to look up.

    Returns:
        A list of ``(raw_json_line, position)`` tuples for the selected
        positions, in dataset order (not in similarity order).
    """
    # Plain ints in a set: cheap membership test and a small closure to ship.
    wanted = {int(i) for i in indeces[:num]}

    indexed = (
        essay_rdd
        .filter(lambda line: bool(json.loads(line)['essay']))
        .zipWithIndex()
    )
    return indexed.filter(lambda pair: pair[1] in wanted).collect()

In [None]:
# Usage: look up the first 2 of the positions returned for the query 'computers'.
find_projects(query('computers', d2v), 2)