## Uwaga
Wersja zgodna z masterem repozytorium post_extractor. Przed uruchomieniem notebooka trzeba zrobić checkout gałęzi master tego repozytorium.

In [2]:
import json

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import ArrayType, IntegerType, DoubleType

from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



from post_extractor.modules.posts import (
    SentenceTransformer,
    PostTransformer,
    TranslateTransformer,
    SpeechPartsTransformer,
    SentimentTransformer
)
from post_extractor.modules.features_ import (
    FeatureTransformer
)
from post_extractor.modules.universal import (
    ConvertDictToListTransformer,
    SelectRecordsTransformer,
    MaxTransformer,
    MeanTransformer,
    MedianTransformer,
    NumberOfOccurrencesTransformer,
)

# Spark configuration for a local run using all available cores ('local[*]').
sconf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('PipelineFlow')
)

# Obtain the SparkContext for this configuration, then build the session and
# SQL context on top of that same context.
sc = SparkContext.getOrCreate(sconf)
sess = SparkSession(sc)
sqlContext = SQLContext(sc)
    

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Mateusz\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [3]:
from pyspark.ml.param import Param
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml import Transformer
class TransformerProxy(Transformer):
    """Transformer that forwards ``transform`` to another transformer.

    The wrapped transformer is stored under the ``transformer`` Param, which
    lets it be swapped through a param map (this file puts that Param on a
    ``ParamGridBuilder`` grid).
    """

    def __init__(self):
        super(TransformerProxy, self).__init__()
        # The Param is created per instance; its description string is empty.
        self.transformer = Param(self, "transformer", "")

    def set_transformer(self, transformer):
        """Store ``transformer`` as the delegate and return ``self``."""
        self._paramMap.update({self.transformer: transformer})
        return self

    def get_transformer(self):
        """Return the delegate held by the ``transformer`` Param."""
        delegate = self.getOrDefault(self.transformer)
        return delegate

    def _transform(self, dataset):
        """Run the delegate's ``transform`` on ``dataset`` and return its result."""
        delegate = self.get_transformer()
        return delegate.transform(dataset)

In [61]:
def _strip_suffix(name, suffix):
    """Return ``name`` without a trailing ``suffix``; unchanged if it does not end with it."""
    if suffix and name.endswith(suffix):
        return name[:-len(suffix)]
    return name


def load_data(spark_ctx, root):
    """Load post and feature files and join them on the file's base name.

    Posts are read from ``root + 'posts'`` (each file parsed as JSON) and
    features from ``root + 'features'`` (kept as raw text). The join key is the
    last ``/``-separated path component with its ``.json`` / ``.features``
    suffix removed.

    Args:
        spark_ctx: object providing ``wholeTextFiles(path)``.
        root: path prefix; it is concatenated as-is, so it must already end
            with a path separator.

    Returns:
        The posts DataFrame joined with the features DataFrame on ``key``
        (columns ``key``, ``content_post``, ``content_features``).
    """
    # The suffix is removed exactly. ``str.rstrip('.json')`` would instead strip
    # any trailing run of the characters '.', 'j', 's', 'o', 'n' (e.g. turning
    # 'notes.json' into 'note'), which corrupts keys and silently drops rows
    # from the join.
    posts_rdd = spark_ctx.wholeTextFiles(root + 'posts')
    posts_rdd = posts_rdd.map(
        lambda x: (_strip_suffix(x[0].split('/')[-1], '.json'), json.loads(x[1])))
    posts_df = posts_rdd.toDF(['key', 'content_post'])

    features_rdd = spark_ctx.wholeTextFiles(root + 'features')
    features_rdd = features_rdd.map(
        lambda x: (_strip_suffix(x[0].split('/')[-1], '.features'), x[1]))
    features_df = features_rdd.toDF(['key', 'content_features'])

    return posts_df.join(features_df, 'key')

In [144]:
# Candidate groups of feature names; only the first group is used below.
features_choices = [["leaf", "has-attribute-class"], ["contains-adjectives", "contains-date"]]
active_feature_keys = features_choices[0]

# Every aggregation transformer writes to this one column, because only one of
# them runs per pipeline variant (selected through the proxy stage).
aggregated_features = 'aggregated_features'


def _wire(stage, input_col, output_col):
    """Set the input and output columns on ``stage`` and hand the stage back."""
    stage.setInputCol(input_col).setOutputCol(output_col)
    return stage


featurer = _wire(FeatureTransformer(), 'content_features', 'features')

feature_selector = _wire(
    SelectRecordsTransformer(keys=active_feature_keys, element_type=ArrayType(DoubleType())),
    featurer.getOutputCol(),
    'selected_features',
)
selected_features_col = feature_selector.getOutputCol()

# Alternative aggregations over the selected features.
max_feature_transformer = _wire(MaxTransformer(), selected_features_col, aggregated_features)
mean_feature_transformer = _wire(MeanTransformer(), selected_features_col, aggregated_features)
median_feature_transformer = _wire(MedianTransformer(), selected_features_col, aggregated_features)
number_of_occurences_feature_transformer = _wire(
    NumberOfOccurrencesTransformer(), selected_features_col, aggregated_features)

# Placeholder stage; the concrete aggregator is supplied via the param grid.
feature_aggregation_proxy = TransformerProxy()
feature_aggregation_transformers = [
    max_feature_transformer,
    mean_feature_transformer,
    median_feature_transformer,
    number_of_occurences_feature_transformer,
]

features_dict_to_list_converter = _wire(
    ConvertDictToListTransformer(keys=active_feature_keys, element_type=DoubleType()),
    aggregated_features,
    'features_from_file',
)

features_stages = [
    featurer,
    feature_selector,
    feature_aggregation_proxy,
    features_dict_to_list_converter,
]


In [145]:
# Column names shared between the text-processing stages below.
posts_col = 'posts'
translated_col = 'translated'
nouns_col = 'nouns'
aggregated_nouns_col = 'aggregated_nouns'

# Raw post content -> posts -> translated text.
poster = PostTransformer()
poster.setInputCol('content_post').setOutputCol(posts_col)

translator = TranslateTransformer()
translator.setInputCol(posts_col).setOutputCol(translated_col)

# Three stages that each read the translated text.
sentencer = SentenceTransformer()
sentencer.setInputCol(translated_col).setOutputCol('sentences')

speech_parter = SpeechPartsTransformer()
speech_parter.setInputCol(translated_col).setOutputCol('speechParts')

sentimenter = SentimentTransformer()
sentimenter.setInputCol(translated_col).setOutputCol('sentiments')

# Part-of-speech tags kept by the selector.
# NOTE(review): 'NNP' is absent while 'NNPS' is present - confirm this is intended.
tags = ['NN', 'NNS', 'NNPS']

speech_parts_selector = SelectRecordsTransformer(keys=tags, element_type=ArrayType(IntegerType()))
speech_parts_selector.setInputCol(speech_parter.getOutputCol()).setOutputCol(nouns_col)
selected_nouns_col = speech_parts_selector.getOutputCol()

# Alternative aggregations; all write to the same column because only one of
# them runs per pipeline variant (selected through the proxy stage).
max_nouns_transformer = MaxTransformer()
max_nouns_transformer.setInputCol(selected_nouns_col).setOutputCol(aggregated_nouns_col)

mean_nouns_transformer = MeanTransformer()
mean_nouns_transformer.setInputCol(selected_nouns_col).setOutputCol(aggregated_nouns_col)

median_nouns_transformer = MedianTransformer()
median_nouns_transformer.setInputCol(selected_nouns_col).setOutputCol(aggregated_nouns_col)

# Placeholder stage; the concrete aggregator is supplied via the param grid.
post_aggregation_proxy = TransformerProxy()
post_aggregation_transformers = [
    max_nouns_transformer,
    mean_nouns_transformer,
    median_nouns_transformer,
]

posts_dict_to_list_converter = ConvertDictToListTransformer(keys=tags, element_type=DoubleType())
posts_dict_to_list_converter.setInputCol(aggregated_nouns_col).setOutputCol('post_features')

post_stages = [
    poster,
    translator,
    sentencer,
    speech_parter,
    sentimenter,
    speech_parts_selector,
    post_aggregation_proxy,
    posts_dict_to_list_converter,
]

In [146]:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT
class DenseVectorTransformer(Transformer, HasInputCol, HasOutputCol):
    """Copy an array column into a new column of dense ML vectors.

    The input column's value is passed to ``Vectors.dense`` row by row and the
    result is written to the output column as ``VectorUDT``.
    """

    def __init__(self):
        super(DenseVectorTransformer, self).__init__()

    def setInputCol(self, value):
        """Set the input column name and return ``self`` for chaining.

        Defined explicitly because the shared ``HasInputCol`` mixin only ships
        this setter in PySpark 2.x; on newer releases the chained
        ``.setInputCol(...).setOutputCol(...)`` construction would otherwise
        raise ``AttributeError``.
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """Set the output column name and return ``self`` for chaining.

        Defined explicitly for the same reason as ``setInputCol``.
        """
        return self._set(outputCol=value)

    def _transform(self, dataset):
        """Return ``dataset`` with the output column holding the dense vectors."""
        toDenseVector = udf(lambda arr: Vectors.dense(arr), VectorUDT())
        return dataset.withColumn(self.getOutputCol(), toDenseVector(self.getInputCol()))
    
# Convert both list-valued feature columns to dense vectors; their output
# columns are what the VectorAssembler stage takes as inputs.
features_list_col = features_dict_to_list_converter.getOutputCol()
features_dv = DenseVectorTransformer().setInputCol(features_list_col).setOutputCol('features_dv')

posts_list_col = posts_dict_to_list_converter.getOutputCol()
posts_dv = DenseVectorTransformer().setInputCol(posts_list_col).setOutputCol('posts_dv')

hotfix_stages = [
    features_dv,
    posts_dv,
]

In [147]:
# Dense-vector columns that are concatenated into the classifier's input.
all_features = [stage.getOutputCol() for stage in (features_dv, posts_dv)]
vector_assembler = VectorAssembler(inputCols=all_features, outputCol='feature_vector')

# The decision tree reads the assembled vector column.
assembled_col = vector_assembler.getOutputCol()
classifier = DecisionTreeClassifier(featuresCol=assembled_col)

classification_stages = [
    vector_assembler,
    classifier,
]

In [148]:
# Full pipeline: file features, post features, vector conversion, classification.
all_stages = features_stages + post_stages + hotfix_stages + classification_stages
pipeline = Pipeline(stages=all_stages)

# Grid over which concrete aggregator each proxy stage delegates to.
param_grid = (
    ParamGridBuilder()
    .addGrid(feature_aggregation_proxy.transformer, feature_aggregation_transformers)
    .addGrid(post_aggregation_proxy.transformer, post_aggregation_transformers)
    .build()
)

# Evaluator constructed with its default settings.
evaluator = MulticlassClassificationEvaluator()

In [149]:
# Cross-validate the whole pipeline over every aggregator combination in the
# grid; all other CrossValidator settings are left at their defaults.
cross_validator = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator)

### Wyniki:

In [151]:
from pyspark.sql.functions import rand, floor
# Load the joined posts/features data and attach a randomly generated 'label'
# column: floor(rand() * 3) cast to double.
raw_data = load_data(sc, 'data/')
random_label = floor(rand() * 3).cast(DoubleType())
input_data = raw_data.withColumn('label', random_label)

In [None]:
# Fit the cross-validator on the loaded data. The returned model is not bound
# to a name here.
cross_validator.fit(input_data)