Production examples of a model
---------------------------

1.

```python

from string import lower

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline, make_union

from some_library import ChangeCounter, TextStats, LanguageModelScore
from some_library import (ApplyFunction, MissingValuesFiller,
                          PandasSelector, make_pandas_categorical_vectorizer)
from some_library import Densify

"""
Data preparation and validation:
1. Put selector as a first transformer to make meaningful errors during
   calling
2. Fill missing values
3. Convert src_lang and dst_lang to lowercase

Features:
1. Character ngrams of src_text and dst_text
2. Word ngrams (with high frequency) of src_text and dst_text
3. TextStats - many different measures to compare src_text and dst_text
4. One hot encoding of categorical features: src_lang, dst_lang, category

Model:
1. RandomForestClassifier
"""

classifier = make_pipeline(
    PandasSelector(columns=['category', 'src_lang', 'dst_lang',
                            'src_text', 'dst_text']),
    MissingValuesFiller(),
    ApplyFunction(columns=['src_lang', 'dst_lang'], fun=lower),

    # here we start adding features
    make_union(
        make_pipeline(
            PandasSelector(columns=['src_text']),
            CountVectorizer(analyzer='char',
                            ngram_range=(1, 1),
                            min_df=10)
        ),
        make_pipeline(
            PandasSelector(columns=['dst_text']),
            CountVectorizer(analyzer='char',
                            ngram_range=(1, 1),
                            min_df=10)
        ),
        make_pipeline(
            PandasSelector(columns=['src_text']),
            CountVectorizer(analyzer='word',
                            ngram_range=(1, 1),
                            min_df=25)
        ),
        make_pipeline(
            PandasSelector(columns=['dst_text']),
            CountVectorizer(analyzer='word',
                            ngram_range=(1, 1),
                            min_df=25)
        ),
        make_pandas_categorical_vectorizer(
            columns=['src_lang', 'dst_lang', 'category']
        ),
        TextStats()
    ),

    # densify makes RandomForestClassifier much faster
    Densify(),
    RandomForestClassifier(
        n_estimators=100,
        n_jobs=-1,
        min_samples_split=20, min_samples_leaf=10,
        verbose=True,
        random_state=1)
)```

2. Simpler one
---------------

```python
# Built separately only to keep the pipeline definition below readable.
prospect_details_extractor = JSONFeatureExtractor(
    prefix=["prospect_details"],
    string_processors=[YearMonthProcessor()],
)

# Single-column model: clean the raw `prospect_details` values, turn each one
# into a feature dict, vectorize the dicts and classify with XGBoost.
model_pipeline = make_pipeline(
    PandasSelector(columns=['prospect_details']),
    MapTransformer(func=clean_text_from_object, n_jobs=6),
    MapTransformer(func=prospect_details_extractor, n_jobs=6),
    DictVectorizer(),
    # presumably drops columns with fewer than 25 non-zero entries -
    # TODO confirm against ColumnSparsityFilter
    ColumnSparsityFilter(min_nnz=25),
    XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        silent=0,
    ),
)
```

3. Again more complicated
------------------

```python
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer
from xgboost import XGBClassifier

from ooo_classifier.pipeline.map_transformer import MapTransformer
from ooo_classifier.pipeline.text_analytics import (afinn, cut_beginning, stem,
                                                    textblob_sentiment_polarity)


def create_pipeline():
    """Build the (unfitted) message classification pipeline.

    Three groups of features are computed from the same input and joined
    with ``make_union``:

    1. sentiment scores of the beginning of the message,
    2. TF-IDF bag of words of the beginning of the stemmed message,
    3. TF-IDF bag of words of the whole stemmed message.

    The joined features feed a soft-voting ensemble of an MLP and an
    XGBoost classifier.
    """
    # beginning of the message sentiment: keep the first 100 items of every
    # message (characters, assuming the messages are strings), then score
    # them with two sentiment functions, each followed by a Normalizer
    head_sentiment = make_pipeline(
        MapTransformer(func=lambda x: x[:100], n_jobs=-1, reshape_2d=False),
        make_union(
            make_pipeline(
                MapTransformer(func=textblob_sentiment_polarity,
                               reshape_2d=True, n_jobs=-1),
                Normalizer(),
            ),
            make_pipeline(
                MapTransformer(func=afinn, reshape_2d=True, n_jobs=-1),
                Normalizer(),
            ),
        ),
    )

    # beginning of the message bag of words: word 1-3-grams and
    # word-boundary character 1-4-grams
    head_bag_of_words = make_pipeline(
        MapTransformer(func=stem, n_jobs=-1),
        MapTransformer(func=cut_beginning, n_jobs=-1),
        make_union(
            TfidfVectorizer(min_df=5, ngram_range=(1, 3), binary=True),
            TfidfVectorizer(min_df=5, analyzer="char_wb",
                            ngram_range=(1, 4), binary=True),
        ),
    )

    # whole message bag of words: single words and word-boundary character
    # 1-4-grams
    full_bag_of_words = make_pipeline(
        MapTransformer(func=stem, n_jobs=-1),
        make_union(
            TfidfVectorizer(min_df=5, binary=True),
            TfidfVectorizer(min_df=5, analyzer="char_wb",
                            ngram_range=(1, 4), binary=True),
        ),
    )

    features = make_union(head_sentiment, head_bag_of_words, full_bag_of_words)

    # soft voting combines the predicted class probabilities of both models
    ensemble = VotingClassifier(
        [
            ('mlp1', MLPClassifier(activation="tanh", alpha=0.1, max_iter=100,
                                   hidden_layer_sizes=(50,), verbose=False)),
            ('xgb1', XGBClassifier(max_depth=3, n_estimators=200,
                                   min_child_weight=5)),
        ],
        voting="soft",
    )

    return make_pipeline(features, ensemble)
```

4. Out-of-office classifier
---------------------------

```python
    def header_tokens(header, min_df=5):
        """Binary bag of words over one parsed e-mail header column."""
        return make_pipeline(
            PandasSelector(columns=[header]),
            CountVectorizer(binary=True, min_df=min_df)
        )

    def header_mapped(header, func):
        """Feature obtained by mapping `func` over one parsed header column."""
        return make_pipeline(
            PandasSelector(columns=[header]),
            MapTransformer(func, reshape_2d=True)
        )

    # NOTE(review): "auto-response-suppress" is listed twice here and is also
    # vectorized twice in the union below, so its features appear twice in
    # the output. Kept as in the original - confirm whether it is intended.
    parsed_headers = ["subject", "content-type", "precedence", "auto-submitted",
                      "auto-response-suppress", "autoreply", "return-path",
                      "delivered-to", "auto-response-suppress", "ms-has-attach",
                      "failed-recipients", "cc", "from"]

    # features taken from the individual e-mail headers
    header_features = make_pipeline(
        PandasSelector(columns=["raw_data"]),
        EmailHeadersParse(columns=parsed_headers),
        make_union(
            header_tokens("subject"),
            header_tokens("content-type"),
            header_tokens("precedence"),
            header_tokens("auto-submitted"),
            header_tokens("auto-response-suppress"),
            header_tokens("autoreply"),
            header_mapped("return-path", return_path_transform),
            header_tokens("delivered-to"),
            header_tokens("auto-response-suppress"),
            header_tokens("ms-has-attach"),
            # only the length of these two headers is used as a feature
            header_mapped("failed-recipients", len),
            header_mapped("cc", len),
            header_tokens("from", min_df=10),
        )
    )

    # binary bag of words over the whole raw message
    raw_message_tokens = make_pipeline(
        PandasSelector(columns=["raw_data"]),
        CountVectorizer(binary=True, min_df=5)
    )

    # beginning of the message sentiment
    body_head_sentiment = make_pipeline(
        MapTransformer(func=cut_beginning, n_jobs=-1, reshape_2d=False),
        make_union(
            MapTransformer(func=textblob_sentiment_polarity, reshape_2d=True, n_jobs=-1),
            MapTransformer(func=afinn, reshape_2d=True, n_jobs=-1)
        )
    )

    # beginning of the message bag of words
    body_head_tokens = make_pipeline(
        MapTransformer(func=stem, n_jobs=-1),
        MapTransformer(func=cut_beginning, n_jobs=-1),
        make_union(
            CountVectorizer(min_df=5, ngram_range=(1, 2), binary=True),
            CountVectorizer(min_df=5, analyzer="char_wb", ngram_range=(1, 2), binary=True),
        )
    )

    # whole message bag of words
    body_full_tokens = make_pipeline(
        MapTransformer(func=stem, n_jobs=-1),
        make_union(
            CountVectorizer(min_df=5, binary=True),
            CountVectorizer(min_df=5, analyzer="char_wb", ngram_range=(1, 2), binary=True),
        )
    )

    # features taken from the plain-text message body
    body_features = make_pipeline(
        PandasSelector(columns=["message_body_plain"]),
        make_union(body_head_sentiment, body_head_tokens, body_full_tokens),
    )

    pipeline = make_pipeline(
        make_union(header_features, raw_message_tokens, body_features),
        # with_mean=False scales without centering, which keeps sparse
        # input sparse
        StandardScaler(with_mean=False),
        ReportShape(),
        est
    )
```

Growbots Recruitment challenge

https://github.com/GSzpak/recruitment-challenge

Especially this fragment

https://github.com/GSzpak/recruitment-challenge/blob/master/src/json_transformer/run_transform_jsons.py