<a href="https://colab.research.google.com/github/LennartKeller/TextklassifikationsProjekt2019/blob/master/HyperparamOptimization_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from typing import Union, List

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from sklearn.metrics import f1_score
from tqdm import tqdm


class PeriodEstimatorWrapper(BaseEstimator):
    """
    Wraps one classifier that is fitted on a single feature matrix and then
    applied to a list of feature matrices (one per period).

    ``fit`` takes a single ``(X, y)`` pair, whereas ``predict``,
    ``predict_proba``, ``decision_function`` and ``score`` take a list of
    matrices and handle every entry independently.
    """

    def __init__(self, clf: BaseEstimator, **params):
        """
        Instantiates the wrapped classifier.

        :param clf: estimator class (or any callable returning an estimator); it is called as ``clf(**params)``
        :param params: keyword arguments forwarded unchanged to ``clf``; a ``verbose`` entry additionally
                       switches on the progress bar of this wrapper
        """
        self.clf = clf(**params)
        # Always define the flag: it used to be set only for a truthy ``verbose``,
        # which made every predict-style method fail with AttributeError otherwise.
        self.verbose = params.get('verbose', False)

    def _check_is_fitted(self):
        """Raises NotFittedError unless ``fit`` has been called on this instance."""
        # ``fitted_`` only exists after ``fit``; getattr avoids an AttributeError
        # masking the intended NotFittedError.
        if not getattr(self, 'fitted_', False):
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' first."
            )

    def _apply_per_period(self, method_name: str, X_test: List[Union[csr_matrix, np.ndarray]]):
        """Calls ``self.clf.<method_name>`` on each matrix in ``X_test`` and collects the results in a list."""
        method = getattr(self.clf, method_name)
        if getattr(self, 'verbose', False):
            iterator = tqdm(X_test, desc='Predicting classes for periods')
        else:
            iterator = X_test
        return [method(X) for X in iterator]

    def fit(self, X_train: Union[csr_matrix, np.ndarray], y_train: np.array):
        """
        Fits the estimator.

        :param X_train: normal feature matrix e.g. shape (n_samples, n_features)
        :param y_train: label vector shape (n_samples,)
        :return: fitted instance of itself
        """
        self.clf.fit(X_train, y_train)
        self.fitted_ = True

        return self

    def predict(self, X_test: List[Union[csr_matrix, np.ndarray]]):
        """
        Predicts classes for n periods

        :param X_test: list of feature matrices (n_samples, n_features) to predict (one for each period)
        :return: list of predicted label vectors
        :raises NotFittedError: if ``fit`` has not been called
        """
        self._check_is_fitted()
        return self._apply_per_period('predict', X_test)

    def predict_proba(self, X_test: List[Union[csr_matrix, np.ndarray]]):
        """
        Predicts probabilities for n periods

        :param X_test: list of feature matrices (n_samples, n_features) to predict (one for each period)
        :return: list with the ``predict_proba`` output of the wrapped classifier for each period
        :raises Exception: if the wrapped classifier has no ``predict_proba`` method
        :raises NotFittedError: if ``fit`` has not been called
        """
        if not hasattr(self.clf, 'predict_proba'):
            raise Exception(f"Method predict_proba is not implemented in {self.clf.__class__.__name__}")

        self._check_is_fitted()
        return self._apply_per_period('predict_proba', X_test)

    def decision_function(self, X_test: List[Union[csr_matrix, np.ndarray]]):
        """
        Predicts decision scores for n periods

        :param X_test: list of feature matrices (n_samples, n_features) to predict (one for each period)
        :return: list with the ``decision_function`` output of the wrapped classifier for each period
        :raises Exception: if the wrapped classifier has no ``decision_function`` method
        :raises NotFittedError: if ``fit`` has not been called
        """
        if not hasattr(self.clf, 'decision_function'):
            raise Exception(f"Method decision_function is not implemented in {self.clf.__class__.__name__}")

        self._check_is_fitted()
        # Delegate to decision_function itself; this used to call predict_proba,
        # which fails for classifiers that only offer decision_function.
        return self._apply_per_period('decision_function', X_test)

    def score(self,
              X_test: List[Union[csr_matrix, np.ndarray]],
              y_true: List[np.array],
              scoring_func: callable = lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro'),
              pooling_func: callable = np.mean):
        """
        Scores the predictions for every period and pools the per-period scores.

        :param X_test: list of feature matrices (n_samples, n_features), one for each period
        :param y_true: list of true label vectors, paired with ``X_test`` by position
        :param scoring_func: callable ``(y_true, y_pred) -> score``; defaults to the macro-averaged F1 score
        :param pooling_func: callable that reduces the list of per-period scores; defaults to ``np.mean``
        :return: result of ``pooling_func`` applied to the list of per-period scores
        :raises NotFittedError: if ``fit`` has not been called
        """
        self._check_is_fitted()

        scores = [scoring_func(y, self.clf.predict(X)) for X, y in zip(X_test, y_true)]

        return pooling_func(scores)


### Problem: Wie tunen wir die Hyperparameter?

Problem: Unsere Idee sieht vor, ein Modell auf allen Genres innerhalb einer "Periode" zu trainieren und auf alle anderen Perioden anzuwenden, um abzuschätzen, wie sehr sich die Genres über die Zeit verändern. Hierbei stellt sich die Frage, wie man die Hyperparameter der Modelle valide und gleichzeitig effektiv optimieren kann.

* Möglichkeit 1:
    * Gridsearch auf Ausgangsperiode
    * Vorteile:
        * Wahrscheinlich am ehesten valide
    * Nachteile:
        * Unsere Datengrundlage ist zu klein, um das für einzelne Epochen sinnvoll durchzuführen
* Möglichkeit 2:
    * Gridsearch auf allen Daten
    * Vorteile:
        * Große Datenmenge
        * Modell würde auf alle Eigenheiten der Perioden getuned werden (wobei das eher ein Nachteil ist)
    * Nachteil:
        * Spätere Testdaten würden fürs Optimieren verwendet werden
* Möglichkeit 3:
    * ParamDict verwenden, um den eigentlichen Lauf (das Trainieren auf einer Epoche und Testen auf allen anderen) mit allen möglichen Hyperparametern zu testen. Eigene Evaluation (bspw. Mittelwert der F1-Scores für die verschiedenen Epochen)
    * Vorteile:
        * Klare Trennung von Test und Trainingsdaten
        * Mehr Daten für die Optimierung als bei Möglichkeit 1
    * Nachteile:
        * keine cross-validation

In [0]:
import pandas as pd

In [13]:
# Install the `stop_words` package; `get_stop_words` is imported from it further down.
!pip install stop_words

Collecting stop_words
  Downloading https://files.pythonhosted.org/packages/1c/cb/d58290804b7a4c5daa42abbbe2a93c477ae53e45541b1825e86f0dfaaf63/stop-words-2018.7.23.tar.gz
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-cp36-none-any.whl size=32916 sha256=40a1030c0ca3eada619d064b0f4bed62c013a5c17daec602aa0d4dec59a5b4d8
  Stored in directory: /root/.cache/pip/wheels/75/37/6a/2b295e03bd07290f0da95c3adb9a74ba95fbc333aa8b0c7c78
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23


In [3]:
from google.colab import drive

# Mount Google Drive under /content/gdrive so the dataset CSV can be read from it.
# This step is interactive: it asks for an authorization code.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [4]:
# List the root of the mounted Drive to check that the dataset file is visible.
!ls /content/gdrive/My\ Drive/

full_dataset.csv


In [0]:
# Load the complete dataset from the mounted Drive; the cells below use its
# `text`, `genre` and `period` columns.
df = pd.read_csv('/content/gdrive/My Drive/full_dataset.csv')

In [0]:
# Drop every row whose genre is 'NEWS'; all other genres are kept.
df = df.loc[df['genre'].ne('NEWS')]

In [0]:
# Split by period: P1 is the period the model is tuned/trained on,
# df_rest holds the rows of all remaining periods.
df_p1 = df[df['period'].eq('P1')]
df_rest = df[df['period'].ne('P1')]

# Feature Extraction

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words

# TF-IDF vectorizer with default settings; the parameters that matter are
# varied later through the `tfidf__*` entries of the grid-search dictionary.
tfidf = TfidfVectorizer()

# Bauen der Pipeline

In [0]:
from sklearn.pipeline import make_pipeline, make_union, Pipeline

In [0]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier

In [17]:
# Two-step pipeline: TF-IDF features followed by a linear SVM with hinge loss.
# The step names 'tfidf' and 'linearsvc' are the prefixes of the keys in the
# parameter grid, so they must not be renamed independently of it.
pipe_svm = Pipeline([('tfidf', tfidf), ('linearsvc', LinearSVC(loss='hinge'))])
pipe_svm

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('linearsvc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
         

In [0]:
# Hyperparameter grid for `pipe_svm`; keys follow the `<step name>__<parameter>` scheme.
pipe_svm_params = {
    'tfidf__max_features': [100, 500, 1000, 5000, 10000, 15000, 20000],
    # NOTE(review): scikit-learn documents `stop_words` as only applying when
    # analyzer == 'word', so for 'char'/'char_wb' both values presumably yield
    # the same features -- consider a list of grids to avoid those repeated fits.
    'tfidf__stop_words': [None, get_stop_words('de')],
    'tfidf__analyzer': ['word', 'char', 'char_wb'],
    # (1, 5) used to be listed twice, so a fifth of all fits were exact
    # duplicates; the repeated entry is replaced by the missing (1, 4), which
    # keeps the number of candidates unchanged.
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
    'linearsvc__C': list(range(1, 21)),
    'linearsvc__penalty': ['l2'],
}

# 1. Möglichkeit: Gridsearch auf Trainingsperiode

In [0]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `pipe_svm_params`, ranking candidates by macro-averaged F1.
# No `cv` argument is given, so the library default is used (the recorded run
# reports 5 folds). n_jobs=-1 requests all available workers.
gridsearch = GridSearchCV(
    pipe_svm,
    pipe_svm_params,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1)

In [0]:
# Run the search on period P1 only: raw texts as input, genres as labels.
gridsearch.fit(df_p1.text, df_p1.genre.to_numpy())

Fitting 5 folds for each of 4200 candidates, totalling 21000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   58.5s


In [0]:
# Show the best parameter combination together with its cross-validated score.
gridsearch.best_params_, gridsearch.best_score_

In [0]:
# Tabulate the per-candidate cross-validation results for inspection.
svm_results = pd.DataFrame(gridsearch.cv_results_)
svm_results