# Dzień 2

### - Feature Selection
### - Pipelines
### - GridSearch / Hyper Parameter Selection
### - Cross Validation
### - Random Forest
### - XGBoost
---

# Feature Selection

In [None]:
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import numpy as np
plt.style.use("dark_background")


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine

# Load the bundled wine dataset and wrap it in a DataFrame: one column per
# feature, plus the class label appended as the last column `target`.
wine_data = load_wine()
wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names).assign(
    target=wine_data.target
)

In [None]:
wine_df

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the class label.
X = wine_df.drop(['target'], axis=1)
y = wine_df['target']

# Hold out 30% of the rows for testing. `stratify=y` keeps the class
# proportions equal in both parts; the fixed `random_state` makes the split,
# and therefore every score computed from it below, reproducible after a
# kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=42,
)


### - `shuffle` - dane mają być losowo "potasowane"
### - `stratify`  - poszczególne klasy mają być reprezentowane proporcjonalnie w zbiorze testowym


In [None]:
X_train.var(axis=0)

In [None]:
from sklearn.preprocessing import Normalizer
# Fit the normalizer on the training rows and transform them in one step,
# then look at the per-column variance of the normalised data.
norm = Normalizer()
norm_X_train = norm.fit_transform(X_train)
norm_X_train.var(axis=0)

### Eliminacja kolumn o niskiej zmienności

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Drop every column of the normalised data whose variance is below 1e-6
# and show how many columns survive.
selector = VarianceThreshold(threshold=1e-6)
selector.fit(norm_X_train)
selected_features = selector.transform(norm_X_train)
selected_features.shape

## "Ręczne" zrzucanie kolumn

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

dt = DecisionTreeClassifier(random_state=42)

# Baseline: the tree trained on all columns.
preds = dt.fit(X_train, y_train).predict(X_test)
f1_score_all = round(f1_score(y_test, preds, average='weighted'), 3)

# Same tree re-trained with two hand-picked columns removed.
columns_to_remove = ['hue', 'nonflavanoid_phenols']
X_train_sel = X_train.drop(columns=columns_to_remove)
X_test_sel = X_test.drop(columns=columns_to_remove)
preds_sel = dt.fit(X_train_sel, y_train).predict(X_test_sel)
f1_score_sel = round(f1_score(y_test, preds_sel, average='weighted'), 3)
f1_score_sel

## Zrzucanie kolumn testem $\chi^2$

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Work on independent copies so this experiment cannot disturb the base split.
X_train_v2, X_test_v2, y_train_v2, y_test_v2 = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()

# For every k from 1 up to the number of available columns: keep the k columns
# with the best chi2 score, retrain the tree `dt` and record the weighted F1.
f1_score_list = []
n_features = X_train_v2.shape[1]
for k in range(1, n_features + 1):
    selector = SelectKBest(chi2, k=k)
    selector.fit(X_train_v2, y_train_v2)

    sel_X_train_v2 = selector.transform(X_train_v2)
    sel_X_test_v2 = selector.transform(X_test_v2)

    dt.fit(sel_X_train_v2, y_train_v2)
    kbest_preds = dt.predict(sel_X_test_v2)
    # Score against this section's own copy of the labels (was `y_test`).
    f1_score_kbest = round(f1_score(y_test_v2, kbest_preds, average='weighted'), 3)
    f1_score_list.append(f1_score_kbest)

print(f1_score_list)

In [None]:
# Bar chart of the weighted F1 score against the number of columns kept.
# f1_score_list[i] belongs to k = i + 1, so the bars are placed at 1..N
# (the previous version drew them at 0..N-1 and rebound the global `y`).
fig, ax = plt.subplots(figsize=(12, 6))
ks = list(range(1, len(f1_score_list) + 1))
ax.bar(ks, f1_score_list, width=0.4)
ax.set_xticks(ks)
ax.set_xlabel('Ilość wymiarów wybranych testem chi2')
ax.set_ylabel('F1-Score (weighted)')
ax.set_ylim(0, 1.2)
# Print each score just above its bar.
for k, score in zip(ks, f1_score_list):
    ax.text(x=k, y=score + 0.05, s=str(score), ha='center')

fig.tight_layout()

## Syntetyczne dane klasyfikacyjne

In [None]:
from sklearn.datasets import make_classification

# Synthetic binary classification problem with 20 columns.
# Note: this rebinds the globals `X` and `y` used by the wine example above.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=5,
    n_redundant=5,
    n_repeated=5,
    n_classes=2,
)

pd.DataFrame(X)

---
# <span style="color: magenta">Ćwiczenie - w powyższym zbiorze jest 5 przydatnych kolumn. Zastosować mechanizmy selekcji wymiarów do redukcji w/w tabeli do 5 wymiarów</span>
---

## Recursive Feature Elimination

### - korzystając z osobnego estymatora posiadającego `coef_` lub `feature_importances_` iteracyjnie odrzucane są cechy o najmniejszych wagach

In [None]:
from sklearn.feature_selection import RFE

# Work on independent copies of the wine split for this section.
X_train_v3, X_test_v3, y_train_v3, y_test_v3 = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
# Recursive feature elimination driven by the decision tree `dt`:
# remove one feature per round (step=1) until 4 features are left.
RFE_selector = RFE(estimator=dt, n_features_to_select=4, step=1)
RFE_selector.fit(X_train_v3, y_train_v3)

In [None]:
X_train_v3.columns[RFE_selector.support_]

In [None]:
# Reduce both parts of the split to the columns chosen by RFE.
sel_X_train_v3 = RFE_selector.transform(X_train_v3)
sel_X_test_v3 = RFE_selector.transform(X_test_v3)

# Retrain the tree on the reduced data and report the weighted F1 score.
RFE_preds = dt.fit(sel_X_train_v3, y_train_v3).predict(sel_X_test_v3)
rfe_f1_score = round(f1_score(y_test_v3, RFE_preds, average='weighted'), 3)
print(rfe_f1_score)

## Select from model

### - korzystając z osobnego estymatora posiadającego `coef_` lub `feature_importances_` jednorazowo odrzucane są cechy, których waga jest poniżej progu (`threshold`)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Work on independent copies of the wine split for this section.
X_train_v4, X_test_v4, y_train_v4, y_test_v4 = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()

# Select features using a freshly created (unseeded) decision tree as the
# scoring estimator.
sfm_selector = SelectFromModel(estimator=DecisionTreeClassifier())
sfm_selector.fit(X_train_v4,  y_train_v4)

In [None]:
# Names of the columns kept by SelectFromModel. Use the frame the selector was
# fitted on: the global `X` was rebound to a NumPy array by the
# make_classification cell and has no `.columns`.
X_train_v4.columns[sfm_selector.get_support()]

In [None]:
# Reduce both parts of the split to the columns chosen by SelectFromModel.
sel_X_train_v4 = sfm_selector.transform(X_train_v4)
sel_X_test_v4 = sfm_selector.transform(X_test_v4)

# Retrain the tree on the reduced data and report the weighted F1 score.
dt.fit(sel_X_train_v4, y_train_v4)
sfm_preds = dt.predict(sel_X_test_v4)
sfm_f1_score = round(f1_score(y_test_v4, sfm_preds, average='weighted'),3)
# Print the score of THIS selector (the cell used to print `rfe_f1_score`).
print(sfm_f1_score)

---
# <span style="color: magenta">Ćwiczenie - w klasyfikatorze spamu z wczoraj, zredukować ilość wymiarów w `DecisionTreeClassifier`. </span>

# <span style="color: magenta">Ćwiczenie - Wyświetlić tak uproszczone drzewo. </span>
---

# Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Two-step pipeline: feature selection driven by a linear SVM, followed by a
# decision tree trained on the surviving columns.
feature_selection_step = ('feature_selection', SelectFromModel(LinearSVC()))
classification_step = ('classification', DecisionTreeClassifier())

ppln_clf = Pipeline(steps=[feature_selection_step, classification_step])
ppln_clf.fit(X_train, y_train)

In [None]:
# Predict with the whole pipeline (selection + tree) on the held-out rows
# and show the weighted F1 score rounded to 3 decimals.
ppln_preds = ppln_clf.predict(X_test)
ppln_f1_score = round(f1_score(y_test, ppln_preds, average='weighted'),3)
ppln_f1_score

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Same pipeline shape as before, but the selection step now keeps the 4
# columns with the best chi2 score.
feature_selection_step = ('feature_selection', SelectKBest(chi2, k=4))
classification_step = ('classification', DecisionTreeClassifier())

ppln_clf = Pipeline(steps=[feature_selection_step, classification_step])
ppln_clf.fit(X_train, y_train)

In [None]:
# Score the chi2-based pipeline on the held-out rows (weighted F1, 3 decimals).
# This rebinds `ppln_preds` / `ppln_f1_score` from the previous pipeline.
ppln_preds = ppln_clf.predict(X_test)
ppln_f1_score = round(f1_score(y_test, ppln_preds, average='weighted'),3)
ppln_f1_score

---
# <span style="color: magenta">Ćwiczenie - Przerobić kod z poprzedniego ćwiczenia w `pipeline` </span>
---

In [None]:
import pandas as pd

# Read the adverts, derive the price per square metre and discard the rows
# where it could not be computed.
data = (
    pd.read_csv('data/adverts_29_04.csv', sep=';')
    .assign(cena_za_metr=lambda ads: ads['Cena'] / ads['Wielkość (m2)'])
    .dropna(subset=['cena_za_metr'])
)
# The raw price and the publication date are not used as features.
df = data.drop(columns=['Cena', 'Data dodania'])
df

In [None]:
# One-hot encode the categorical advert attributes; all remaining columns are
# carried over unchanged.
categorical_columns = [
    'Lokalizacja',
    'Na sprzedaż przez',
    'Rodzaj nieruchomości',
    'Liczba pokoi',
    'Liczba łazienek',
    'Parking',
]
dum_df = pd.get_dummies(df, columns=categorical_columns)
dum_df

In [None]:
dum_df.columns

In [None]:
import matplotlib.pyplot as plt

# Density histogram (50 bins) of the price per square metre, before the
# outliers are removed.
hist_style = dict(density=True, histtype='bar', facecolor='b', alpha=0.5)
plt.hist(dum_df['cena_za_metr'], bins=50, **hist_style)

plt.show()

In [None]:
# Remove, in place, the adverts priced above 30000 per square metre.
too_expensive = dum_df.loc[dum_df['cena_za_metr'] > 30000.0].index
dum_df.drop(too_expensive, inplace=True)

In [None]:
import matplotlib.pyplot as plt

# The same density histogram of the price per square metre, now drawn after
# the rows above 30000 were dropped.
plt.hist(
    dum_df['cena_za_metr'],
    bins=50,
    density=True,
    histtype='bar',
    facecolor='b',
    alpha=0.5,
)

plt.show()

In [None]:
# Correlation of every numeric column with the price per square metre.
# `dum_df` still contains the free-text column `opis`, on which `.corr()`
# raises in pandas >= 2.0, so restrict the frame to numeric/boolean columns
# first (older pandas silently skipped non-numeric columns, so the result is
# unchanged there).
dum_df.select_dtypes(include=['number', 'bool']).corr()['cena_za_metr']

## Mapa ciepła korelacji parametrów

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

plt.figure(figsize=(20,15))
plt.style.use("dark_background")

# Heat map of the pairwise correlations. The text column `opis` is excluded
# explicitly because `.corr()` raises on non-numeric columns in pandas >= 2.0.
numeric_part = dum_df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_part.corr(), cmap="seismic", annot=True, vmin=-1, vmax=1);

In [None]:
# Density histogram (50 bins) of the flat size in square metres.
flat_sizes = dum_df['Wielkość (m2)']
plt.hist(flat_sizes, bins=50, density=True, histtype='bar', facecolor='b', alpha=0.5)

plt.show()

In [None]:
# Remove, in place, the adverts for properties larger than 300 m2.
oversized = dum_df.loc[dum_df['Wielkość (m2)'] > 300.0].index
dum_df.drop(oversized, inplace=True)
dum_df

---
# Regresja

In [None]:
from sklearn.linear_model import LinearRegression

# Target: price per square metre. Features: everything except the target and
# the free-text description `opis`.
y = dum_df['cena_za_metr']
X = dum_df.drop(['opis', 'cena_za_metr'], axis=1)

# Fit on ALL rows; the score in the next cell is therefore a training score.
reg = LinearRegression().fit(X, y)

In [None]:
reg.score(X, y)

# $R^2$

https://en.wikipedia.org/wiki/Coefficient_of_determination

Współczynnik determinacji - jaka część wariancji zmiennej objaśnianej jest wyjaśniana przez zmienne objaśniające

- 1.0 - Idealne dopasowanie
- 0.0 - Funkcja stała
- ... ale może być i ujemna

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the adverts (fixed seed), fit on the rest and report the
# model's score on the held-out part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=17)
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_test, y_test)

---
## Walidacja krzyżowa

![Walidacja krzyżowa](img/xvi.png)

https://scikit-learn.org/stable/modules/cross_validation.html

In [None]:
from sklearn.model_selection import cross_val_score
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


# Re-create the 90/10 split, then run 10-fold cross-validation of a linear
# regression on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=17)
scores = cross_val_score(LinearRegression(), X_train, y_train, cv=10)
print(list(scores))
print()
# Mean fold score and twice the standard deviation across folds.
print("Mean r^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## Zmiana techniki scoringu

In [None]:
from sklearn.metrics import mean_squared_error, make_scorer


# Same model, 5 folds, but each fold is scored with the mean squared error
# instead of the estimator's default score.
scores = cross_val_score(LinearRegression(), X_train, y_train, scoring=make_scorer(mean_squared_error), cv=5)
print(list(scores))
print()
print("Mean square error: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## Zmiana regresora

In [None]:
from sklearn.svm import SVR

# 5-fold cross-validation of a support vector regressor: linear kernel, C=1000.
scores = cross_val_score(SVR(kernel='linear', C=1000), X_train, y_train, cv=5)
print(list(scores))
print()
print("Mean r^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.svm import SVR

# Same experiment with C=10, to compare against the C=1000 run.
scores = cross_val_score(SVR(kernel='linear', C=10), X_train, y_train, cv=5)
print(list(scores))
print()
print("Mean r^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.svm import SVR

# Same experiment with the intermediate value C=100.
scores = cross_val_score(SVR(kernel='linear', C=100), X_train, y_train, cv=5)
print(list(scores))
print()
print("Mean r^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.svm import SVR

# Switch the kernel from linear to RBF while keeping C=1000.
scores = cross_val_score(SVR(kernel='rbf', C=1000), X_train, y_train, cv=5)
print(list(scores))
print()
print("Mean r^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.svm import SVR

# 10-fold cross-validation of a gradient boosting regressor with default
# settings (no seed is set here, so scores may vary between runs).
scores = cross_val_score(GradientBoostingRegressor(), X_train, y_train, cv=10)
print(list(scores))
print()
print("Mean r^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## Hiperparametry
### `C` - współczynnik regularyzacji - odwrotność siły regularyzacji czyli __*"czynienia rozwiązania prostszym"*__


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.style.use("dark_background")
# Illustration for the regularisation discussion: a cubic curve, three sample
# points and a straight line drawn through the same region.
# NOTE(review): this cell rebinds the globals `x` and `y`; `y` previously held
# the regression target (X_train / y_train are not affected).
plt.figure(figsize=(10,6))
x = np.linspace(-0.5, 2, 100)
# The cubic x * (5x - 1) * (x - 2) evaluated on 100 points.
plt.plot(x, x*(5*x-1)*(x-2))
y = np.linspace(0, 1.4, 3)
# Three magenta points lying on the line -3.4 * t.
plt.scatter(y, -3.4*y, color="#FF00FF");
z = np.linspace(-0.5, 2, 50)
# A straight line with the same slope, shifted down by 0.3.
plt.plot(z, -3.4*z-0.3);

## GridSearch elementów `Pipeline`

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from warnings import simplefilter
simplefilter(action='ignore', category=DeprecationWarning)

# Pipeline whose steps are swapped and tuned by the grid searches below.
# 'passthrough' makes the scaling step a no-op until a scaler is plugged in
# through the parameter grid.
pipe = Pipeline([
    ('scale',  'passthrough'),
    ('regression', SVR())
])



In [None]:
pipe.set_params(regression__C=10)

In [None]:
param_grid = dict(regression__C=[0.1, 10, 100])

In [None]:
from sklearn.model_selection import GridSearchCV

# Build (but do not fit) a 3-fold grid search over the values in `param_grid`;
# the last line only displays the configured object.
grid_search = GridSearchCV(pipe, param_grid, verbose=1, cv=3)
grid_search

## Scalers
- StandardScaler (standaryzacja - odejmuje średnią, dzieli przez odchylenie standardowe)
- Normalizer (normalizacja - dzieli przez długość - sprowadza do wektora o normie 1)
- RobustScaler (odejmuje medianę i skaluje kwartylami)

In [None]:
from time import time

# Grid over the pipeline: which scaler to use for the `scale` step (or none),
# two values of the SVR's C, and the linear kernel only.
param_grid = dict( 
    scale=['passthrough', StandardScaler(), Normalizer()],
    regression__C=[ 10, 100],
    regression__kernel=['linear']
)
                  
print(param_grid)

###################################

# 5-fold cross-validation of every combination, run in 2 parallel jobs.
grid_search = GridSearchCV(pipe, param_grid, verbose=1, cv=5, n_jobs=2)

###################################

# Time the whole search, then report the winning pipeline and its mean score.
t0 = time()
grid_search.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()
print("Best parameters set:")
print(grid_search.best_estimator_)
print()
print(f"Best score: {grid_search.best_score_}")

- 5 - Walidacja krzyżowa
- 2 - parametry $C$
- 3 - scaler

$ 5 * 2 * 3 = 30 $ przebiegów

---
# Random Forest Classifier

https://www.kaggle.com/datasets/yufengsui/portuguese-bank-marketing-data-set

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [None]:
bank_data = pd.read_csv('data/bank-full.csv', sep=';')
bank_data

## Binaryzacja 

In [None]:
# Turn the yes/no text columns into 0/1 integers.
# `default` additionally maps 'unknown' to 0.
bank_data['default'] = bank_data['default'].map({'no':0,'yes':1,'unknown':0})

yes_no_to_int = {'no':0,'yes':1}
for binary_column in ('y', 'housing', 'loan'):
    bank_data[binary_column] = bank_data[binary_column].map(yes_no_to_int)
bank_data

## Konwersja zmiennych kategorycznych

In [None]:
# One-hot encode the remaining categorical columns. This rebinds `bank_data`,
# so re-running the cell fails because the listed columns no longer exist.
bank_data = pd.get_dummies(bank_data, columns=['job', 'marital', 'education', 'contact', 'month', 'poutcome'])
bank_data

In [None]:
# Features: every column except the target `y`.
X = bank_data.drop('y', axis=1)
y = bank_data['y']

# 80/20 split. No random_state or stratify is given, so the split (and the
# scores below) differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Random forest with default hyper-parameters, fitted on the training part.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [None]:
# Accuracy of the default forest on the held-out 20%.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## Wiele estymatorów

In [None]:
 rf.estimators_

In [None]:

# Draw the first three trees of the fitted forest, cut off at depth 2.
for i in range(3):
    tree = rf.estimators_[i]
    # Export the tree as Graphviz DOT text: coloured nodes, no impurity
    # values, sample counts shown as proportions.
    dot_data = export_graphviz(tree,
                               feature_names=X_train.columns,  
                               filled=True,  
                               max_depth=2, 
                               impurity=False, 
                               proportion=True)
    graph = graphviz.Source(dot_data)
    # `display` is provided by IPython in a notebook session.
    display(graph)

## `RandomizedSearchCV`

In [None]:
# Distributions to sample hyper-parameters from: scipy `randint(50, 500)` for
# the number of trees and `randint(1, 20)` for the maximum depth.
param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

# A fresh, unfitted forest; this rebinds `rf` from the cells above.
rf = RandomForestClassifier()

# Try 5 random combinations, each evaluated with 5-fold cross-validation,
# using 3 parallel jobs.
rand_search = RandomizedSearchCV(rf, 
                                 param_distributions = param_dist, 
                                 n_iter=5, 
                                 cv=5, n_jobs=3)

rand_search.fit(X_train, y_train)

## Najlepszy estymator

In [None]:
# The forest with the best cross-validated score found by the random search.
best_rf = rand_search.best_estimator_

print('Best hyperparameters:',  rand_search.best_params_)

In [None]:
# Predict with the best forest on the held-out rows.
y_pred = best_rf.predict(X_test)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)

# The trailing `;` suppresses the textual repr of the returned display object.
ConfusionMatrixDisplay(confusion_matrix=cm).plot();

---
# XGBoost

conda install -c conda-forge xgboost

In [None]:
!conda install -c conda-forge xgboost

## <span style="color: cyan">- Numeric features should be scaled </span>
## <span style="color: cyan">- Categorical features should be encoded </span>

https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package
    

In [None]:
import pandas as pd

rain = pd.read_csv("data/weatherAUS.csv")
rain

In [None]:
# Columns removed from the weather data before modelling.
cols_to_drop = ["Date", "Location", "RainTomorrow", "Rainfall"]

# In-place removal: `rain` itself is modified.
rain.drop(columns=cols_to_drop, inplace=True)

## Usuwamy kolumny z dużą ilością `NaN`

In [None]:
# Fraction of missing values in each column.
missing_props = rain.isna().mean(axis=0)

# Columns in which at least 40% of the values are missing.
over_threshold = missing_props[missing_props >= 0.4]

In [None]:
# Remove, in place, the columns flagged as mostly missing.
rain.drop(columns=over_threshold.index, inplace=True)

In [None]:
# Encode the target as 0/1. Values other than 'No'/'Yes' (including missing
# ones) become NaN, and re-running this cell turns the whole column into NaN.
rain['RainToday'] = rain['RainToday'].map({'No':0,'Yes':1})

In [None]:
# Features: everything except the target column; target: `RainToday`.
X = rain.drop("RainToday", axis=1)
y = rain.RainToday

In [None]:
y

## Uzupełniamy braki w kategorycznych kolumnach

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Pre-processing for categorical columns: fill gaps with the most frequent
# value, then one-hot encode (categories unseen during fit are ignored).
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output`; confirm against the installed version.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)

In [None]:
## Uzupełniamy braki w kolumnach liczbowych

In [None]:
from sklearn.preprocessing import StandardScaler

# Pre-processing for numeric columns: fill gaps with the column mean, then
# standardise.
impute_step = ("impute", SimpleImputer(strategy="mean"))
scale_step = ("scale", StandardScaler())

numeric_pipeline = Pipeline(steps=[impute_step, scale_step])

In [None]:
from sklearn.compose import ColumnTransformer

# Column groups routed to the two sub-pipelines, derived from the dtypes of X.
# They were referenced below but never defined anywhere in the notebook.
cat_cols = X.select_dtypes(exclude="number").columns
num_cols = X.select_dtypes(include="number").columns

# Apply the numeric pipeline to the numeric columns and the categorical
# pipeline to all the others; the outputs are concatenated column-wise.
full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
)

In [None]:
import xgboost as xgb

xgb_cl = xgb.XGBClassifier()

In [None]:
# Run the full pre-processing on the features.
X_processed = full_processor.fit_transform(X)
# Fill gaps in the target with its most frequent value. The imputer needs 2-D
# input, hence the reshape to a single column; the result stays 2-D.
y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
    y.values.reshape(-1, 1)
)

from sklearn.model_selection import train_test_split

# Stratified split with a fixed seed (default test size).
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed, stratify=y_processed, random_state=1121218
)

In [None]:
from sklearn.metrics import accuracy_score

# Baseline: XGBoost classifier with default hyper-parameters.
xgb_cl = xgb.XGBClassifier()

xgb_cl.fit(X_train, y_train)

preds = xgb_cl.predict(X_test)

# Accuracy on the held-out rows (displayed as the cell result).
accuracy_score(y_test, preds)


## Grid Search

https://xgboost.readthedocs.io/en/stable/parameter.html

- `eta` / `learning_rate` - redukcja kroku aby uniknąć przetrenowania - `[0,1]`
- `gamma` - minimalna redukcja straty potrzebna do wykonania kolejnego podziału. Większy parametr - algorytm jest bardziej konserwatywny - `[0, Infty]`
- `max_depth` - maksymalna głębokość drzewa
- `subsample` - jaka część danych jest brana do trenowania przy kolejnych krokach
- `sampling_method` - jaka strategia samplingu
- `colsample_*` - rodzina parametrów regulująca subsampling
- `lambda` / `reg_lambda` - regularyzacja L2 wag. Większy parametr - algorytm jest bardziej konserwatywny - `[0, Infty]`
- `scale_pos_weight` - ważenie niezbalansowanych klas

https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html

#### Sterowanie złożonością modelu:
    - Złożoność modelu: `max_depth`, `gamma`, `min_child_weight`
    - Wprowadzenie szumu: `subsample`, `colsample_bytree`
    - Tempo uczenia: `eta` , `num_round`

In [None]:
# Hyper-parameter grid for XGBoost: 4 * 3 * 3 * 3 * 3 * 1 * 1 = 324
# combinations (the last two parameters are held fixed).
param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

In [None]:
from sklearn.model_selection import GridSearchCV

xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

# Exhaustive 3-fold search over `param_grid`, scored by ROC AUC, using all
# available cores.
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")

# The search is fitted on the WHOLE processed data set, i.e. including the
# rows that were held out as X_test / y_test above.
_ = grid_cv.fit(X_processed, y_processed)

In [None]:
 grid_cv.best_score_

----

In [None]:
grid_cv.best_params_

# <span style="color: magenta">Ćwiczenie - dobrać jeszcze lepsze parametry (dla tych z końców zakresu) </span>

# <span style="color: magenta">Ćwiczenie - stworzyć klasyfikator w oparciu o ulepszone parametry. Podać jego skuteczność na zbiorze testowym.</span>


---

# Pipeline dla tekstu

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 

# Reload the adverts and rebuild `df` exactly as in the regression section
# (price per square metre added, rows without it dropped).
data = pd.read_csv('data/adverts_29_04.csv', sep=';')
data['cena_za_metr'] = data['Cena'] / data['Wielkość (m2)']
data = data.dropna(subset=['cena_za_metr'])
df = data.drop(['Cena', 'Data dodania'], axis=1)
# This bare expression is not the last one in the cell, so nothing is shown.
df

# TF-IDF matrix of the advert descriptions; the last line displays its summary.
tfidf = TfidfVectorizer()
tfs = tfidf.fit_transform(df["opis"])
tfs

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

class ItemSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a single item out of its input by key.

    `transform` returns `data_dict[self.key]`; in this notebook the input is
    a DataFrame, so the result is the column named `key`.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        # Stateless: there is nothing to learn.
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


# Text-only regression: take the `opis` column, vectorise it with TF-IDF,
# reduce it to 250 components with truncated SVD and fit a linear regression.
pipeline = Pipeline([
                ('selector', ItemSelector(key='opis')),
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=250)),
                ('linear', LinearRegression())
            ])

# Keep `opis` among the features: the selector step needs it.
y = dum_df['cena_za_metr']
X = dum_df.drop(['cena_za_metr'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
scores = cross_val_score(pipeline, X_train, y_train, cv=3)
print(list(scores))
print()
# This is a regressor scored with the default scorer, so the value is R^2 as
# in the earlier cells; it was mislabelled "Accuracy".
print("Mean r^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from time import time
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

class ItemSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a single item out of its input by key.

    Re-definition of the class from the previous cell, with identical
    behaviour: `transform` returns `data_dict[self.key]`.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        # Stateless: there is nothing to learn.
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

# Grid for the text pipeline: a fixed SVD size, two values of C and two
# kernels for the SVR step.
parameters = {
    'best__n_components': (250,),
    'svr__C': (100, 1000),
    'svr__kernel': ('linear', 'rbf'),
}

# Text-only pipeline as before, now ending in an SVR whose settings (and the
# SVD size) are supplied by the grid search.
pipeline = Pipeline([
                ('selector', ItemSelector(key='opis')),
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD()),
                ('svr', SVR())
            ])

# 3-fold cross-validation of every combination in `parameters`, 2 parallel jobs.
grid_search = GridSearchCV(pipeline, parameters, verbose=1, cv=3, n_jobs=2)


# Keep `opis` among the features: the selector step needs it.
y = dum_df['cena_za_metr']
X = dum_df.drop(['cena_za_metr'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=17)

# Time the search, then print the winning value of every tuned parameter.
t0 = time()
grid_search.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from operator import itemgetter

# TF-IDF restricted to terms that occur in at least 2 documents.
vectorizer = TfidfVectorizer(min_df=2)

# NOTE: this rebinds the global `X` (previously the feature DataFrame) to the
# TF-IDF matrix of the training descriptions.
X = vectorizer.fit_transform(X_train['opis'])



In [None]:
import gzip
import sys
import re
import re

# Token separator: any run of characters that is neither a word character nor
# one of the listed Polish letters.
# NOTE(review): several letters appear twice in the class as rendered here;
# presumably one of each pair was a decomposed (combining-character) form.
# Verify the actual bytes before editing this pattern.
splitter = re.compile(r'[^ąąćęńłóóśśżżź\w]+')
# Matches any token that contains a digit.
isnumber = re.compile(r'[0-9]')

# Build the lemma lookup from the gzipped word list. Each line is a
# comma-separated group of word forms:
#   dictionary: every form (lower-cased) -> the first form on its line
#   set_dict:   the set of all known forms
dictionary = {}
set_dict = set()

# The context manager closes the gzip handle once the file is read (it was
# previously left open for the lifetime of the kernel).
with gzip.open('data/odm.txt.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        forms = [form.strip().lower() for form in line.strip().split(',')]
        for w in forms:
            set_dict.add(w)
            dictionary[w] = forms[0]

def lematize(w):
    """Return the base form of `w` from `dictionary`, or `w` itself if unknown.

    NOTE(review): as rendered here both arguments of every `replace` call look
    identical; presumably the first one was a decomposed (combining-character)
    spelling being normalised to the precomposed letter. Verify the bytes.
    """
    w = w.replace('ą','ą')
    w = w.replace('ó','ó')
    w = w.replace('ę','ę')
    w = w.replace('ż','ż')
    return dictionary.get(w,w)

# Sample description used by the demonstration cells below.
# NOTE(review): `[0]` looks the row up by index LABEL 0, which fails if that
# row was removed by the outlier filters; confirm.
opis1 = dum_df['opis'][0]



# Tokenise every description into a list of raw tokens.
raw_corpus=[]
n=0

for i in dum_df.iterrows():
    n+=1
    # i[1] is the row; `[1]` takes its second value.
    # NOTE(review): presumably that is the `opis` text; verify the column order.
    l = list(splitter.split(i[1][1]))
    raw_corpus.append(l)

    
# Flatten the corpus (each document's tokens are inserted at the FRONT).
all_words = []
for t in raw_corpus:
    all_words[0:0] = t

# Per lower-cased word, count the occurrences written entirely in lower or
# entirely in upper case ('lower') versus all other spellings, e.g.
# capitalised ones ('upper').
words = {}
for w in all_words:
    rec = words.get(w.lower(), {'upper':0, 'lower': 0})
    if w.lower()==w or w.upper()==w:
        rec['lower'] = rec['lower'] +1
    else: 
        rec['upper'] = rec['upper'] +1
    words[w.lower()] = rec

# Stop-word candidates: words whose 'upper' count is at least four times
# their 'lower' count.
raw_stop_words = [ x for x in words.keys() if words[x]['upper']>=words[x]['lower']*4 ]   

set_raw_stop_words = set(raw_stop_words)



In [None]:
raw_stop_words[:20]

In [None]:
def preprocessing(opis, filter_raw=True, filter_dict=True):
    """Turn a description into a list of cleaned, lemmatised tokens.

    The text is split with `splitter` and lower-cased. Tokens of at most two
    characters, tokens containing '_' and tokens containing a digit are
    dropped. With `filter_raw` the tokens found in `set_raw_stop_words` are
    dropped as well; with `filter_dict` only tokens present in `set_dict` are
    kept. Survivors are passed through `lematize`, and lemmas of at most two
    characters are discarded.
    """
    lemmas = []
    for raw_token in splitter.split(str(opis)):
        token = raw_token.lower()
        if len(token) <= 2 or '_' in token:
            continue
        if isnumber.search(token) is not None:
            continue
        if filter_raw and token in set_raw_stop_words:
            continue
        if filter_dict and token not in set_dict:
            continue
        lemma = lematize(token)
        if len(lemma) > 2:
            lemmas.append(lemma)
    return lemmas

In [None]:
opis1

In [None]:
print(preprocessing(opis1))

In [None]:
print(preprocessing(opis1, filter_raw=False))

In [None]:
print(preprocessing(opis1, filter_dict=False))

In [None]:
print(preprocessing(opis1, filter_raw=False, filter_dict=False))

In [None]:
# Add four pre-processed variants of the description, one for each on/off
# combination of the two filters. The suffix letters stand for filter_raw and
# filter_dict respectively (T = on, F = off).
for column, use_raw, use_dict in (
    ("opisTT", True, True),
    ("opisTF", True, False),
    ("opisFT", False, True),
    ("opisFF", False, False),
):
    dum_df[column] = dum_df["opis"].apply(
        lambda x: ' '.join(preprocessing(x, filter_raw=use_raw, filter_dict=use_dict))
    )

---
# Pipeline kompozytowy

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from time import time
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion

class ItemSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a single item out of its input by key.

    Unlike the earlier definitions, `key` defaults to '' here, so the step can
    be created without arguments and configured later (the grid below sets it
    through `union__description__selector2__key`).
    """
    def __init__(self, key=''):
        self.key = key

    def fit(self, x, y=None):
        # Stateless: there is nothing to learn.
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class ItemUnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that removes the given columns from its input frame.

    Parameters
    ----------
    keys : list of column labels, optional
        Columns to drop in `transform`. `None` (the default) drops nothing.
    """
    def __init__(self, keys=None):
        # Stored exactly as passed. The default is `None` rather than a
        # literal `[]`, because a mutable default would be one list object
        # shared by every instance.
        self.keys = keys

    def fit(self, x, y=None):
        # Stateless: there is nothing to learn.
        return self

    def transform(self, data_dict):
        columns_to_drop = [] if self.keys is None else self.keys
        return data_dict.drop(columns_to_drop, axis=1)


# Composite model: two feature branches are computed side by side, joined
# column-wise by the FeatureUnion and fed to one regressor.
pipeline = Pipeline([
   ('union', 
        FeatureUnion(
            transformer_list=[
                # Branch 1: the tabular columns, i.e. everything except the
                # raw and pre-processed description texts.
                ('table', 
                    Pipeline([
                        ('selector1', ItemUnSelector(keys=['opis', 'opisTT', 'opisTF', 'opisFT', 'opisFF'])),
                        ('scaler1', 'passthrough')
                    ])
                ),
                # Branch 2: one description column (chosen through the grid),
                # vectorised with TF-IDF and reduced with truncated SVD.
                ('description', 
                    Pipeline([
                        ('selector2', ItemSelector()),
                        ('tfidf', TfidfVectorizer()),
                        ('best', TruncatedSVD()),
                        ('scaler2', 'passthrough')
                    ])
                )
            ]
        )    

   ),
   # Placeholder step; the grid below replaces it with the actual regressor.
   ('regressor', 
        TransformedTargetRegressor()
    )
])

# Grid for the composite pipeline. Every entry holds a single candidate, so
# the search evaluates exactly one configuration.
parameters = {
    # Relative weights of the two FeatureUnion branches.
    'union__transformer_weights': [{'table': 1.0, 'description': 1.0}],

    # Description branch: SVD size, TF-IDF settings and which text column
    # the selector reads.
    'union__description__best__n_components': (700,),
    'union__description__tfidf__min_df': (3,),
    'union__description__tfidf__binary': (True,),
    'union__description__selector2__key': ['opisFF'],

    # Scalers plugged into the 'passthrough' slots of both branches.
    'union__table__scaler1': [RobustScaler()],
    'union__description__scaler2': [RobustScaler(with_centering=False)],

    # The final estimator replaces the whole 'regressor' step.
    'regressor': [GradientBoostingRegressor()],
}

# 10-fold cross-validation of every combination in `parameters`, 4 parallel jobs.
grid_search = GridSearchCV(pipeline, parameters, verbose=1, cv=10, n_jobs=4)


# Use the complete frame here (no separate hold-out); the text columns stay in
# X because the pipeline's selector steps pick and drop them as needed.
y = dum_df['cena_za_metr']
X = dum_df.drop(['cena_za_metr'], axis=1)

t0 = time()
grid_search.fit(X, y)
print("done in %0.3fs" % (time() - t0))

print(f'Best score: {grid_search.best_score_}')

print("Best parameters set:")
print()
# Print the winning value of every tuned parameter.
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

---
## Problemy z trenowaniem modelu


### To ile tych prób możemy mieć ?

- 3 zestawy wag `union`
- 6 zestawów wymiarów SVD
- 6 zestawów parametrów TF-IDF
- 4 zbiory danych tekstowych
- 4 mechanizmy skalowania części `table`
- 4 mechanizmy skalowania części `description`
- 3 regresory
- 10 walidacji krzyżowych

In [None]:
3*6*6*4*4*4*3*10

powiedzmy - że 3 na minutę

In [None]:
3*6*6*4*4*4*3*10/3/60/24

---

# Podsumowanie

- ## Inżynieria cech i przygotowanie danych to zestaw arbitralnych decyzji
- ## Te decyzje muszą być walidowane
- ## __*Inżynieria cech*__ jest częścią modelu
- ## Uwaga na <span style="color: red">Eksplozję kombinatoryczną</span>

# Bibliografia
- ## [Efficient and Robust Automated Machine Learning](https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf) - __*Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter*__ Advances in Neural Information Processing Systems 28 (2015)