<a href="https://colab.research.google.com/github/HSE-LAMBDA/MLatFIAN2020/blob/master/seminar08/MLatFIAN_2020_seminar08_MISC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://github.com/HSE-LAMBDA/MLatFIAN2020/raw/master/seminar01/train.csv

In [None]:
import numpy as np
import pandas as pd

# Load the downloaded table, using the PassengerId column as the row index.
data = pd.read_csv('train.csv', index_col='PassengerId')
# Preview the first rows as the cell output.
data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 100 rows for testing; the fixed random_state keeps the split
# reproducible between runs.
data_train, data_test = train_test_split(data, test_size=100, random_state=123)
def get_Xy(Xy, target='Survived'):
  """Split a table into a feature frame and a target column.

  Args:
    Xy: pandas DataFrame holding the features together with the target.
    target: name of the target column. Defaults to 'Survived', the label
      used throughout this notebook, so existing one-argument calls keep
      their behaviour.

  Returns:
    An ``(X, y)`` tuple: ``X`` is ``Xy`` without the target column (a new
    frame, the input is not modified) and ``y`` is the target column. The
    pair can be unpacked straight into fit/score calls, e.g.
    ``model.fit(*get_Xy(df))``.

  Raises:
    KeyError: if ``target`` is not a column of ``Xy``.
  """
  return Xy.drop(target, axis=1), Xy[target]

# Combining preprocessors

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer

In [None]:
# Categorical columns: replace missing values with an explicit 'unknown'
# category, then one-hot encode. handle_unknown='ignore' makes the encoder
# output zeros in that feature's one-hot columns for a category it did not
# see during fit, instead of raising at transform time (which would happen
# e.g. if 'unknown' only ever shows up in the held-out rows).
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='unknown'),
    OneHotEncoder(handle_unknown='ignore')
)
# Numeric columns: fill gaps with the column median, then apply RobustScaler.
numecir_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    RobustScaler()
)

categorical_features = ['Pclass', 'Sex', 'Embarked']
numeric_features = ['Age', 'SibSp', 'Fare']

# Route each column group through its own preprocessing pipeline, drop every
# other column, and feed the result to the classifier.
model = make_pipeline(
    make_column_transformer(
        (categorical_pipeline, categorical_features),
        (numecir_pipeline, numeric_features),
        remainder='drop'
    ),
    XGBClassifier()
)

# Fit on the training split; the cell output is the score on the held-out rows.
model.fit(*get_Xy(data_train))
model.score(*get_Xy(data_test))

In [None]:
# Fitted one-hot encoder sitting inside the categorical sub-pipeline.
ohe = model['columntransformer'].named_transformers_['pipeline-1']['onehotencoder']

# Names of the transformed columns, in the order the column transformer was
# declared: one '<feature>_<category>' label per one-hot column first, then
# the numeric features under their original names.
final_features = [
    f'{name}_{level}'
    for name, levels in zip(categorical_features, ohe.categories_)
    for level in levels
] + list(numeric_features)
final_features

In [None]:
# Table of the classifier's feature importances, largest first.
importances = model[-1].feature_importances_
print(f'{"importance":12s}      {"feature":15s}')
for i in np.flip(np.argsort(importances)):
  print(f'{importances[i]:12.3f}      {final_features[i]:15s}')

# Permutation importance

In [None]:
!pip install eli5

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

In [None]:
# Permutation importance of the already-fitted final estimator (model[-1]).
# It is evaluated on the held-out rows, which are first pushed through the
# preprocessing steps (model[:-1]) because the bare estimator is given
# transformed features, not the raw frame.
perm = PermutationImportance(model[-1], random_state=1)
X_test, y_test = get_Xy(data_test)  # also reused by later cells
perm.fit(
    model[:-1].transform(X_test),
    y_test
)

# final_features labels the transformed columns in the displayed table.
eli5.show_weights(perm, feature_names=final_features)

# Working with text features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer with default settings; the commented-out argument
# below is left as a switch to experiment with n-gram ranges.
vec = CountVectorizer(
#    ngram_range=(1, 2)
)

# Tiny toy corpus: three short sentences.
text = [
  "Hello! I'm Jack. What's your name?",
  "Hi! My name is Jill. Pleased to meet you!",
  "Pleased to meet you too!"
]

vec.fit(text)
# Cell output: mapping from each learned term to its column index.
vec.vocabulary_

In [None]:
# Order the vocabulary terms by their column index, so that position j holds
# the term counted in column j of the transformed matrix.
columns = np.array(sorted(vec.vocabulary_, key=vec.vocabulary_.get), dtype='object')

# Per-sentence term counts as a labelled table.
pd.DataFrame(vec.transform(text).todense(), columns=columns)

In [None]:
# Character n-grams (3 to 4 characters, 'char_wb' analyzer) over the Name
# column of the training split, limited to 100 features.
vec = CountVectorizer(analyzer='char_wb', ngram_range=(3, 4), max_features=100)
vec.fit(get_Xy(data_train)[0]['Name'])

# Terms ordered by their column index in the transformed matrix.
columns = np.array(sorted(vec.vocabulary_, key=vec.vocabulary_.get), dtype='object')

# N-gram counts for every training name as a labelled table.
pd.DataFrame(vec.transform(get_Xy(data_train)[0]['Name']).todense(), columns=columns)

In [None]:
# Classifier that sees nothing but character n-gram counts of the Name
# column; every other column is dropped.
name_vectorizer = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 4),
    max_features=100
)
name_only_columns = make_column_transformer(
    (name_vectorizer, 'Name'),
    remainder='drop'
)
model = make_pipeline(name_only_columns, XGBClassifier())

# Fit on the training split; the cell output is the score on the held-out rows.
model.fit(*get_Xy(data_train))
model.score(*get_Xy(data_test))

In [None]:
# Display eli5's weight table for the fitted name-only pipeline.
eli5.show_weights(model)

In [None]:
# Explain the prediction for the name in the first held-out row.
name = X_test['Name'].iloc[0]
print(name)

# The fitted vectorizer lets eli5 turn the raw string into the n-gram
# features the classifier was trained on.
fitted_vectorizer = model[0].named_transformers_['countvectorizer']
eli5.show_prediction(
    model[-1], name,
    vec=fitted_vectorizer, show_feature_values=True,
)

In [None]:
# Explain the prediction for the name in the sixth held-out row (position 5).
name = X_test['Name'].iloc[5]
print(name)

# The fitted vectorizer lets eli5 turn the raw string into the n-gram
# features the classifier was trained on.
fitted_vectorizer = model[0].named_transformers_['countvectorizer']
eli5.show_prediction(
    model[-1], name,
    vec=fitted_vectorizer, show_feature_values=True,
)

# Numeric + Categorical + Text

In [None]:
# Feature column names in frame order; used below to address columns by
# integer position instead of by name.
all_columns = list(get_Xy(data_train)[0].columns)

# Categorical columns: replace missing values with an explicit 'unknown'
# category, then one-hot encode. handle_unknown='ignore' makes the encoder
# output zeros in that feature's one-hot columns for a category it did not
# see during fit, instead of raising at transform time.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='unknown'),
    OneHotEncoder(handle_unknown='ignore')
)
# Numeric columns: fill gaps with the column median, then apply RobustScaler.
numecir_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    RobustScaler()
)
# Text column: counts of character n-grams (3 to 4 characters), 100 features.
text_pipeline = CountVectorizer(
  ngram_range=(3, 4),
  analyzer='char_wb',
  max_features=100
)

categorical_features = ['Pclass', 'Sex', 'Embarked']
numeric_features = ['Age', 'SibSp', 'Fare']
text_features = 'Name'

# Columns are selected by position rather than by name.
# NOTE(review): presumably so the transformer also accepts plain positional
# arrays (a later cell passes `obj.values` to eli5) -- confirm.
# The text column is given as a single scalar index, not a list.
model = make_pipeline(
    make_column_transformer(
        (categorical_pipeline, [all_columns.index(f) for f in categorical_features]),
        (numecir_pipeline, [all_columns.index(f) for f in numeric_features]),
        (text_pipeline, all_columns.index(text_features)),
        remainder='drop'
    ),
    XGBClassifier()
)

# Fit on the training split; the cell output is the score on the held-out rows.
model.fit(*get_Xy(data_train))
model.score(*get_Xy(data_test))

In [None]:
# Attach a get_feature_names() callable to the fitted categorical
# sub-pipeline. It returns one '<feature>_<category>' label per one-hot
# column, built from the encoder's learned categories_.
# NOTE(review): presumably needed so eli5 can obtain feature names through
# the column transformer -- confirm against the installed library versions.
# The lambda looks up `model` and `categorical_features` when it is called,
# not when it is defined, so it follows any later reassignment of those names.
model['columntransformer'].named_transformers_['pipeline-1'].get_feature_names = (
    lambda: [
      f'{feature}_{val}'
      for feature, category_set in zip(
          categorical_features,
          model['columntransformer'].named_transformers_['pipeline-1']['onehotencoder'].categories_
      ) for val in category_set
    ]
)

# Same for the numeric sub-pipeline: its columns are reported under the
# names in `numeric_features` (also looked up at call time).
model['columntransformer'].named_transformers_['pipeline-2'].get_feature_names = (
    lambda: numeric_features
)

In [None]:
# Explain the combined model's prediction for the first held-out row.
obj = X_test.iloc[0]
print(obj['Name'])

# The row is handed over as a plain array of values, and the fitted column
# transformer is passed as `vec`.
# NOTE(review): this appears to rely on the transformer selecting columns by
# integer position and on the get_feature_names attributes attached to its
# sub-pipelines in an earlier cell -- confirm.
eli5.show_prediction(
    model[-1], obj.values,
    show_feature_values=True, vec=model['columntransformer'],
)

# Calibration curve

In [None]:
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

In [None]:
# Calibration curve of the uncalibrated model on the held-out rows, drawn
# against the diagonal as a reference.
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)

positive_class_proba = model.predict_proba(X_test)[:, 1]
curve_x, curve_y = calibration_curve(
    y_test, positive_class_proba, n_bins=5, strategy='quantile'
)
ax.plot(curve_x, curve_y)
ax.plot([0, 1], [0, 1], '--', color='black')
ax.set_xlabel("fraction of positives")
ax.set_ylabel("predicted probability");

In [None]:
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# Same column preprocessing as the combined model, but the classifier is
# wrapped in CalibratedClassifierCV (cv=3, isotonic method).
preprocessing = make_column_transformer(
    (categorical_pipeline, [all_columns.index(f) for f in categorical_features]),
    (numecir_pipeline, [all_columns.index(f) for f in numeric_features]),
    (text_pipeline, all_columns.index(text_features)),
    remainder='drop'
)
calibrated_classifier = CalibratedClassifierCV(XGBClassifier(), cv=3, method='isotonic')
calibrated_model = make_pipeline(preprocessing, calibrated_classifier)

# Trailing semicolon suppresses the fitted pipeline's repr in the cell output.
calibrated_model.fit(*get_Xy(data_train));

In [None]:
# Calibration curves of the plain and the calibrated model on the held-out
# rows, drawn in one figure against the diagonal as a reference.
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)

for curve_label, fitted_model in [
    ('before calibration', model),
    ('after calibration', calibrated_model),
]:
  ax.plot(
      *calibration_curve(y_test, fitted_model.predict_proba(X_test)[:, 1],
                         n_bins=5, strategy='quantile'),
      label=curve_label
  )
ax.plot([0, 1], [0, 1], '--', color='black')
ax.set_xlabel("fraction of positives")
ax.set_ylabel("predicted probability")
ax.legend();