# Dzień 3

---
## Shapley values

W 2017 roku Lundberg i Lee opublikowali artykuł "A Unified Approach to Interpreting Model Predictions".
https://proceedings.neurips.cc/paper/2017/file/8a20a8621978632d76c43dfd28b67767-Paper.pdf
    
Połączyli "Shapley values" (1951, nagroda Nobla z ekonomii 2012) z innymi mechanizmami wyjaśniania modeli, tworząc SHAP values (SHapley Additive exPlanations) i odpowiadającą im bibliotekę
`shap`.

https://en.wikipedia.org/wiki/Shapley_value


In [None]:
import warnings
# Blanket filter: hide warnings of every category for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
import shap

In [None]:
import pandas as pd

# Read the weather CSV into a DataFrame and preview it.
weather_csv_path = "data/weatherAUS.csv"
rain = pd.read_csv(weather_csv_path)
rain

In [None]:
# Columns removed before modelling.
cols_to_drop = ["Date", "Location", "RainTomorrow", "Rainfall"]

# Rebind instead of mutating in place so the statement reads as a pure transformation.
rain = rain.drop(columns=cols_to_drop)

## Usuwamy kolumny z dużą liczbą `NaN`

In [None]:
# Fraction of missing entries per column (mean of the boolean NaN mask).
missing_props = rain.isnull().mean()

# Columns in which at least 40% of the entries are missing.
over_threshold = missing_props.loc[missing_props.ge(0.4)]

In [None]:
# Remove the columns flagged as too sparse.
rain = rain.drop(columns=over_threshold.index)

In [None]:
# Encode the target labels as integers; any value other than "No"/"Yes" becomes NaN.
rain_today_codes = {"No": 0, "Yes": 1}
rain["RainToday"] = rain["RainToday"].map(rain_today_codes)

In [None]:
# Separate the target column from the remaining feature columns.
y = rain["RainToday"]
X = rain.drop(columns="RainToday")

In [None]:
# Display the feature frame.
X

## Uzupełniamy braki najczęstszą wartością i kodujemy kolumny kategoryczne ... ręcznie

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

orig_columns = X.columns

# Names of the non-numeric columns, captured before X is rebound below.
categorical_cols = X.select_dtypes(exclude="number").columns

X = (
    X
    # Fill the gaps of every column with that column's most frequent value.
    .apply(lambda col: col.fillna(col.value_counts().index[0]))
    # One-hot encode the categorical columns. The names are passed through
    # `columns=`; given positionally they would land in get_dummies' `prefix`
    # parameter and only work by coincidence.
    .pipe(pd.get_dummies, columns=categorical_cols)
)
X

## Pipeline ze skalowaniem

In [None]:
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.pipeline import Pipeline

# Two-step pipeline: standardise the features, then fit a gradient-boosted classifier.
scale_step = ("scale", StandardScaler())
classifier_step = ("classifier", xgb.XGBClassifier())

pipeline = Pipeline(steps=[scale_step, classifier_step])

In [None]:
from sklearn.model_selection import train_test_split


# Fill missing target labels with the most frequent class. SimpleImputer works on
# 2-D input, so the labels are reshaped into a single column for the imputer and
# flattened back to 1-D afterwards, which is the shape expected for a target vector.
y_processed = (
    SimpleImputer(strategy="most_frequent")
    .fit_transform(y.values.reshape(-1, 1))
    .ravel()
)

# Stratified split with a fixed seed so the class balance and the split are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_processed, stratify=y_processed, random_state=1121218
)



In [None]:
# Display the training features.
X_train

In [None]:
# Fit the pipeline steps (scaler, then classifier) on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Look up the pipeline step registered under the name "classifier" and display it.
pipeline["classifier"]

In [None]:
from sklearn.metrics import accuracy_score



# Predict on the held-out split and report the share of correctly classified rows.
preds = pipeline.predict(X_test)

test_accuracy = accuracy_score(y_true=y_test, y_pred=preds)
test_accuracy


In [None]:
import shap

# Load the JavaScript needed by the interactive SHAP plots.
shap.initjs()

explainer = shap.TreeExplainer(pipeline["classifier"])

# The classifier was fitted on the output of the pipeline's "scale" step, so its
# split thresholds live in standardised feature space. The explainer therefore has
# to see the same scaled inputs; feeding it the raw X_train would evaluate the trees
# on values they were never trained on and yield meaningless SHAP values.
X_train_scaled = pipeline["scale"].transform(X_train)
shap_values = explainer.shap_values(X_train_scaled)



In [None]:
# Build a DataFrame from X_train under the short name `df` (used by the force plot below).
df = pd.DataFrame(X_train)

In [None]:
# shap_values is indexed by row position, whereas the training frame still carries
# the shuffled index labels of the original data. The feature row must therefore be
# fetched by position (.iloc); a label lookup (.loc) would return a different
# observation or fail with KeyError.
sample_pos = 4776
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_pos],
    features=df.iloc[sample_pos],
    feature_names=X.columns,
)

In [None]:
# Distribution of SHAP values per feature, drawn as violins.
shap.summary_plot(
    shap_values,
    features=X_train,
    feature_names=X_train.columns,
    plot_type="violin",
    axis_color="#FFFFFF",
)

In [None]:
# Same summary rendered as a bar chart.
shap.summary_plot(
    shap_values,
    features=X_train,
    feature_names=X_train.columns,
    plot_type="bar",
)

In [None]:
# Same summary rendered as a dot plot.
shap.summary_plot(
    shap_values,
    features=X_train,
    feature_names=X_train.columns,
    plot_type="dot",
)

In [None]:
# A single row of SHAP values must be paired with the feature values of that same
# row. Passing the whole training frame as `features` lets the plot annotate the
# path with values from a different observation, so the matching row is selected
# by position here.
sample_pos = 4776
shap.decision_plot(
    explainer.expected_value,
    shap_values=shap_values[sample_pos],
    features=X_train.iloc[sample_pos],
    feature_names=list(X.columns),
)

In [None]:
# Put the SHAP values into a DataFrame (default integer column labels) and preview it.
df_shap = pd.DataFrame(data=shap_values)
df_shap

In [None]:
# List the column labels of the training features.
X_train.columns

In [None]:
# Name prefixes of the one-hot encoded wind-direction columns.
prefixes = [
    "WindGustDir",
    "WindDir9am",
    "WindDir3pm",
]


In [None]:
# Label the SHAP columns with the column names of X_train, then preview the frame.
df_shap.columns = X_train.columns
df_shap

In [None]:
# For every categorical prefix collect its one-hot columns. get_dummies names them
# "<prefix>_<category>", so the names are matched on that exact start; a substring
# search would also capture any column that merely contains the prefix somewhere.
to_merge = [
    (key, [name for name in X_train.columns if name.startswith(f"{key}_")])
    for key in prefixes
]
to_merge

In [None]:
# List the current column labels of the SHAP frame.
df_shap.columns

In [None]:
# Replace each group of one-hot SHAP columns by one column holding their row-wise
# total. The vectorised DataFrame.sum is used instead of applying Python's built-in
# sum row by row, which is needlessly slow on a frame of this size.
for key, cols in to_merge:
    df_shap[key] = df_shap[cols].sum(axis=1)
    df_shap = df_shap.drop(columns=cols)

df_shap

In [None]:
def undummify(df, prefix_sep="_"):
    """Collapse one-hot encoded columns back into one categorical column per prefix.

    Column names containing ``prefix_sep`` are treated as dummy columns named
    ``<prefix><prefix_sep><category>``. For every prefix, the category of the
    column holding the row-wise maximum becomes the value of the rebuilt column.
    Columns whose name does not contain ``prefix_sep`` are passed through as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.
    prefix_sep : str, default "_"
        Separator between the prefix and the category in dummy column names.

    Returns
    -------
    pandas.DataFrame
        One column per prefix or plain column, ordered by first appearance in
        ``df`` and sharing its index. A frame without columns yields an empty
        frame with the same index.
    """
    # prefix -> True when the columns under that prefix are dummies to collapse.
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            # Select by the exact "<prefix><sep>" start. A substring filter would
            # also pull in columns of other prefixes that merely contain this one
            # (e.g. prefix "b" matching "ab_x") and corrupt the result.
            stem = col + prefix_sep
            dummy_cols = [name for name in df.columns if name.startswith(stem)]
            undummified = (
                df[dummy_cols]
                .idxmax(axis=1)
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    if not series_list:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(index=df.index)
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

In [None]:
# Collapse the one-hot columns of X_train back into single columns and preview the result.
X_train_undummified = undummify(X_train)
X_train_undummified

In [None]:
# summary_plot documents its SHAP values argument as a NumPy matrix, so the merged
# frame is converted explicitly instead of being passed as a DataFrame.
# NOTE(review): assumes df_shap's columns are in the same order as
# X_train_undummified's columns — confirm before relying on the feature labels.
shap.summary_plot(
    df_shap.to_numpy(),
    features=X_train_undummified,
    feature_names=X_train_undummified.columns,
    plot_type="bar",
)

---
# <div style="color: magenta">Ćwiczenie - stworzyć rozwiązanie klasyfikacyjne dla zbioru danych Covertype.</div>
    
- ## Wyeliminować zbędne kolumny, 
- ## przeprowadzić Feature Engineering, 
- ## zbudować pipeline, 
- ## wytrenować model, 
- ## skorzystać z Grid Search do optymalizacji Hiperparametrów, 
- ## wyjaśnić model korzystając z Shapley Values. 

In [None]:
from sklearn import datasets

https://archive.ics.uci.edu/ml/datasets/Covertype

In [None]:
# Fetch the Covertype dataset via scikit-learn, requesting pandas objects (as_frame=True).
cover = datasets.fetch_covtype(as_frame=True)

In [None]:
# Print the description text stored under the "DESCR" key of the fetched dataset.
print(cover.DESCR)

In [None]:
# Display the feature table of the fetched dataset.
cover.data

In [None]:
# Display the target values of the fetched dataset.
cover.target