In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

# Column names used as model features, in the order they appear in the
# feature matrix and on the importance plots.
stats = [
    "Total",
    "HP",
    "Attack",
    "Defense",
    "Sp. Atk",
    "Sp. Def",
    "Speed",
]

In [None]:
# Load the raw table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Pokemon.csv")

In [None]:
# Feature matrix: the `stats` columns, in list order, as a plain NumPy array.
X = df.loc[:, stats].to_numpy()

# Target vector: the "Legendary" column converted to integers.
y = df.loc[:, "Legendary"].to_numpy().astype(int)

# Echo both arrays as the cell output.
X, y

In [None]:
# One stratified shuffle split with a fixed seed; 70% of the rows are assigned
# to the training side.
sss = StratifiedShuffleSplit(
    n_splits=1,
    train_size=0.7,
    random_state=42,
)

# Only one split is generated, so take the first (and only) index pair.
train_idx, test_idx = next(sss.split(X, y))

In [None]:
# Materialise the training partition from the split indices.
X_train = X[train_idx]
y_train = y[train_idx]

# Materialise the held-out partition from the split indices.
X_test = X[test_idx]
y_test = y[test_idx]

In [None]:
# Baseline: logistic regression with library defaults, fitted on the training
# partition. `fit` returns the estimator itself, so the chain yields the model.
logreg = LogisticRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
logreg

In [None]:
# Score of the logistic regression on the data it was fitted on.
logreg.score(X=X_train, y=y_train)

In [None]:
# Score of the logistic regression on the held-out partition.
logreg.score(X=X_test, y=y_test)

In [None]:
# Depth-limited random forest with a fixed seed, fitted on the training
# partition (`fit` returns the estimator, so the chain yields the model).
rf = RandomForestClassifier(max_depth=4, random_state=42).fit(X_train, y_train)

# Print the training score, then the held-out score, one per line.
print(
    rf.score(X_train, y_train),
    rf.score(X_test, y_test),
    sep="\n",
)

In [None]:
# Adapted from:
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
# Forest-level importances, plus the spread of the per-tree importances, which
# is used as the error bar for each feature.
importances = rf.feature_importances_
per_tree_importances = np.array(
    [tree.feature_importances_ for tree in rf.estimators_]
)
std = per_tree_importances.std(axis=0)

# Label each importance with its feature name so the bars are readable.
forest_importances = pd.Series(data=importances, index=stats)

fig, ax = plt.subplots(figsize=(8, 5), tight_layout=True)
forest_importances.plot(kind="bar", yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")

In [None]:
# Permutation importance of the fitted forest, measured on the held-out
# partition: each feature column is shuffled `n_repeats` times and the change
# in the model's score is recorded.
result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)

# Plot the permutation means themselves. Reusing the MDI series
# (`forest_importances`) here would draw impurity-based bar heights under a
# "Mean accuracy decrease" label, with error bars that belong to a different
# quantity. A separate name is used so the MDI series is not overwritten and
# the cell stays safe to re-run.
permutation_importances = pd.Series(result.importances_mean, index=stats)

fig, ax = plt.subplots(figsize=(8, 5), tight_layout=True)
permutation_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")