In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load iris dataset
iris = load_iris()

# Create a pandas dataframe with features and target
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

# Choose target column
target_col = 'target'

# Decide on the task and metric
# For this example, let's assume we're doing classification and will use accuracy
task = 'classification'
metric = 'accuracy'

# Split the data into train, validation, and test sets
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.25, random_state=42)

# Clean the data and exclude any irrelevant features
# In this example, we will not exclude any features and assume the data is clean
features = iris.feature_names

# Get a summary of the numerical features
train[features].describe()

# Get the most common class in the target column
most_common_class = train[target_col].mode()[0]

# Calculate the frequency of the most common class in the training set
frequency = sum(train[target_col] == most_common_class) / len(train)

# Calculate the accuracy score of guessing the most common class
guessing_score = frequency
print('Accuracy score of guessing:', guessing_score)

# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the model object
model = LogisticRegression(random_state=42)

# Fit the model on the training data
model.fit(train[features], train[target_col])

# Make predictions on the validation data
val_predictions = model.predict(val[features])

# Calculate the accuracy score on the validation data
val_score = accuracy_score(val[target_col], val_predictions)

print('Accuracy score of the logistic regression model:', val_score)

Accuracy score of guessing: 0.34444444444444444
Accuracy score of the logistic regression model: 0.9666666666666667


In [2]:
from xgboost import XGBClassifier

# Create the XGBoost model object
model_xgb = XGBClassifier(n_estimators=100, max_depth=3, random_state=42)

# Fit the model on the training data
model_xgb.fit(train[features], train[target_col])

# Make predictions on the validation data
val_predictions_xgb = model_xgb.predict(val[features])

# Calculate the accuracy score on the validation data
val_score_xgb = accuracy_score(val[target_col], val_predictions_xgb)

print('Accuracy score of the XGBoost model:', val_score_xgb)


Accuracy score of the XGBoost model: 0.9


In [5]:
import eli5
from eli5.sklearn import PermutationImportance

# Create the permutation importance object
perm = PermutationImportance(model, random_state=42)

# Fit the object on the validation data
perm.fit(val[features], val[target_col])

# Print the feature importances
eli5.show_weights(perm, feature_names=features)


Weight,Feature
0.5333  ± 0.0422,petal length (cm)
0.0800  ± 0.0327,petal width (cm)
0.0200  ± 0.0327,sepal length (cm)
0.0067  ± 0.0267,sepal width (cm)
