# In [2]:
# import important libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import category_encoders as ce 
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import eli5
from eli5.sklearn import PermutationImportance
from joblib import dump



# load the dndstats table; the remote file is tab-separated
df = pd.read_csv(
    'https://raw.githubusercontent.com/oganm/dndstats/master/docs/uniqueTable.tsv',
    delimiter='\t',
)

# create a has spells feature

# create a function that returns true if there is a spell or false if there is not
def check_spells(item):
    """Return True when ``item`` holds spell data, False when it is missing.

    A missing cell arrives as ``np.nan``, which is a ``float``, while a cell
    that lists spells arrives as a string. Any float therefore means
    "no spells" and every other value means "spells found".
    """
    return not isinstance(item, float)

# build the has_spells column by running check_spells on every processedSpells value
df['has_spells'] = df['processedSpells'].map(check_spells)



# create a has feat feature

# create a function that returns true if it has a feat and false if there is not
def check_feats(item):
    """Return True when ``item`` holds feat data, False when it is missing.

    Float values (which is what ``np.nan`` is) are treated as "no feats";
    every other value counts as "has a feat".
    """
    # guard clause: a float here stands for a missing entry
    if isinstance(item, float):
        return False
    return True

# build the has_feats column by running check_feats on every feats value
df['has_feats'] = df['feats'].map(check_feats)


# HP per level feature: the HP column divided element-wise by the level column
df['HP_per_level'] = df['HP'] / df['level']



# columns to discard, grouped by the reason they are removed
drop_columns_variance = ['name', 'date', 'day']  # unneeded: high randomness or high variance
drop_columns_leak = ['subclass', 'class']  # leakage
drop_columns_dup = ['race', 'alignment', 'good', 'lawful', 'levelGroup']  # duplication

# remove the three groups one after another
df = (
    df.drop(columns=drop_columns_variance)
    .drop(columns=drop_columns_leak)
    .drop(columns=drop_columns_dup)
)

# check the head (the result is not stored; only useful for interactive inspection)
df.head()



# the column to predict, and the 12 basic classes accepted as labels
target = 'justClass'
targetAllowed = [
    'Fighter', 'Rogue', 'Cleric', 'Barbarian', 'Paladin', 'Ranger',
    'Sorcerer', 'Wizard', 'Monk', 'Druid', 'Bard', 'Warlock',
]


# keep only the rows whose target value is one of the allowed classes
df = df[df[target].isin(targetAllowed)]



# candidate features: every column except the target
train_features = df.drop(columns=[target])

# all numeric columns are kept
numeric_features = list(train_features.select_dtypes(include='number').columns)

# non-numeric columns are kept only when they have at most 75 distinct values
cardinality = train_features.select_dtypes(exclude='number').nunique()
categorical_features = cardinality.loc[cardinality <= 75].index.tolist()

# final feature list: numeric columns first, then the low-cardinality ones
features = [*numeric_features, *categorical_features]



# 80/20 train/validation split, stratified on the target with a fixed seed
train, val = train_test_split(
    df,
    train_size=0.80,
    test_size=.20,
    random_state=42,
    stratify=df[target],
)



# feature matrices and target vectors for each partition
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]



# build the preprocessing + model pipeline and fit it on the training split
# steps: one-hot encode -> mean-impute missing values -> standardize -> XGBoost classifier
# NOTE(review): y_train holds class-name strings; recent XGBoost releases may
# require integer-encoded labels - confirm against the installed xgboost version.
pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    XGBClassifier(n_estimators=1000, n_jobs=-1),
).fit(X_train, y_train)  # fit returns the pipeline itself, so the name binds the fitted object



# serialize the fitted pipeline to 'pipeline.joblib' in the working directory
dump(value=pipeline, filename='pipeline.joblib')

['pipeline.joblib']