In [12]:
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import warnings
warnings.filterwarnings(action='ignore')

In [13]:
class_to_int = {
     "tiny": 1,
     "small" : 2,
     "medium" : 3,
     "large" : 4,
     "huge" : 5,
     "gargantuan" : 6,
}
def preprocess_df(df_orig, flag_type, flag_unique, flag_else):
    df = df_orig.copy()
    for stat in ['Str', 'Dex', 'Con', 'Int', 'Wis', 'Cha']:
        df[stat] = (df[f'{stat} Mod'] + df[f'{stat} Save']) // 2 + df['PB']
        df.drop(columns=[f'{stat} Mod', f'{stat} Save', f'{stat} Prof'], inplace=True)
    if flag_unique:
        df = df[(df['Legendary'] != 'N') | (df['Category'] != 'unique')]
    df.drop(columns = ['Book', 'Page', 'Category','XP', 'PB', 'AC', 'HP', 'AB', 'DPR', 'Legendary',
                       'Restrained', 'Deafened', 'Frightened', 'Petrified', 'adj DPR', 'adj AB'], inplace=True)
    df.rename(columns={'adj AC': 'AC', 'adj HP': 'HP'}, inplace=True)
    df['Save DC'] = df['Save DC'].fillna(1)
    if flag_else==2:
        df['Size'] = df['Size'].map(class_to_int)
    else:
        df = pd.get_dummies(df, columns=['Size'], dtype=int, drop_first=flag_else)
    df['Saves quantity'] = df['AC adjustments'].apply(func=extract_saves_quantity)
    df['Saves quantity'] = df['Saves quantity'].fillna(0)
    mlb = MultiLabelBinarizer()
    df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('AC adjustments')), index=df.index, columns=mlb.classes_))
    df = pd.get_dummies(df, columns=['Type'], dtype=int, drop_first=flag_type)
    df = df.convert_dtypes()
    return df
def extract_saves_quantity(dflist):
    for n in range(1,7):
        quan = f'{n} saves'
        if quan in dflist:
            dflist.remove(quan)
            return n

In [14]:
df = pd.read_csv('/kaggle/input/monsters-dataset/monsters (1).csv', converters={'AC adjustments': literal_eval})
df

Unnamed: 0,Book,Page,Type,Size,Category,Legendary,CR,PB,XP,Str Mod,...,AC,adj AC,HP,adj HP,AC adjustments,AB,adj AB,DPR,adj DPR,Save DC
0,SKT,188.0,humanoid,medium,generic,N,0.125,2,25.0,0.0,...,12.0,12.0,6.0,6.0,[],4.0,4.0,5.50000,6.666666,
1,MM,12.0,humanoid,medium,generic,N,0.250,2,50.0,0.0,...,12.0,12.0,13.5,13.5,[],4.0,4.0,5.50000,6.666666,
2,SAiS:BAM,8.0,plant,large,generic,N,3.000,2,700.0,4.0,...,16.0,16.0,75.0,75.0,[],,,,,
3,SAiS:BAM,9.0,plant,medium,generic,N,2.000,2,450.0,1.0,...,14.0,14.0,52.0,52.0,[],,,,,
4,SAiS:BAM,9.0,plant,medium,generic,N,2.000,2,450.0,2.0,...,15.0,15.0,38.5,38.5,[],,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1899,VRGtR,255.0,undead,medium,generic,N,4.000,2,1100.0,3.0,...,10.0,11.0,78.0,85.0,[],5.0,5.0,31.00000,31.000000,12.0
1900,ToA,241.0,monstrosity,small,generic,N,0.500,2,100.0,1.0,...,10.0,12.0,27.0,27.0,"[magic resistance, shield]",3.0,3.0,8.00000,8.000000,
1901,MPMotM,282.0,fiend,large,unique,LL,23.000,7,50000.0,6.0,...,18.0,22.0,304.0,548.8,"[3 saves, magic resistance]",13.0,13.0,144.00000,144.000000,
1902,MToF,157.0,fiend,large,legacy,LL,23.000,7,50000.0,6.0,...,18.0,22.0,304.0,548.8,"[3 saves, magic resistance]",13.0,13.0,168.00000,168.000000,


In [15]:
params = [(0,0,0), (0,0,1), (0,0,2), (0,1,0), (0,1,1), (0,1,2), (1,0,0), (1,0,1), (1,0,2), (1,1,0),
        (1,1,1), (1,1,2)]
best_mse = 1000000
best_setup = ''
for param in params:
    print('===================================')
    arr1 = ['one-hot Type;', 'dummy Type;']
    arr2 = ['all uniq;', 'skip non-L uniq;']
    arr3 = ['one-hot Size;', 'dummy Size;', 'label Size;']
    print(arr1[param[0]], arr2[param[1]], arr3[param[2]])
    df_clean = preprocess_df(df, *param)
    train, test = train_test_split(df_clean, test_size=0.2, random_state=145)
    preprocessors = [StandardScaler(), MinMaxScaler()]
    models = [LinearRegression(), SVR(kernel='linear'), SVR(kernel='poly'), SVR(kernel='rbf'), DecisionTreeRegressor(),
              RandomForestRegressor(), GradientBoostingRegressor(), KNeighborsRegressor(), xgb.XGBRegressor()]
    for prep in preprocessors:
        print('--------------------------------------')
        print(f'Using preprocessor {prep}:')
        for model in models:
            y_train = train['CR']
            x_train = prep.fit_transform(train.drop('CR', axis = 1))
            y_test = test['CR']
            x_test = prep.transform(test.drop('CR', axis = 1))
            model.fit(x_train, y_train)
            y_pred = model.predict(x_test)
            if mean_squared_error(y_pred, y_test) < best_mse:
                best_mse = mean_squared_error(y_pred, y_test)
                best_setup = f'Model: {model}. Preprocessor: {prep}. {arr1[param[0]]} {arr2[param[1]]} {arr3[param[2]]}'
            print(f'Model: {model}')
            print(f'MSE: {mean_squared_error(y_pred, y_test)}')
            print('--------------------------------------')
            
            
print(best_mse, best_setup)

one-hot Type; all uniq; one-hot Size;
--------------------------------------
Using preprocessor StandardScaler():
Model: LinearRegression()
MSE: 1.8238551344554488
--------------------------------------
Model: SVR(kernel='linear')
MSE: 1.814491557931993
--------------------------------------
Model: SVR(kernel='poly')
MSE: 6.409515579171549
--------------------------------------
Model: SVR()
MSE: 4.833213239249398
--------------------------------------
Model: DecisionTreeRegressor()
MSE: 2.120242782152231
--------------------------------------
Model: RandomForestRegressor()
MSE: 0.9796493538750366
--------------------------------------
Model: GradientBoostingRegressor()
MSE: 1.0250315697039243
--------------------------------------
Model: KNeighborsRegressor()
MSE: 5.433211942257218
--------------------------------------
Model: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, devic

In [16]:
df_xgbr = preprocess_df(df, *params[7])
train, test = train_test_split(df_xgbr, test_size=0.2, random_state=145)
prep = StandardScaler()
model = xgb.XGBRegressor()
param_grid = {
    'n_estimators': [120, 130, 140],
    'max_depth': [5, 6, 7],
    'learning_rate': [0.09, 0.1, 0.11],
    'subsample': [0.825, 0.85, 0.875],
    'colsample_bytree': [0.725, 0.75, 0.775],
    'gamma': [0.125, 0.15, 0.175]
}
y_train = train['CR']
x_train = prep.fit_transform(train.drop('CR', axis = 1))
y_test = test['CR']
x_test = prep.transform(test.drop('CR', axis = 1))
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(x_train, y_train)
print(f'Лучшие параметры: {grid_search.best_params_}')
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
print(f'MSE: {mean_squared_error(y_test, y_pred)}')

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Лучшие параметры: {'colsample_bytree': 0.75, 'gamma': 0.15, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 130, 'subsample': 0.85}
MSE: 0.854264524932721
