Import the database

In [14]:
import sqlite3
import pandas as pd
import os

db_path = os.path.join('..', 'database', 'smite_players.db')
conn = sqlite3.connect(db_path)
c = conn.cursor()

# Execute a query to retrieve all records
c.execute("SELECT * FROM combined_data5")

# Get the column names
column_names = [description[0] for description in c.description]

# Fetch all the results
rows = c.fetchall()

# Check if there are any results
print(f"Number of records retrieved: {len(rows)}")

# Convert the results into a pandas DataFrame
if rows:
    df = pd.DataFrame(rows, columns=column_names)
    print("DataFrame created successfully.")
else:
    print("No records found.")


Number of records retrieved: 262314
DataFrame created successfully.


## Trying differents models

Preprocessing

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping

# Function to split and ensure each list has 3 elements
def split_tags(tags):
    split = tags.split(', ') if pd.notna(tags) else [None, None, None]
    while len(split) < 3:
        split.append(None)
    return split[:3]

# Step 1: Split the tag columns into 3 components
df[['character_tag_1', 'character_tag_2', 'character_tag_3']] = pd.DataFrame(df['character_tags'].apply(split_tags).tolist(), index=df.index)
df[['enemy_1_tag_1', 'enemy_1_tag_2', 'enemy_1_tag_3']] = pd.DataFrame(df['enemy_1_tags'].apply(split_tags).tolist(), index=df.index)
df[['enemy_2_tag_1', 'enemy_2_tag_2', 'enemy_2_tag_3']] = pd.DataFrame(df['enemy_2_tags'].apply(split_tags).tolist(), index=df.index)
df[['enemy_3_tag_1', 'enemy_3_tag_2', 'enemy_3_tag_3']] = pd.DataFrame(df['enemy_3_tags'].apply(split_tags).tolist(), index=df.index)

# Step 2: Encode categorical variables
categorical_columns = [
    'character_name', 'enemy_1_character', 'enemy_2_character', 'enemy_3_character',
    'character_class_distance', 'character_type_dmg', 'character_type_dmgform',
    'character_tag_1', 'character_tag_2', 'character_tag_3',  
    'enemy_1_class_distance', 'enemy_1_type_dmg', 'enemy_1_type_dmgform',
    'enemy_1_tag_1', 'enemy_1_tag_2', 'enemy_1_tag_3',  
    'enemy_2_class_distance', 'enemy_2_type_dmg', 'enemy_2_type_dmgform',
    'enemy_2_tag_1', 'enemy_2_tag_2', 'enemy_2_tag_3',  
    'enemy_3_class_distance', 'enemy_3_type_dmg', 'enemy_3_type_dmgform',
    'enemy_3_tag_1', 'enemy_3_tag_2', 'enemy_3_tag_3'
]

# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Read processed items from file
with open("items_processed.txt", "r", encoding="utf-8") as file:
    items_procesados = [line.strip() for line in file]

# Step 1: Extract all unique items
all_items = df['build'].str.split(', ').explode().unique()

# Create binary columns only for permitted items
for item in all_items:
    df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)

# Step 2: Define predictor variables (X) and target variable (y)
X = df_encoded[df_encoded.columns[df_encoded.columns.str.startswith(tuple(categorical_columns))]]
df['damage'] = pd.to_numeric(df['damage'].str.replace(',', ''), errors='coerce')
df['damage_taken'] = pd.to_numeric(df['damage_taken'].str.replace(',', ''), errors='coerce')
df['damage_mitigated'] = pd.to_numeric(df['damage_mitigated'].str.replace(',', ''), errors='coerce')

# Calculate build score
df['build_score'] = df['damage']

build_columns = df.columns[df.columns.isin(all_items)]

# Add build columns to predictor variables X
X = pd.concat([X, df[build_columns]], axis=1)

  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] = df['build'].apply(lambda x: 1 if item in x.split(', ') else 0)
  df[item] =

Training

NN

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Input, LeakyReLU
from tensorflow.keras.optimizers import Adam, RMSprop, Nadam
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasRegressor
from sklearn.metrics import make_scorer
from bayes_opt import BayesianOptimization
import tensorflow as tf

# Define the RMSE function
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

# Create the RMSE scorer
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, df['build_score'], test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# List of optimizers and activation functions more stable for regression
optimizerL = ['Adam', 'RMSprop', 'Nadam']
activationL = ['relu', 'tanh', 'selu', 'elu', LeakyReLU]

# Set the hyperparameter search space
params_nn2 = {
    'neurons': (10, 100),
    'activation': (0, 4),  # Corresponds to the length of activationL
    'optimizer': (0, 2),   # Corresponds to the length of optimizerL
    'learning_rate': (0.0001, 0.01),  # Conservative range for stability
    'batch_size': (512, 2048),
    'epochs': (20, 100),
    'layers1': (1, 3),
    'layers2': (1, 3),
    'normalization': (0, 1),
    'dropout': (0, 1),
    'dropout_rate': (0, 0.3)
}

# Function to build the neural network model using RMSE
def nn_cl_bo2(neurons, activation, optimizer, learning_rate, batch_size, epochs, layers1, layers2, normalization, dropout, dropout_rate):
    # Round hyperparameters
    neurons = int(round(neurons))
    activation = activationL[int(round(activation))]
    optimizer_name = optimizerL[int(round(optimizer))]
    batch_size = int(round(batch_size))
    epochs = int(round(epochs))
    layers1 = int(round(layers1))
    layers2 = int(round(layers2))
    
    # Model building function
    def nn_cl_fun():
        # Create an instance of the optimizer for each iteration
        if optimizer_name == 'Adam':
            opt = Adam(learning_rate=learning_rate)
        elif optimizer_name == 'RMSprop':
            opt = RMSprop(learning_rate=learning_rate)
        elif optimizer_name == 'Nadam':
            opt = Nadam(learning_rate=learning_rate)

        nn = Sequential()
        nn.add(Input(shape=(X_train.shape[1],)))
        nn.add(Dense(neurons, activation=activation))
        if normalization > 0.5:
            nn.add(BatchNormalization())
        for _ in range(layers1):
            nn.add(Dense(neurons, activation=activation))
        if dropout > 0.5:
            nn.add(Dropout(dropout_rate))
        for _ in range(layers2):
            nn.add(Dense(neurons, activation=activation))
        nn.add(Dense(1, activation='linear'))  # Linear output for regression
        nn.compile(loss='mean_squared_error', optimizer=opt, metrics=[tf.keras.metrics.RootMeanSquaredError()])
        return nn
    
    # Configure EarlyStopping
    es = EarlyStopping(monitor='val_root_mean_squared_error', mode='min', verbose=1, patience=20)
    
    # Define the model as KerasRegressor for regression using build_fn
    nn = KerasRegressor(build_fn=nn_cl_fun, epochs=epochs, batch_size=batch_size, verbose=1, callbacks=[es])
    kfold = KFold(n_splits=5, shuffle=True, random_state=123)
    
    # Perform cross-validation
    score = cross_val_score(
    nn, X_train, y_train,
    scoring=rmse_scorer,
    cv=kfold,
    fit_params={'validation_split': 0.2, 'callbacks': [es]}
      ).mean()
    return -score  # Invert the sign for maximization in Bayesian Optimization

# Run Bayesian Optimization
nn_bo = BayesianOptimization(nn_cl_bo2, params_nn2, random_state=111)
nn_bo.maximize(init_points=25, n_iter=4)


|   iter    |  target   | activa... | batch_... |  dropout  | dropou... |  epochs   |  layers1  |  layers2  | learni... |  neurons  | normal... | optimizer |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m1.858e+04[39m | [39m4.897    [39m | [39m335.3    [39m | [39m0.4361   [39m | [39m0.2308   [39m | [39m43.63    [39m | [39m1.298    [39m | [39m1.045    [39m | [39m0.426    [39m | [39m31.48    [39m | [39m0.3377   [39m | [39m6.935    [39m |


KeyboardInterrupt: 

In [None]:
# Extract the best parameters from the optimization
params_nn_ = nn_bo.max['params']

# Lists of activation functions and optimizers used in the optimization
activationL = ['relu', 'tanh', 'selu', 'elu', LeakyReLU]
optimizerL = ['Adam', 'RMSprop', 'Nadam']

# Round and adjust the hyperparameters
learning_rate = params_nn_['learning_rate']
params_nn_['activation'] = activationL[round(params_nn_['activation'])]
params_nn_['batch_size'] = round(params_nn_['batch_size'])
params_nn_['epochs'] = round(params_nn_['epochs'])
params_nn_['layers1'] = round(params_nn_['layers1'])
params_nn_['layers2'] = round(params_nn_['layers2'])
params_nn_['neurons'] = round(params_nn_['neurons'])

# Optimizer dictionary with the best learning rate
optimizerD = {
    'Adam': Adam(learning_rate=learning_rate),
    'RMSprop': RMSprop(learning_rate=learning_rate),
    'Nadam': Nadam(learning_rate=learning_rate)
}

# Select the optimizer based on the stored name
params_nn_['optimizer'] = optimizerD[optimizerL[round(params_nn_['optimizer'])]]

# Display the best parameters configured
print("Best parameters obtained by Bayesian optimization:")
for param, value in params_nn_.items():
    print(f"{param}: {value}")


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasRegressor  # Changed to KerasRegressor for regression

# Define the model function using the best parameters
def nn_cl_fun():
    nn = Sequential()
    nn.add(Input(shape=(X_train.shape[1],)))  # Adjust according to the number of features in X_train
    nn.add(Dense(params_nn_['neurons'], activation=params_nn_['activation']))
    
    # Add Batch Normalization if the normalization parameter requires it
    if params_nn_['normalization'] > 0.5:
        nn.add(BatchNormalization())
    
    # Add hidden layers as specified in layers1
    for i in range(params_nn_['layers1']):
        nn.add(Dense(params_nn_['neurons'], activation=params_nn_['activation']))
    
    # Add Dropout if the dropout parameter requires it
    if params_nn_['dropout'] > 0.5:
        nn.add(Dropout(params_nn_['dropout_rate']))
    
    # Add hidden layers as specified in layers2
    for i in range(params_nn_['layers2']):
        nn.add(Dense(params_nn_['neurons'], activation=params_nn_['activation']))
    
    # Output layer, adjusted for regression
    nn.add(Dense(1, activation='linear'))  # Linear output for regression
    
    # Compile the model with the RMSE metric
    nn.compile(loss='mean_squared_error', optimizer=params_nn_['optimizer'], metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return nn

# Configure EarlyStopping
es = EarlyStopping(monitor='val_root_mean_squared_error', mode='min', verbose=1, patience=20, restore_best_weights=True)

# Define the model with KerasRegressor for training
nn = KerasRegressor(model=nn_cl_fun, epochs=params_nn_['epochs'], batch_size=params_nn_['batch_size'], verbose=1, callbacks=[es])

# Train the model on training and validation sets
nn.fit(X_train, y_train, validation_data=(X_test, y_test), verbose=1)


In [34]:
# Save the trained model
nn.save('nn_build_score_model.h5')



CatBoosting

In [5]:
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['build_score'], test_size=0.2, random_state=42)

# Further split the training set into training and validation sets
X_train_train, X_train_validation, y_train_train, y_train_validation = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 25% of 80% is 20% of the total

# Define the objective function for Optuna
def objective(trial):
    # Suggest values for the hyperparameters
    iterations = trial.suggest_int('iterations', 500, 2000)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 0.1)
    depth = trial.suggest_int('depth', 4, 10)
    l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1.0, 10.0)
    bagging_temperature = trial.suggest_float('bagging_temperature', 0.0, 1.0)
    
    # Create the CatBoost model with the suggested hyperparameters
    catboost_model = CatBoostRegressor(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        l2_leaf_reg=l2_leaf_reg,
        bagging_temperature=bagging_temperature,
        random_state=42,
        verbose=0
    )
    
    # Train the model on the training set
    catboost_model.fit(X_train_train, y_train_train)
    
    # Make predictions on the validation set
    y_pred_val = catboost_model.predict(X_train_validation)
    
    # Calculate the RMSE on the validation set
    rmse = mean_squared_error(y_train_validation, y_pred_val, squared=False)
    
    return rmse

# Create an Optuna study to minimize RMSE
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Retrieve the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Train the model with the best hyperparameters on the original full training set
best_catboost_model = CatBoostRegressor(**best_params, random_state=42, verbose=0)
best_catboost_model.fit(X_train, y_train)

# Make predictions with the best model on the test set
y_pred_best = best_catboost_model.predict(X_test)

# Evaluate the model's performance on the test set
rmse_best = mean_squared_error(y_test, y_pred_best, squared=False)
print(f"Root Mean Squared Error CatBoost (tuned model): {rmse_best}")



[I 2024-10-13 15:57:09,115] A new study created in memory with name: no-name-2aa2ac5c-53dd-475e-b73a-a7c23d7b9316
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 0.1)
[I 2024-10-13 15:57:26,101] Trial 0 finished with value: 6348.636099331196 and parameters: {'iterations': 688, 'learning_rate': 0.004362539893743521, 'depth': 5, 'l2_leaf_reg': 3.6785580176922448, 'bagging_temperature': 0.04988310999057444}. Best is trial 0 with value: 6348.636099331196.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 0.1)
[I 2024-10-13 15:57:54,591] Trial 1 finished with value: 5978.76707134543 and parameters: {'iterations': 856, 'learning_rate': 0.006788169013473108, 'depth': 7, 'l2_leaf_reg': 8.99372247907845, 'bagging_temperature': 0.7147870160999024}. Best is trial 1 with value: 5978.76707134543.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 0.1)
[I 2024-10-13 15:59:11,562] Trial 2 finished with value: 5560.29322423045 and parameters: {'iteratio

Mejores hiperparámetros: {'iterations': 1983, 'learning_rate': 0.09434483707156406, 'depth': 10, 'l2_leaf_reg': 2.542819560694613, 'bagging_temperature': 0.23798239433578128}
Root Mean Squared Error CatBoost (modelo ajustado): 5057.267510624712




Random Forest

In [8]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['build_score'], test_size=0.2, random_state=42)

# Further split the training set into training and validation sets
X_train_train, X_train_validation, y_train_train, y_train_validation = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    # Suggest values for the hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 4, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])  # Adjustment here

    # Create the RandomForest model with the suggested hyperparameters
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )
    
    # Train the model on the training set
    rf_model.fit(X_train_train, y_train_train)
    
    # Make predictions on the validation set
    y_pred_val = rf_model.predict(X_train_validation)
    
    # Calculate RMSE on the validation set
    rmse = mean_squared_error(y_train_validation, y_pred_val, squared=False)
    
    return rmse

# Create an Optuna study for minimizing RMSE
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Retrieve the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Train the model with the best hyperparameters on the full original training set
best_rf_model = RandomForestRegressor(**best_params, random_state=42)
best_rf_model.fit(X_train, y_train)

# Make predictions with the best model on the test set
y_pred_best = best_rf_model.predict(X_test)

# Evaluate the model
rmse_best = mean_squared_error(y_test, y_pred_best, squared=False)
print(f"Root Mean Squared Error Random Forest (tuned model): {rmse_best}")



[I 2024-10-13 19:12:56,645] A new study created in memory with name: no-name-9bd57ba5-0aed-4a53-9663-7f4d1c04004e
[I 2024-10-13 19:15:00,543] Trial 0 finished with value: 7313.929923720392 and parameters: {'n_estimators': 884, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 0 with value: 7313.929923720392.
[I 2024-10-13 19:15:36,158] Trial 1 finished with value: 7508.545435191777 and parameters: {'n_estimators': 306, 'max_depth': 16, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 0 with value: 7313.929923720392.
[I 2024-10-13 20:04:58,445] Trial 2 finished with value: 6241.7001646774415 and parameters: {'n_estimators': 644, 'max_depth': 9, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': None}. Best is trial 2 with value: 6241.7001646774415.
[I 2024-10-13 20:06:17,206] Trial 3 finished with value: 7594.256849399809 and parameters: {'n_estimators': 763, 'max_depth': 6, 'min_sample

KeyboardInterrupt: 