In [34]:
import pandas as pd
import numpy as np
from typing import Optional, List, Callable, Any, Union, Dict
from itertools import product
from statistics import mean
from pathlib import Path
import gzip
import os

In [35]:
def read_ds_gzip(path: Optional[Path] = None, ds: str = "TRAIN") -> pd.DataFrame:
    """Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz.
        ds (str, optional): the part to read (TRAIN or TEST), to use when path is None. Defaults to "TRAIN".

    Returns:
        pd.DataFrame:
    """
    with gzip.open(
        f"/kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz"
        if path is None
        else path
    ) as f:
        max_actions = max((len(str(c).split(",")) for c in f.readlines()))
        f.seek(0)
        _names = ["battleneturl", "played_race"] if "TRAIN" in ds else ["played_race"]
        _names.extend(range(max_actions - len(_names)))
        return pd.read_csv(f, names=_names, dtype=str)


def read_ds(path: Optional[Path] = None, ds: str = "TRAIN"):
    """Args:
        path (Optional[Path], optional): the path to read the dataset file. Defaults to /kaggle/input/the-insa-starcraft-2-player-prediction-challenge/{ds}.CSV.gz.
        ds (str, optional): the part to read (TRAIN or TEST), to use when path is None. Defaults to "TRAIN".

    Returns:
        pd.DataFrame:
    """
    with open(
        f"/kaggle/input/train-sc2-keystrokes/{ds}.CSV" if path is None else path
    ) as f:
        max_actions = max((len(str(c).split(",")) for c in f.readlines()))
        f.seek(0)
        _names = ["battleneturl", "played_race"] if "TRAIN" in ds else ["played_race"]
        _names.extend(range(max_actions - len(_names)))
        return pd.read_csv(f, names=_names, dtype=str)

In [36]:
features_train = read_ds_gzip(
    Path(
        os.path.abspath(
            "/Users/milofournier/Documents/Work/INSA/OT2/data mining/in-star-craft-2-player"
        )
    )
    / "TRAIN.CSV.GZ",
    ds="TRAIN",
)
# features_test = read_ds("TEST")
features_train.shape  # , features_test.shape

(3052, 10539)

In [37]:
features_train

Unnamed: 0,battleneturl,played_race,0,1,2,3,4,5,6,7,...,10527,10528,10529,10530,10531,10532,10533,10534,10535,10536
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,s,s,t5,Base,...,,,,,,,,,,
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,s,Base,s,s,Base,s,s,Base,...,,,,,,,,,,
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,Base,s,s,Base,s,s,s,t5,...,,,,,,,,,,
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,http://xx.battle.net/sc2/en/profile/405/1/MMA/,Terran,s,s,s,s,s,hotkey10,hotkey20,hotkey30,...,,,,,,,,,,
3048,http://xx.battle.net/sc2/en/profile/410/1/STBo...,Terran,s,s,hotkey10,s,hotkey20,s,s,hotkey12,...,,,,,,,,,,
3049,http://xx.battle.net/sc2/en/profile/405/1/MMA/,Terran,s,s,s,hotkey10,hotkey20,hotkey30,hotkey40,hotkey50,...,,,,,,,,,,
3050,http://xx.battle.net/sc2/en/profile/410/1/STBo...,Terran,s,s,hotkey10,s,hotkey20,s,s,hotkey12,...,,,,,,,,,,


In [38]:
def first_nan_occurrence(row):
    return row.first_valid_index() if row.isna().all() else row.isna().idxmax()


features_train["first_nan_index"] = features_train.apply(first_nan_occurrence, axis=1)

In [39]:
import pandas as pd
import re


# Function to extract the maximum t value
def max_t_value(row):
    t_values = []
    # Iterate over each column (excluding the first two columns, e.g., 'battleneturl' and 'played_race')
    for value in row[2:]:  # Skip the first two columns
        # Find values starting with 't' followed by digits
        match = re.match(r"t(\d+)", str(value))
        if match:
            t_values.append(
                int(match.group(1))
            )  # Convert to integer and append to the list

    # Return the maximum value found or NaN if no 't' values exist
    return max(t_values, default=None)


# Apply the function across all rows
features_train["max_t_value"] = features_train.apply(max_t_value, axis=1)

In [40]:
features_train["first_nan_index"] = pd.to_numeric(
    features_train["first_nan_index"], errors="coerce"
)

features_train["action_per_sec"] = (
    features_train["first_nan_index"] / features_train["max_t_value"]
)

In [41]:
import pandas as pd

# Data for max_t_value column
data = features_train["max_t_value"].dropna()

# Calculate mean, median, and standard deviation
mean = data.mean()
median = data.median()
std = data.std()

# Calculate 2σ bounds and ensure the lower bound is not negative
lower_bound = max(mean - 2 * std, 0)  # Ensure lower bound is at least 0
upper_bound = mean + 2 * std

# Filter the entire DataFrame based on 'max_t_value' being within the 2σ bounds
filtered_features_train = features_train[
    features_train["max_t_value"].between(lower_bound, upper_bound)
]

filtered_features_train


Unnamed: 0,battleneturl,played_race,0,1,2,3,4,5,6,7,...,10530,10531,10532,10533,10534,10535,10536,first_nan_index,max_t_value,action_per_sec
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,s,s,t5,Base,...,,,,,,,,2563.0,1655.0,1.548640
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,s,Base,s,s,Base,s,s,Base,...,,,,,,,,2408.0,1655.0,1.454985
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,1550.0,1010.0,1.534653
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Protoss,Base,s,s,Base,s,s,s,t5,...,,,,,,,,1589.0,1005.0,1.581095
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Protoss,Base,s,s,s,Base,s,hotkey30,hotkey00,...,,,,,,,,727.0,540.0,1.346296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,http://xx.battle.net/sc2/en/profile/405/1/MMA/,Terran,s,s,s,s,s,hotkey10,hotkey20,hotkey30,...,,,,,,,,2351.0,880.0,2.671591
3048,http://xx.battle.net/sc2/en/profile/410/1/STBo...,Terran,s,s,hotkey10,s,hotkey20,s,s,hotkey12,...,,,,,,,,3686.0,1175.0,3.137021
3049,http://xx.battle.net/sc2/en/profile/405/1/MMA/,Terran,s,s,s,hotkey10,hotkey20,hotkey30,hotkey40,hotkey50,...,,,,,,,,3290.0,1170.0,2.811966
3050,http://xx.battle.net/sc2/en/profile/410/1/STBo...,Terran,s,s,hotkey10,s,hotkey20,s,s,hotkey12,...,,,,,,,,2459.0,580.0,4.239655


In [42]:
filtered_features_train["played_protoss"] = (
    filtered_features_train["played_race"] == "Protoss"
).astype(int)
filtered_features_train["played_terran"] = (
    filtered_features_train["played_race"] == "Terran"
).astype(int)
filtered_features_train["played_zerg"] = (
    filtered_features_train["played_race"] == "Zerg"
).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_features_train["played_protoss"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_features_train["played_terran"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_features_train["played_zerg"] = (


In [43]:
filtered_features_train = filtered_features_train.drop(columns=["played_race"])


In [44]:
import re

# Pre-compile regular expressions to save time on repeated pattern matching
patterns = {
    "total_creation_hotkeys_count_per_sec": r"hotkey\d+0",
    "creation_hotkey_1": r"hotkey10",
    "creation_hotkey_2": r"hotkey20",
    "creation_hotkey_3": r"hotkey30",
    "creation_hotkey_4": r"hotkey40",
    "creation_hotkey_5": r"hotkey50",
    "creation_hotkey_6": r"hotkey60",
    "creation_hotkey_7": r"hotkey70",
    "creation_hotkey_8": r"hotkey80",
    "creation_hotkey_9": r"hotkey90",
    "creation_hotkey_0": r"hotkey00",
    "total_update_hotkeys_count_per_sec": r"hotkey\d+1",
    "update_hotkey_1": r"hotkey11",
    "update_hotkey_2": r"hotkey21",
    "update_hotkey_3": r"hotkey31",
    "update_hotkey_4": r"hotkey41",
    "update_hotkey_5": r"hotkey51",
    "update_hotkey_6": r"hotkey61",
    "update_hotkey_7": r"hotkey71",
    "update_hotkey_8": r"hotkey81",
    "update_hotkey_9": r"hotkey91",
    "update_hotkey_0": r"hotkey01",
    "total_use_hotkeys_count_per_sec": r"hotkey\d+2",
    "use_hotkey_1": r"hotkey12",
    "use_hotkey_2": r"hotkey22",
    "use_hotkey_3": r"hotkey32",
    "use_hotkey_4": r"hotkey42",
    "use_hotkey_5": r"hotkey52",
    "use_hotkey_6": r"hotkey62",
    "use_hotkey_7": r"hotkey72",
    "use_hotkey_8": r"hotkey82",
    "use_hotkey_9": r"hotkey92",
    "use_hotkey_0": r"hotkey02",
    "s_per_sec": r"\bs\b",
    "base_per_sec": r"\bBase\b",
    "mineral_per_sec": r"\SingleMineral\b",
}


# Generalized function to count hotkeys based on a given pattern
def count_hotkeys(row, pattern):
    hotkeys = [key for key in row if isinstance(key, str) and re.match(pattern, key)]
    game_length = row["max_t_value"]
    return len(hotkeys) / game_length


# Apply the function to each pattern and assign new columns
for column, pattern in patterns.items():
    filtered_features_train[column] = filtered_features_train.apply(
        lambda row: count_hotkeys(row, pattern), axis=1
    )

filtered_features_train


Unnamed: 0,battleneturl,0,1,2,3,4,5,6,7,8,...,use_hotkey_4,use_hotkey_5,use_hotkey_6,use_hotkey_7,use_hotkey_8,use_hotkey_9,use_hotkey_0,s_per_sec,base_per_sec,mineral_per_sec
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Base,s,s,s,s,s,t5,Base,s,...,0.154079,0.068882,0.010876,0.000000,0.000000,0.000000,0.024773,0.407251,0.039879,0.003021
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,s,Base,s,s,Base,s,s,Base,s,...,0.186103,0.000000,0.000000,0.000000,0.009668,0.010876,0.050151,0.325076,0.007251,0.000000
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Base,s,s,s,Base,s,hotkey30,hotkey00,t5,...,0.128713,0.087129,0.010891,0.000000,0.000000,0.001980,0.017822,0.425743,0.046535,0.002970
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Base,s,s,Base,s,s,s,t5,Base,...,0.282587,0.000000,0.000000,0.000000,0.000000,0.013930,0.038806,0.395025,0.020896,0.000000
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Base,s,s,s,Base,s,hotkey30,hotkey00,t5,...,0.155556,0.025926,0.000000,0.000000,0.000000,0.000000,0.000000,0.385185,0.003704,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,http://xx.battle.net/sc2/en/profile/405/1/MMA/,s,s,s,s,s,hotkey10,hotkey20,hotkey30,hotkey40,...,0.488636,0.248864,0.027273,0.000000,0.000000,0.000000,0.020455,0.650000,0.000000,0.004545
3048,http://xx.battle.net/sc2/en/profile/410/1/STBo...,s,s,hotkey10,s,hotkey20,s,s,hotkey12,hotkey22,...,0.234894,0.100426,0.125957,0.057021,0.011064,0.000000,0.000000,0.659574,0.000000,0.024681
3049,http://xx.battle.net/sc2/en/profile/405/1/MMA/,s,s,s,hotkey10,hotkey20,hotkey30,hotkey40,hotkey50,hotkey60,...,0.528205,0.249573,0.010256,0.000000,0.000000,0.000000,0.059829,0.769231,0.000000,0.004274
3050,http://xx.battle.net/sc2/en/profile/410/1/STBo...,s,s,hotkey10,s,hotkey20,s,s,hotkey12,hotkey22,...,0.181034,0.077586,0.063793,0.027586,0.000000,0.000000,0.000000,0.779310,0.000000,0.072414


In [45]:
def count_actions_before_t(row, value):
    # Convert the row to a list
    row_list = row.tolist()
    try:
        # Find the index of the first occurrence of the specified value (e.g., "t5")
        t_index = row_list.index(f"{value}")
        return t_index
    except ValueError:
        # If the value is not found, return the total number of elements
        return len(row_list)


# Apply the function to the DataFrame (excluding the 'battleneturl' column)
filtered_features_train["actions_before_t5"] = filtered_features_train.iloc[
    :, 1:
].apply(lambda x: count_actions_before_t(x, "t5"), axis=1)

filtered_features_train["actions_before_t10"] = filtered_features_train.iloc[
    :, 1:
].apply(lambda x: count_actions_before_t(x, "t10"), axis=1)

In [46]:
def max_actions_between_t(row):
    # Convert the row to a list
    row_list = row.tolist()
    # Find all indices of 't' markers (e.g., t5, t10, t15)
    t_indices = [
        i
        for i, val in enumerate(row_list)
        if isinstance(val, str) and val.startswith("t") and val[1:].isdigit()
    ]
    # Initialize the maximum difference
    max_actions = 0
    # Calculate the number of actions between consecutive 't' markers
    for i in range(1, len(t_indices)):
        actions_between = t_indices[i] - t_indices[i - 1] - 1
        max_actions = max(max_actions, actions_between)
    return max_actions


def min_actions_between_t(row):
    # Convert the row to a list
    row_list = row.tolist()
    # Find all indices of 't' markers (e.g., t5, t10, t15)
    t_indices = [
        i
        for i, val in enumerate(row_list)
        if isinstance(val, str) and val.startswith("t") and val[1:].isdigit()
    ]
    # Initialize the minimum difference as a large number
    min_actions = float("inf")
    # Calculate the number of actions between consecutive 't' markers
    for i in range(1, len(t_indices)):
        actions_between = t_indices[i] - t_indices[i - 1] - 1
        min_actions = min(min_actions, actions_between)

    # If no 't' markers are found or only one 't' marker exists, return 0
    if min_actions == float("inf"):
        return 0
    return min_actions


# Apply the function to each row of the DataFrame (excluding the 'battleneturl' column)
filtered_features_train["max_actions_between_t"] = filtered_features_train.iloc[
    :, 1:
].apply(max_actions_between_t, axis=1)

filtered_features_train["min_actions_between_t"] = filtered_features_train.iloc[
    :, 1:
].apply(min_actions_between_t, axis=1)

filtered_features_train

Unnamed: 0,battleneturl,0,1,2,3,4,5,6,7,8,...,use_hotkey_8,use_hotkey_9,use_hotkey_0,s_per_sec,base_per_sec,mineral_per_sec,actions_before_t5,actions_before_t10,max_actions_between_t,min_actions_between_t
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,Base,s,s,s,s,s,t5,Base,s,...,0.000000,0.000000,0.024773,0.407251,0.039879,0.003021,6,11,21,0
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,s,Base,s,s,Base,s,s,Base,s,...,0.009668,0.010876,0.050151,0.325076,0.007251,0.000000,9,21,16,0
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,Base,s,s,s,Base,s,hotkey30,hotkey00,t5,...,0.000000,0.001980,0.017822,0.425743,0.046535,0.002970,8,9,20,0
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,Base,s,s,Base,s,s,s,t5,Base,...,0.000000,0.013930,0.038806,0.395025,0.020896,0.000000,7,15,16,1
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,Base,s,s,s,Base,s,hotkey30,hotkey00,t5,...,0.000000,0.000000,0.000000,0.385185,0.003704,0.000000,8,9,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,http://xx.battle.net/sc2/en/profile/405/1/MMA/,s,s,s,s,s,hotkey10,hotkey20,hotkey30,hotkey40,...,0.000000,0.000000,0.020455,0.650000,0.000000,0.004545,14,25,28,2
3048,http://xx.battle.net/sc2/en/profile/410/1/STBo...,s,s,hotkey10,s,hotkey20,s,s,hotkey12,hotkey22,...,0.011064,0.000000,0.000000,0.659574,0.000000,0.024681,10,50,43,0
3049,http://xx.battle.net/sc2/en/profile/405/1/MMA/,s,s,s,hotkey10,hotkey20,hotkey30,hotkey40,hotkey50,hotkey60,...,0.000000,0.000000,0.059829,0.769231,0.000000,0.004274,12,25,28,1
3050,http://xx.battle.net/sc2/en/profile/410/1/STBo...,s,s,hotkey10,s,hotkey20,s,s,hotkey12,hotkey22,...,0.000000,0.000000,0.000000,0.779310,0.000000,0.072414,22,52,43,4


In [47]:
patterns = list(patterns)

simple_data = filtered_features_train[
    [
        "battleneturl",
        "max_t_value",
        "action_per_sec",
        "played_protoss",
        "played_terran",
        "played_zerg",
        "actions_before_t5",
        "actions_before_t10",
        "max_actions_between_t",
        "min_actions_between_t",
    ]
    + patterns
]

simple_data

Unnamed: 0,battleneturl,max_t_value,action_per_sec,played_protoss,played_terran,played_zerg,actions_before_t5,actions_before_t10,max_actions_between_t,min_actions_between_t,...,use_hotkey_4,use_hotkey_5,use_hotkey_6,use_hotkey_7,use_hotkey_8,use_hotkey_9,use_hotkey_0,s_per_sec,base_per_sec,mineral_per_sec
0,http://eu.battle.net/sc2/en/profile/4234852/1/...,1655.0,1.548640,1,0,0,6,11,21,0,...,0.154079,0.068882,0.010876,0.000000,0.000000,0.000000,0.024773,0.407251,0.039879,0.003021
1,http://eu.battle.net/sc2/en/profile/3074362/1/...,1655.0,1.454985,1,0,0,9,21,16,0,...,0.186103,0.000000,0.000000,0.000000,0.009668,0.010876,0.050151,0.325076,0.007251,0.000000
2,http://eu.battle.net/sc2/en/profile/4234852/1/...,1010.0,1.534653,1,0,0,8,9,20,0,...,0.128713,0.087129,0.010891,0.000000,0.000000,0.001980,0.017822,0.425743,0.046535,0.002970
3,http://eu.battle.net/sc2/en/profile/3074362/1/...,1005.0,1.581095,1,0,0,7,15,16,1,...,0.282587,0.000000,0.000000,0.000000,0.000000,0.013930,0.038806,0.395025,0.020896,0.000000
4,http://eu.battle.net/sc2/en/profile/4234852/1/...,540.0,1.346296,1,0,0,8,9,19,0,...,0.155556,0.025926,0.000000,0.000000,0.000000,0.000000,0.000000,0.385185,0.003704,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3047,http://xx.battle.net/sc2/en/profile/405/1/MMA/,880.0,2.671591,0,1,0,14,25,28,2,...,0.488636,0.248864,0.027273,0.000000,0.000000,0.000000,0.020455,0.650000,0.000000,0.004545
3048,http://xx.battle.net/sc2/en/profile/410/1/STBo...,1175.0,3.137021,0,1,0,10,50,43,0,...,0.234894,0.100426,0.125957,0.057021,0.011064,0.000000,0.000000,0.659574,0.000000,0.024681
3049,http://xx.battle.net/sc2/en/profile/405/1/MMA/,1170.0,2.811966,0,1,0,12,25,28,1,...,0.528205,0.249573,0.010256,0.000000,0.000000,0.000000,0.059829,0.769231,0.000000,0.004274
3050,http://xx.battle.net/sc2/en/profile/410/1/STBo...,580.0,4.239655,0,1,0,22,52,43,4,...,0.181034,0.077586,0.063793,0.027586,0.000000,0.000000,0.000000,0.779310,0.000000,0.072414


In [49]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from statistics import mean

warnings.filterwarnings(
    "ignore", category=FutureWarning, message=".*max_features='auto'"
)

# Define parameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
}

X = simple_data.drop(["battleneturl"], axis=1)
y = simple_data["battleneturl"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_model, param_grid=param_grid, scoring="accuracy", cv=5, n_jobs=-1
)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_

# avg = []
# for i in range(1, 51, 5):
#     best_rf_model = RandomForestClassifier(
#         # max_features="auto",
#         # min_samples_leaf=1,
#         # min_samples_split=2,
#         # n_estimators=200,
#         random_state=i,
#     )
#     best_rf_model.fit(X_train, y_train)

#     y_pred = best_rf_model.predict(X_test)

#     avg.append(accuracy_score(y_test, y_pred))

# print("Average accuracy score: ", mean(avg))

# best_rf_model = RandomForestClassifier(
#     max_features="auto",
#     min_samples_leaf=1,
#     min_samples_split=2,
#     n_estimators=200,
#     random_state=42,
# )
# best_rf_model.fit(X_train, y_train)

y_pred = best_rf_model.predict(X_test)

print(classification_report(y_test, y_pred, digits=4))



Best Parameters: {'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
                                                             precision    recall  f1-score   support

      http://eu.battle.net/sc2/en/profile/1021189/1/Dayshi/     1.0000    1.0000    1.0000         4
 http://eu.battle.net/sc2/en/profile/1058669/1/EmpireHappy/     0.3333    1.0000    0.5000         2
       http://eu.battle.net/sc2/en/profile/1058669/1/Happy/     1.0000    0.2000    0.3333         5
  http://eu.battle.net/sc2/en/profile/1139573/1/BabyKnight/     1.0000    1.0000    1.0000         5
    http://eu.battle.net/sc2/en/profile/1143713/1/uThermal/     1.0000    1.0000    1.0000         3
http://eu.battle.net/sc2/en/profile/1173786/1/VPbenQsLivko/     1.0000    1.0000    1.0000         1
    http://eu.battle.net/sc2/en/profile/1430346/1/ShoWTimE/     1.0000    1.0000    1.0000         2
     http://eu.battle.net/sc2/en/profile/2048063/1/AcerBly/     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
feature_importances = pd.DataFrame(
    best_rf_model.feature_importances_, index=X.columns, columns=["importance"]
).sort_values("importance", ascending=False)
print("Feature Importances:\n", feature_importances)

Feature Importances:
                                       importance
use_hotkey_5                            0.044721
use_hotkey_4                            0.041876
use_hotkey_3                            0.039196
use_hotkey_6                            0.038205
total_use_hotkeys_count_per_sec         0.036408
use_hotkey_2                            0.035726
use_hotkey_1                            0.035047
action_per_sec                          0.034517
creation_hotkey_3                       0.034387
creation_hotkey_1                       0.032392
total_creation_hotkeys_count_per_sec    0.031344
creation_hotkey_2                       0.031137
creation_hotkey_4                       0.030307
s_per_sec                               0.030203
base_per_sec                            0.030091
actions_before_t10                      0.029150
max_actions_between_t                   0.029137
use_hotkey_0                            0.026642
creation_hotkey_5                       0.02639

## TEST


In [51]:
# Open test file
features_test = read_ds_gzip(
    Path(
        os.path.abspath(
            "/Users/milofournier/Documents/Work/INSA/OT2/data mining/in-star-craft-2-player"
        )
    )
    / "TEST.CSV.GZ",
    ds="TEST",
)

# Get number of actions
features_test["first_nan_index"] = features_test.apply(first_nan_occurrence, axis=1)

# Get game length
features_test["max_t_value"] = features_test.apply(max_t_value, axis=1)
features_test["first_nan_index"] = pd.to_numeric(
    features_test["first_nan_index"], errors="coerce"
)

# Get number of actions per sec
features_test["action_per_sec"] = (
    features_test["first_nan_index"] / features_test["max_t_value"]
)

# Get race played
features_test["played_protoss"] = (features_test["played_race"] == "Protoss").astype(
    int
)
features_test["played_terran"] = (features_test["played_race"] == "Terran").astype(int)
features_test["played_zerg"] = (features_test["played_race"] == "Zerg").astype(int)
features_test = features_test.drop(columns=["played_race"])

# Get hotkeys features
for pattern in patterns:
    features_test[pattern] = features_test.apply(
        lambda row: count_hotkeys(row, pattern), axis=1
    )

features_test["actions_before_t5"] = features_test.iloc[:, 1:].apply(
    lambda x: count_actions_before_t(x, "t5"), axis=1
)

features_test["actions_before_t10"] = features_test.iloc[:, 1:].apply(
    lambda x: count_actions_before_t(x, "t10"), axis=1
)

# Apply the function to each row of the DataFrame (excluding the 'battleneturl' column)
features_test["max_actions_between_t"] = features_test.iloc[:, 1:].apply(
    max_actions_between_t, axis=1
)

features_test["min_actions_between_t"] = features_test.iloc[:, 1:].apply(
    min_actions_between_t, axis=1
)

# Model predictions
fjeizo = features_test[
    [
        "max_t_value",
        "action_per_sec",
        "played_protoss",
        "played_terran",
        "played_zerg",
        "actions_before_t5",
        "actions_before_t10",
        "max_actions_between_t",
        "min_actions_between_t",
    ]
    + patterns
]
fjeizo = fjeizo.fillna(0)
y_pred = best_rf_model.predict(fjeizo)

y_pred

array(['http://eu.battle.net/sc2/en/profile/2222468/1/dTefel/',
       'http://eu.battle.net/sc2/en/profile/1021189/1/Dayshi/',
       'http://eu.battle.net/sc2/en/profile/3368730/1/ToD/',
       'http://eu.battle.net/sc2/en/profile/2222468/1/dTefel/',
       'http://eu.battle.net/sc2/en/profile/3074362/1/Stardust/',
       'http://eu.battle.net/sc2/en/profile/1021189/1/Dayshi/',
       'http://eu.battle.net/sc2/en/profile/3074362/1/Stardust/',
       'http://eu.battle.net/sc2/en/profile/2222468/1/dTefel/',
       'http://eu.battle.net/sc2/en/profile/3368730/1/ToD/',
       'http://eu.battle.net/sc2/en/profile/3368730/1/ToD/',
       'http://eu.battle.net/sc2/en/profile/1021189/1/Dayshi/',
       'http://eu.battle.net/sc2/en/profile/2222468/1/dTefel/',
       'http://eu.battle.net/sc2/en/profile/2222468/1/dTefel/',
       'http://eu.battle.net/sc2/en/profile/3368730/1/ToD/',
       'http://eu.battle.net/sc2/en/profile/3074362/1/Stardust/',
       'http://eu.battle.net/sc2/en/profile/22

In [52]:
submit_file = pd.read_csv(
    "/Users/milofournier/Documents/Work/INSA/OT2/data mining/in-star-craft-2-player/SAMPLE_SUBMISSION.CSV"
)

submit_file

Unnamed: 0,RowId,prediction
0,1,http://us.battle.net/sc2/en/profile/3948354/1/...
1,2,http://us.battle.net/sc2/en/profile/3948354/1/...
2,3,http://us.battle.net/sc2/en/profile/3948354/1/...
3,4,http://us.battle.net/sc2/en/profile/3948354/1/...
4,5,http://us.battle.net/sc2/en/profile/3948354/1/...
...,...,...
335,336,http://us.battle.net/sc2/en/profile/3948354/1/...
336,337,http://us.battle.net/sc2/en/profile/3948354/1/...
337,338,http://us.battle.net/sc2/en/profile/3948354/1/...
338,339,http://us.battle.net/sc2/en/profile/3948354/1/...


In [53]:
submit_file["prediction"] = y_pred
submit_file

Unnamed: 0,RowId,prediction
0,1,http://eu.battle.net/sc2/en/profile/2222468/1/...
1,2,http://eu.battle.net/sc2/en/profile/1021189/1/...
2,3,http://eu.battle.net/sc2/en/profile/3368730/1/...
3,4,http://eu.battle.net/sc2/en/profile/2222468/1/...
4,5,http://eu.battle.net/sc2/en/profile/3074362/1/...
...,...,...
335,336,http://eu.battle.net/sc2/en/profile/1021189/1/...
336,337,http://eu.battle.net/sc2/en/profile/3368730/1/...
337,338,http://eu.battle.net/sc2/en/profile/1021189/1/...
338,339,http://eu.battle.net/sc2/en/profile/1021189/1/...


In [54]:
submit_file.to_csv("updated_submit_file.csv", index=False)

In [55]:
submit_file["prediction"].value_counts()

http://eu.battle.net/sc2/en/profile/2222468/1/dTefel/      126
http://eu.battle.net/sc2/en/profile/3368730/1/ToD/         104
http://eu.battle.net/sc2/en/profile/1021189/1/Dayshi/       82
http://eu.battle.net/sc2/en/profile/3074362/1/Stardust/     28
Name: prediction, dtype: int64

In [56]:
submit_file_0_80927 = pd.read_csv(
    "/Users/milofournier/Documents/Work/INSA/OT2/data mining/updated_submit_file_ 0_80927.csv"
)

submit_file_0_80927["prediction"].value_counts()

http://kr.battle.net/sc2/en/profile/2342419/1/soO/           6
http://kr.battle.net/sc2/en/profile/2348639/1/ParalyzE/      6
http://kr.battle.net/sc2/en/profile/2343012/1/Maru/          6
http://kr.battle.net/sc2/en/profile/2341467/1/Classic/       6
http://kr.battle.net/sc2/en/profile/2332264/1/Zest/          5
                                                            ..
http://eu.battle.net/sc2/en/profile/3971497/1/Genius/        1
http://us.battle.net/sc2/en/profile/5142993/1/Bomber/        1
http://kr.battle.net/sc2/en/profile/2342294/1/Impact/        1
http://us.battle.net/sc2/en/profile/4580578/1/ZooCourage/    1
http://xx.battle.net/sc2/en/profile/410/1/STBomber/          1
Name: prediction, Length: 175, dtype: int64