## 1. Imports

In [1]:
import pandas as pd
import numpy as np
import numba as nb
import polars as pl

import pickle
import gc

import catboost
from sklearn.model_selection import KFold, cross_val_score

import warnings
warnings.filterwarnings('ignore')

## 2. Data

In [2]:
## Read the three competition tables (train, test, concepts) in one pass.
df_train, df_test, df_concepts = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/root/kag_comp/UM_MCTS/Data/train.csv",
        "/root/kag_comp/UM_MCTS/Data/test.csv",
        "/root/kag_comp/UM_MCTS/Data/concepts.csv",
    )
)

In [3]:
def _split_agent(agent_names, suffix) :
    """Expand '-'-separated agent strings into one column per component.

    The first component is discarded; the next four become Selection, EXPLORATION_CONST,
    PLAYOUT and SCORE_BOUNDS (each followed by `suffix`). SCORE_BOUNDS is encoded
    as 0 when the text is exactly "false" and 1 for anything else.
    """
    bounds_col = f"SCORE_BOUNDS{suffix}"
    column_names = {1 : f"Selection{suffix}", 2 : f"EXPLORATION_CONST{suffix}", 3 : f"PLAYOUT{suffix}", 4 : bounds_col}
    parts = agent_names.str.split('-', expand = True).drop(0, axis = 1).rename(column_names, axis = 1)
    return parts.assign(**{bounds_col : (parts[bounds_col] != "false").astype(int)})

agent1 = _split_agent(df_train.agent1, 1)
agent2 = _split_agent(df_train.agent2, 2)

## Swap the raw agent strings and the id / win-count columns for the parsed agent columns.
dropped_columns = ["Id", "agent1", "agent2", "num_wins_agent1", "num_draws_agent1", "num_losses_agent1"]
df_preprocessed = pd.concat([df_train.drop(dropped_columns, axis = 1), agent1, agent2], axis = 1)

In [4]:
## Regression target, kept as its own Series.
target = "utility_agent1"
y = df_train.loc[:, target]

In [5]:
## Release the raw training frame; the earlier cells already derived df_preprocessed and y from it.
del df_train
gc.collect()

39

## 3. Classifier : 분류기는 확률값(연속형 타깃)을 받지 못하는 듯함

In [None]:
## Columns handed to CatBoost as categorical features.
categorical_features = [
    "GameRulesetName",
    "Selection1", "Selection2",
    "EXPLORATION_CONST1", "EXPLORATION_CONST2",
    "PLAYOUT1", "PLAYOUT2",
    "EnglishRules", "LudRules",
]

In [None]:
## data
## Drop constant columns, then the target itself: df_preprocessed still carries
## "utility_agent1", and leaving it in X would leak the label into the features.
X = df_preprocessed.loc[:, lambda _df : _df.nunique() != 1].drop("utility_agent1", axis = 1)

## model
## NOTE(review): y is the raw utility_agent1 column; the section heading suggests the
## classifier does not accept it as a label — confirm before relying on this cell.
predictr = catboost.CatBoostClassifier(cat_features = categorical_features, verbose = 0, task_type = "GPU", devices = "0")
predictr.fit(X, y)

In [None]:
pool = catboost.Pool(X, y, cat_features = categorical_features)
## KFold instead of StratifiedKFold: only KFold is imported in this notebook, so the
## original name raised NameError. The variable name SKF is kept for the cells that read it.
SKF = KFold(n_splits = 5, shuffle = True, random_state = 14107)

In [None]:
## 5-fold CatBoost CV with an RMSE loss on GPU device 0; the Pool is rebuilt from X and y for this call.
validatr = catboost.cv(
    catboost.Pool(X, y, cat_features = categorical_features),
    params = {'loss_function': 'RMSE', "task_type" : "GPU", "devices" : "0"},
    fold_count = 5,
    verbose = 0,
)

> 애초에 Regression으로 푸는 게, 목적에 부합할 듯

## 4. Regression

`-` GPU를 쓸 수 있는 단계는 모두 GPU로 처리하고, 그 외 단계는 그대로 둔다.

In [None]:
## data
## Constant columns and the target are removed from the design matrix.
X = df_preprocessed.loc[:, lambda _df : _df.nunique() != 1].drop("utility_agent1", axis = 1)
categorical_features = ["GameRulesetName", "Selection1", "Selection2", "EXPLORATION_CONST1", "EXPLORATION_CONST2", "PLAYOUT1", "PLAYOUT2", "EnglishRules", "LudRules"]

## cv
pool = catboost.Pool(X, y, cat_features = categorical_features)
## KFold instead of StratifiedKFold: StratifiedKFold is never imported here (NameError),
## and stratifying on a continuous regression target is not applicable anyway.
SKF = KFold(n_splits = 5, shuffle = True, random_state = 14107)
## Reuse the Pool built above rather than constructing an identical second one.
validatr = catboost.cv(pool, params = {'loss_function': 'RMSE', "task_type" : "GPU", "devices" : "0"}, folds = SKF, verbose = 0)

In [None]:
## Names of the concepts whose first TaxonomyString component is "1".
taxonomy_nodes = df_concepts.TaxonomyString.str.split(".", expand = True).rename(columns = {i : f"Node{i}" for i in range(9)})
fst_cat = list(df_concepts.loc[taxonomy_nodes["Node0"] == "1", "Name"])

전진선택법은 계산량이 너무 커서 제한된 시간 안에 수행할 수 없다.

→ 대신 concept의 계층적 구조를 이용하여 변수를 선택하는 편이 현실적이라고 판단된다.

In [None]:
## One column per taxonomy level (Node0 .. Node8), then show the rows whose first level is "1".
node_form = (
    df_concepts.TaxonomyString
    .str.split(".", expand = True)
    .rename(columns = {i : f"Node{i}" for i in range(9)})
)
node_form[node_form["Node0"] == "1"]

In [None]:
## data
## Agent / ruleset columns that every candidate subset contains.
base_features = ["GameRulesetName", "Selection1", "Selection2", "EXPLORATION_CONST1", "EXPLORATION_CONST2", "PLAYOUT1", "PLAYOUT2", "SCORE_BOUNDS1", "SCORE_BOUNDS2"]
X = df_preprocessed[base_features + fst_cat].loc[:, lambda _df : _df.nunique() != 1]
categorical_features = ["GameRulesetName", "Selection1", "Selection2", "EXPLORATION_CONST1", "EXPLORATION_CONST2", "PLAYOUT1", "PLAYOUT2", "EnglishRules", "LudRules"]

## validator
## KFold instead of StratifiedKFold: StratifiedKFold is never imported in this notebook (NameError).
SKF = KFold(n_splits = 4, shuffle = True, random_state = 14107)
## Candidates in column order; a set difference would give an arbitrary, non-reproducible order.
additional_list = [c for c in X.columns if c not in base_features]

for f in additional_list :
    ## f is a single column name, so it must be wrapped in a list (list + str raises TypeError).
    X_subset = X[base_features + [f]]

## NOTE(review): the model is built once after the loop, so only the last candidate subset
## reaches the scoring cell below; an empty additional_list leaves X_subset undefined.
model = catboost.CatBoostRegressor(cat_features = list(set(categorical_features) & set(X_subset.columns)), loss_function = "RMSE", task_type = "GPU", devices = "0")

In [None]:
## MAE of the candidate subset. The splitter is built inline because `kf` is never
## defined in the visible notebook; it uses the same 4-fold settings as the validator cell.
cross_val_score(model, X_subset, y, cv = KFold(n_splits = 4, shuffle = True, random_state = 14107), scoring = 'neg_mean_absolute_error')

아이디어: `feature_importances_`를 구해 중요도가 낮은 순서대로 정렬한 뒤, 가장 낮은 변수부터 하나씩 빼 본다. 변수를 뺐을 때 모형 성능이 개선되면 그 변수를 완전히 제거하고, 중요도를 다시 계산하여 같은 과정을 재귀적으로 반복한다.

In [27]:
## full model
## Design matrix: every non-constant column except the target.
X = df_preprocessed.loc[:, lambda _df : _df.nunique() != 1].drop(columns = "utility_agent1")
categorical_features = [
    "GameRulesetName",
    "Selection1", "Selection2",
    "EXPLORATION_CONST1", "EXPLORATION_CONST2",
    "PLAYOUT1", "PLAYOUT2",
    "EnglishRules", "LudRules",
]

## Fit on all rows; the feature importances of this model drive the selection cells below.
full_model = catboost.CatBoostRegressor(
    cat_features = categorical_features,
    learning_rate = 0.005,
    iterations = 6000,
    early_stopping_rounds = 50,
    loss_function = "RMSE",
    verbose = 0,
    task_type = "GPU",
    devices = "0",
)
full_model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x7ef6841e1120>

In [28]:
## Features with strictly positive importance in the full model, least important first.
useful_features = [
    name
    for importance, name in sorted(zip(full_model.feature_importances_, full_model.feature_names_))
    if importance > 0.0
]

In [29]:
## Number of features with strictly positive importance in the full model.
len(useful_features)

407

`-` interaction terms

In [None]:
## Pairwise interaction strengths from the full model, as a frame with integer feature indices A and B.
interactions = full_model.get_feature_importance(fstr_type = "Interaction")
df_interactions = (
    pd.DataFrame(interactions)
    .rename(columns = {0 : "A", 1 : "B", 2 : "Interaction"})
    .astype({"A" : int, "B" : int})
)

In [None]:
## Tail of the interaction table with the feature indices replaced by the column names of X.
(
    df_interactions
    .assign(
        A = lambda _df : _df.A.map(lambda idx : X.columns[idx]),
        B = lambda _df : _df.B.map(lambda idx : X.columns[idx]),
    )
    .tail()
)

> 이걸 안다고 해서 뭐 달라지는 건 딱히 없지 않나...?

`-` 일부만 사용한 것과 비교

In [30]:
## Stricter cut: features whose importance in the full model exceeds 0.001, least important first.
cutting_features = [
    name
    for importance, name in sorted(zip(full_model.feature_importances_, full_model.feature_names_))
    if importance > 0.001
]

In [31]:
## Number of features whose importance exceeds the 0.001 cut-off.
len(cutting_features)

344

In [32]:
## first scoring
## Baseline: 10-fold CV score (negative RMSE) using every feature with positive importance.
X = df_preprocessed.loc[:, useful_features]

SKF = KFold(n_splits = 10, shuffle = True, random_state = 14107)
## Declare only the categorical columns that are present in X, as features_selection does;
## the unfiltered list is expected to fail once one of those columns is pruned.
model = catboost.CatBoostRegressor(cat_features = [c for c in categorical_features if c in X.columns], loss_function = "RMSE", verbose = 0, task_type = "GPU", devices = "0")
full_score = np.mean(cross_val_score(model, X, y, cv = SKF, scoring = "neg_root_mean_squared_error"))

In [33]:
## Mean 10-fold CV score (negative RMSE) for the features in useful_features.
full_score

-0.28124717493153223

In [38]:
## second scoring
## Same 10-fold CV as the first scoring, restricted to features above the 0.001 importance cut-off.
X = df_preprocessed.loc[:, cutting_features]

SKF = KFold(n_splits = 10, shuffle = True, random_state = 14107)
## Declare only the categorical columns that are present in X, as features_selection does;
## the unfiltered list is expected to fail once one of those columns is pruned.
model = catboost.CatBoostRegressor(cat_features = [c for c in categorical_features if c in X.columns], loss_function = "RMSE", verbose = 0, task_type = "GPU", devices = "0")
second_score = np.mean(cross_val_score(model, X, y, cv = SKF, scoring = "neg_root_mean_squared_error"))

In [39]:
## Mean 10-fold CV score (negative RMSE) for the features in cutting_features.
second_score

-0.2809903094331911

In [6]:
def features_selection(current_score, current_features, dump_features = None, min_improvement = 0.0) :
    """Greedy backward elimination driven by cross-validated RMSE.

    Each round walks `current_features` in order, drops one feature at a time and
    scores the remaining columns with 10-fold CV. The first feature whose removal
    scores at least `current_score + min_improvement` is discarded. A larger model
    is then refit without it, features with zero importance in that model are
    discarded too, and the next round starts from the survivors sorted by
    ascending importance. The search stops when a full pass finds no such feature.

    Implemented as a loop rather than recursion, so the number of removals is not
    bounded by the recursion limit and earlier rounds' frames are not kept alive.

    Reads the notebook globals `X`, `y` and `categorical_features`.

    Parameters
    ----------
    current_score : float
        Baseline score to beat (negative RMSE, so higher is better).
    current_features : list of str
        Candidate columns of `X`, tried in the given order.
    dump_features : list of str, optional
        Features removed by earlier runs; only used to keep the printed removal
        log cumulative. It is never mutated.
    min_improvement : float, default 0.0
        Margin by which the score must exceed the baseline before a removal is
        accepted. The default reproduces the original rule (ties are accepted).

    Returns
    -------
    list of str
        The features still selected when the search stopped.
    """
    dumping = [] if dump_features is None else list(dump_features)
    current_features = list(current_features)

    while True :
        X_subset = X[current_features]

        for cont, f in enumerate(current_features, start = 1) :
            ## data
            X_valid = X_subset.drop(f, axis = 1)
            cat_cols = list(set(categorical_features) & set(X_valid.columns))

            ## scoring
            SKF = KFold(n_splits = 10, shuffle = True, random_state = 14107)
            model = catboost.CatBoostRegressor(cat_features = cat_cols, loss_function = "RMSE", verbose = 0, task_type = "GPU", devices = "0")
            score1 = np.mean(cross_val_score(model, X_valid, y, cv = SKF, scoring = "neg_root_mean_squared_error"))

            if score1 < current_score + min_improvement :
                continue

            print(f"변수 {f}를 제거하여 성능이 향상됨 : score = {score1}, count = {cont}")
            ## Score once more and average, to dampen run-to-run noise in the estimate.
            score2 = np.mean(cross_val_score(model, X_valid, y, cv = SKF, scoring = "neg_root_mean_squared_error"))

            ## Refit a new full model: re-select and re-rank the features by importance.
            model = catboost.CatBoostRegressor(cat_features = cat_cols, learning_rate = 0.01, iterations = 3000, loss_function = "RMSE", verbose = 0, task_type = "GPU", devices = "0")
            model.fit(X_valid, y)

            print(f"이후 비교할 score : {(score1 + score2)/2}")

            survivors = [k for v, k in sorted(zip(model.feature_importances_, model.feature_names_)) if v > 0.0]
            dumping = list(set(dumping + [f] + list(set(X_valid.columns) - set(survivors))))

            print(f"현재 제거된 변수 : {dumping}")

            ## Next round: the averaged score is the new baseline, the survivors the new candidates.
            current_score = (score1 + score2) / 2
            current_features = survivors
            break

        else :
            ## No removal helped. This branch also covers an empty feature list,
            ## which previously fell through to a NameError.
            print("변수를 모두 제거해보았으나, 성능 향상이 없었음")
            print(f"최종 선택 변수 : {current_features}")

            return current_features

In [177]:
## Restore the full design matrix (constant columns and the target removed), rebuild the
## importance-ranked candidate list, then start backward elimination from the baseline score.
X = df_preprocessed.loc[:, lambda _df : _df.nunique() != 1].drop(columns = "utility_agent1")
useful_features = [
    name
    for importance, name in sorted(zip(full_model.feature_importances_, full_model.feature_names_))
    if importance > 0.0
]
features_selection(current_score = full_score, current_features = useful_features)

```
변수 Style를 제거하여 성능이 향상됨 : count = 17
현재 제거된 변수 : ['Style']
변수 TwoSitesMoves를 제거하여 성능이 향상됨 : count = 3
현재 제거된 변수 : ['Style', 'TwoSitesMoves']
변수 NoOwnPiecesLossFrequency를 제거하여 성능이 향상됨 : count = 1
현재 제거된 변수 : ['Style', 'TwoSitesMoves', 'NoOwnPiecesLossFrequency']
변수 AnimalComponent를 제거하여 성능이 향상됨 : count = 14
현재 제거된 변수 : ['Style', 'TwoSitesMoves', 'NoOwnPiecesLossFrequency', 'AnimalComponent']
변수 FromToDecisionEnemyFrequency를 제거하여 성능이 향상됨 : count = 13
현재 제거된 변수 : ['Style', 'TwoSitesMoves', 'NoOwnPiecesLossFrequency', 'AnimalComponent', 'FromToDecisionEnemyFrequency']
변수 SquarePyramidalShape를 제거하여 성능이 향상됨 : count = 14
현재 제거된 변수 : ['Style', 'TwoSitesMoves', 'NoOwnPiecesLossFrequency', 'AnimalComponent', 'FromToDecisionEnemyFrequency', 'SquarePyramidalShape']
변수 ConcentricTiling를 제거하여 성능이 향상됨 : count = 6
현재 제거된 변수 : ['Style', 'TwoSitesMoves', 'NoOwnPiecesLossFrequency', 'AnimalComponent', 'FromToDecisionEnemyFrequency', 'SquarePyramidalShape', 'ConcentricTiling']
변수 SingleSiteMoves를 제거하여 성능이 향상됨 : count = 6
현재 제거된 변수 : ['Style', 'TwoSitesMoves', 'NoOwnPiecesLossFrequency', 'AnimalComponent', 'FromToDecisionEnemyFrequency', 'SquarePyramidalShape', 'ConcentricTiling', 'SingleSiteMoves']
```

> CV 점수에 분산이 존재하다 보니, scoring 기준을 통과하기가 매우 어려워지는 것 같다. 따라서 모형을 주기적으로 갱신할 필요가 있어 보이며, 매 회 갱신하는 것도 나쁘지 않다.

In [44]:
## Features removed by the interrupted selection run above.
removed_features = {'Style', 'TwoSitesMoves', 'NoOwnPiecesLossFrequency', 'AnimalComponent', 'FromToDecisionEnemyFrequency', 'SquarePyramidalShape', 'ConcentricTiling', 'SingleSiteMoves'}
## Filter instead of taking a set difference: this keeps the ascending-importance order of
## useful_features (the order features_selection tries removals in) and is reproducible across kernels.
current_list = [name for name in useful_features if name not in removed_features]

In [45]:
## Persist the surviving feature list so the search can resume in a fresh kernel.
with open("validation_features.pkl", "wb") as pkl_file :
    pkl_file.write(pickle.dumps(current_list))

In [46]:
## scoring of the reduced feature list
X = df_preprocessed.loc[:, current_list]

SKF = KFold(n_splits = 10, shuffle = True, random_state = 14107)
## Declare only the categorical columns that are present in X, as features_selection does;
## the unfiltered list is expected to fail once one of those columns is pruned.
model = catboost.CatBoostRegressor(cat_features = [c for c in categorical_features if c in X.columns], loss_function = "RMSE", verbose = 0, task_type = "GPU", devices = "0")
## Average two full CV runs to dampen run-to-run noise in the baseline.
current_score = np.mean([np.mean(cross_val_score(model, X, y, cv = SKF, scoring = "neg_root_mean_squared_error")) for _ in range(2)])

In [47]:
## Mean of two 10-fold CV scores (negative RMSE) for the features in current_list.
current_score

-0.28123876715704277

In [1]:
# dump_list = ['Math', 'Region', 'KnightComponent', 'SurakartaStyle', 'SumDice', 'ThreeMensMorrisBoard', 'Start', 'NoTargetPieceEndFrequency', 'BranchingFactorChangeNumTimesn', 'Conditions', 'NoPieceNext', 'RemoveDecisionFrequency', 'RemoveEffect', 'MancalaCircular', 'IsEnemy', 'TerritoryWinFrequency', 'PassDecision', 'HopDecisionFriendToEnemyFrequency', 'OrthogonalDirection', 'PromotionEffectFrequency', 'Timeouts', 'SlideDecisionFrequency', 'SurroundCaptureFrequency', 'InitialRandomPlacement', 'ScoreDifferenceMedian', 'AdjacentDirection', 'SameDirection', 'Variable', 'LineOfSight', 'Checkmate', 'EncloseCapture', 'NumDice', 'TaflStyle', 'ConnectionEndFrequency', 'AlquerqueBoard', 'FromToDecisionFriendFrequency', 'NotEqual', 'HopCaptureMoreThanOne', 'NoMovesEnd', 'Minimum', 'NumOffDiagonalDirections', 'PieceState', 'NoMovesWinFrequency', 'SowCaptureFrequency', 'Draw', 'MovesDecision', 'LeftwardDirection', 'IsPieceAt', 'Addition', 'InitialScore', 'SetInternalCounter', 'AlquerqueBoardWithTwoTriangles', 'ChessStyle', 'DiagonalDirection', 'CheckmateFrequency', 'Threat', 'Operations', 'TriangleTiling', 'MoveDistanceMaxIncrease', 'MoveDistanceVariance', 'FillEndFrequency', 'SlideDecisionToEnemyFrequency', 'NoOwnPiecesEndFrequency', 'DirectionCapture', 'AlquerqueTiling', 'LesserThanOrEqual', 'SowCCW', 'PieceCount', 'ThreeMensMorrisBoardWithTwoTriangles', 'Tiling', 'MovesNonDecision', 'ShowPieceState', 'RectangleShape', 'Conjunction', 'MancalaStores', 'SowProperties', 'PatternWin', 'Comparison', 'HopEffect', 'Priority', 'ProposeEffectFrequency', 'Subtraction', 'NoMoves', 'Pattern', 'TableStyle', 'FromToDecisionEmptyFrequency', 'SowRemoveFrequency', 'SwapOption', 'PiecesPlacedOutsideBoard', 'DirectionCaptureFrequency', 'AddDecision', 'Hand', 'CheckmateWin', 'Then', 'SetSiteState', 'ForwardLeftDirection', 'AddEffect', 'NoMovesDrawFrequency', 'RemoveDecision', 'StepDecisionToEnemyFrequency', 'SwapPlayersDecision', 'ScoreDifferenceVariance', 'SiteState', 
'SowBacktrackingFrequency', 'Visual', 'TaflComponent', 'Shape', 'AllDirections', 'Odd', 'HopDecisionFriendToEmptyFrequency', 'TrackOwned', 'FromToDecisionWithinBoardFrequency', 'MoveDistanceChangeNumTimes', 'IsEmpty', 'CheckmateWinFrequency', 'TriangleShape', 'BackwardsDirection', 'LineWin', 'PushEffectFrequency', 'ShibumiStyle', 'CaptureEnd', 'ConditionalStatement', 'SlideDecisionToEmpty', 'SpaceEnd', 'LeapDecisionToEnemyFrequency', 'SowOriginFirst', 'EliminatePiecesWin', 'SpaceConditions', 'CircleShape', 'Logic', 'OpeningContract', 'IsFriend', 'RightwardDirection', 'MoveConditions', 'ControlFlowStatement', 'ReplacementCapture', 'ComponentStyle', 'Intersection', 'LeapDecisionFrequency', 'SCORE_BOUNDS2', 'DiscComponent', 'FairyChessComponent', 'CustodialCapture', 'StackState', 'HexShape', 'Maximum', 'PatternWinFrequency', 'NoTargetPiece', 'NoProgressDrawFrequency', 'Algorithmics', 'Directions', 'SowCW', 'ScoringWinFrequency', 'HexTiling', 'HopDecisionMoreThanOne', 'ScoringEndFrequency', 'Moves', 'Stochastic', 'AlquerqueBoardWithFourTriangles', 'TrackLoop', 'MovesEffects', 'SlideDecisionToEnemy', 'DiamondShape', 'PlayerValue', 'NoPiece', 'NoMovesMover', 'RaceEnd', 'TerritoryEndFrequency', 'RelativeDirections', 'Union', 'PenAndPaperStyle', 'Piece', 'SetCountFrequency', 'SetValueFrequency', 'SowRemove', 'FillWinFrequency', 'AddEffectFrequency', 'PieceConditions', 'NumLeftSites', 'Set', 'Cell', 'EncloseCaptureFrequency', 'PatternEndFrequency', 'NumPhasesBoard', 'MoveDistanceMaxDecrease', 'ForEachPiece', 'ForgetValues', 'Distance', 'SetVar', 'Misere', 'Roll', 'MoveAgain', 'StateType', 'StepEffect', 'Multiplication', 'Symbols', 'Arithmetic', 'NoMovesWin', 'CheckersComponent', 'ScoreDifferenceMaximum', 'Negation', 'HopCapture', 'MancalaTwoRows', 'ForwardDirection', 'SlideDecisionToFriendFrequency', 'NoMovesNext', 'FromToDecision', 'HopDecisionFrequency', 'GreaterThan', 'Disjunction', 'PieceNumberMaxIncrease', 'SquareShape', 'Connection', 
'ScoreDifferenceChangeLineBestFit', 'ProgressCheck', 'HopDecision', 'Phase']

> 해당 방법론은 CV를 할 때의 분산을 고려하지 않았다는 점에서 바람직하지 않음. 온전한 후진제거법이 아니므로, 아래 방법으로 개선할 수 있음
>
> * CV error가 단순히 감소한 게 아니라 특정 비율만큼 감소하면 변수를 제거.
>
> * CV error의 분산을 감안하여, 개선 폭이 표준오차의 2배(2·se) 이상일 때만 변수를 제거.

In [14]:
## Resume from the feature list saved by the earlier selection run.
with open("validation_features.pkl", "rb") as pkl_file :
    current_list = pickle.loads(pkl_file.read())

X = df_preprocessed[current_list]
categorical_features = [
    "GameRulesetName",
    "Selection1", "Selection2",
    "EXPLORATION_CONST1", "EXPLORATION_CONST2",
    "PLAYOUT1", "PLAYOUT2",
    "EnglishRules", "LudRules",
]

In [18]:
# new_set = list(set(current_list) - set(dump_list))

# with open("validation_features2.pkl", "wb") as f :
#     pickle.dump(new_set, f)

In [15]:
## Resume backward elimination; the baseline is the score recorded for current_list in the earlier scoring cell.
features_selection(current_score = -0.28123876715704277, current_features = current_list)

변수 NoMovesNext를 제거하여 성능이 향상됨 : score = -0.2809771992545718, count = 1
이후 비교할 score : -0.2809597656821487
현재 제거된 변수 : ['Variable', 'MancalaCircular', 'Conditions', 'AddEffect', 'SurroundCaptureFrequency', 'ChessStyle', 'Draw', 'StackState', 'NoTargetPieceWin', 'StateType', 'FairyChessComponent', 'ForwardDirection', 'SowCCW', 'HopDecisionMoreThanOne', 'Shape', 'TaflComponent', 'Phase', 'Region', 'CustodialCapture', 'FortyStonesWithFourGapsBoard', 'Misere', 'NoMovesEnd', 'Union', 'DiamondShape', 'ProgressCheck', 'SetInternalCounter', 'InitialRandomPlacement', 'Threat', 'MoveAgain', 'InitialScore', 'Checkmate', 'MancalaStores', 'SetVar', 'NumOffDiagonalDirections', 'LineOfSight', 'Disjunction', 'SumDice', 'TaflStyle', 'ProposeEffectFrequency', 'ScoreDifferenceMedian', 'NoTargetPiece', 'FromToDecision', 'PenAndPaperStyle', 'MancalaTwoRows', 'RaceEnd', 'CircleShape', 'AlquerqueBoardWithFourTriangles', 'TableStyle', 'CaptureEnd', 'NoPieceNext', 'SowCW', 'ForwardLeftDirection', 'SlideDecisionT

KeyboardInterrupt: 