Loading Data into Data Frame and Exploring it

In [None]:
import pandas as pd

In [None]:
df=pd.read_csv('game_data.csv')

In [None]:
df.head()

Unnamed: 0,health,x,y,is_jumping,is_crouching,move_id,enemy_health,enemy_x,enemy_y,action
0,176,205,192,0,0,0,176,336,192,far_uppercut
1,176,205,192,0,0,0,176,336,192,far_combo_1
2,176,205,192,0,0,131073,176,336,192,far_uppercut
3,176,205,192,0,0,131073,176,336,192,far_combo_1
4,176,205,192,0,0,131073,176,336,192,far_uppercut


In [None]:
print(df.dtypes)


health           int64
x                int64
y                int64
is_jumping       int64
is_crouching     int64
move_id          int64
enemy_health     int64
enemy_x          int64
enemy_y          int64
action          object
dtype: object


In [None]:
for col in df.columns:
    print(f"--- {col} ---")
    print(df[col].unique())
    print()


--- health ---
[176 146 117  81  34 145 114]

--- x ---
[205 215 207 209 210 212 213 216 218 219 241 223 226 228 208 232 233 235
 257 259 242 245 224 249 251 252 275 276 258 260 262 243 267 268 270 271
 273 274 277 278 280 283 284 287 289 279 266 256 248 247 246 244 239 234
 229 225 221 214 211 206 204 203 202 201 200 199 236 250 253 264 281 288
 291 295 298 302 305 309 312 319 217 357 361 364 368 371 230 385 389 392
 396 399 403 406 410 194 190 186 182 179 176 173 171 169 168 167 166 165
 164 238 189 263 170 172 175 181 191 195 198 220 187 231 237 240 254 269
 261 272 313 316 323 327 330 337 341 355 358 362 365 369 372 376 379 382
 386 393 407 414 424 445 456 476 483 487 299 285 290 292 315 297 300 303
 282 306 307 310 332 314 324 329 333 335 336 339 340 343 345 346 348 350
 352 354 370 356 417 394 411 418 422 360 363 366 367 373 374 395 397 377
 380 381 387 390 402 408 409 412 415 416 419 421 425 426 428 427 440 388
 384 383 430 391 398 400 423 432 434 435 437 439 442 444 446 447 464

Checking for all the Classes and how many entries are there

In [None]:
print(df["action"].value_counts())


action
far_combo_2       1064
far_combo_1       1056
far_uppercut      1048
crouch_special     473
walk_left          445
walk_right          42
close_combo_1       24
close_uppercut      23
close_combo_2       16
Name: count, dtype: int64


Describing all the Classes

In [None]:
print(df.describe(include='all'))


             health            x            y   is_jumping  is_crouching  \
count   4191.000000  4191.000000  4191.000000  4191.000000   4191.000000   
unique          NaN          NaN          NaN          NaN           NaN   
top             NaN          NaN          NaN          NaN           NaN   
freq            NaN          NaN          NaN          NaN           NaN   
mean     111.780959   281.018611   184.755667     0.251730      0.103794   
std       37.375451    83.525822    15.118038     0.434059      0.305029   
min       34.000000   100.000000    90.000000     0.000000      0.000000   
25%       81.000000   211.000000   182.000000     0.000000      0.000000   
50%      114.000000   265.000000   192.000000     0.000000      0.000000   
75%      145.000000   357.000000   192.000000     1.000000      0.000000   
max      176.000000   487.000000   192.000000     1.000000      1.000000   

             move_id  enemy_health      enemy_x      enemy_y       action  
count   4.1

Applying Different Types of model to see which one Performs the best

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

# Load data
df = pd.read_csv("game_data.csv")
X = df.drop("action", axis=1)
y = df["action"]

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

# ============================ Random Forest ============================
rf = RandomForestClassifier(n_estimators=200, max_depth=20, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("\n=== Random Forest ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=le.classes_))

# ============================ XGBoost with GPU ============================
xgb = XGBClassifier(tree_method='gpu_hist', predictor='gpu_predictor',
                    use_label_encoder=False, n_estimators=200, max_depth=20, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("\n=== XGBoost (GPU) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb, target_names=le.classes_))

# ============================ MLP Classifier ============================
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
print("\n=== MLP (Neural Net) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp, target_names=le.classes_))

# ============================ Logistic Regression ============================
logreg = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs', multi_class='multinomial')
logreg.fit(X_train, y_train)
y_pred_log = logreg.predict(X_test)
print("\n=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log, target_names=le.classes_))



=== Random Forest ===
Accuracy: 0.6190677518232744
                precision    recall  f1-score   support

 close_combo_1       0.79      0.75      0.77      1052
 close_combo_2       0.85      0.69      0.76      1036
close_uppercut       0.67      0.82      0.74      1082
crouch_special       0.68      0.57      0.62      1011
   far_combo_1       0.31      0.35      0.33      1020
   far_combo_2       0.33      0.33      0.33      1071
  far_uppercut       0.34      0.31      0.32      1059
     walk_left       0.70      0.76      0.73      1044
    walk_right       0.92      0.98      0.95      1086

      accuracy                           0.62      9461
     macro avg       0.62      0.62      0.62      9461
  weighted avg       0.62      0.62      0.62      9461


=== XGBoost (GPU) ===
Accuracy: 0.6144170806468661
                precision    recall  f1-score   support

 close_combo_1       0.78      0.73      0.76      1052
 close_combo_2       0.85      0.68      0.76      1

Random Forest Shows Promising Results with best accuracy hence will tune its parameter

In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib  # for saving model and encoder

# Load data
df = pd.read_csv("game_data.csv")
X = df.drop("action", axis=1)
y = df["action"]

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Apply SMOTE to fix class imbalance
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y_encoded)

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Run GridSearchCV
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, scoring='f1_weighted', verbose=2)
grid_search.fit(X_train, y_train)

# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print evaluation metrics
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated F1 Score:", grid_search.best_score_)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Cross-Validated F1 Score: 0.6139883029840923
Test Accuracy: 0.6190677518232744

Classification Report:
                 precision    recall  f1-score   support

 close_combo_1       0.79      0.75      0.77      1052
 close_combo_2       0.85      0.69      0.76      1036
close_uppercut       0.67      0.82      0.74      1082
crouch_special       0.68      0.57      0.62      1011
   far_combo_1       0.31      0.35      0.33      1020
   far_combo_2       0.33      0.33      0.33      1071
  far_uppercut       0.34      0.31      0.32      1059
     walk_left       0.70      0.76      0.73      1044
    walk_right       0.92      0.98      0.95      1086

      accuracy                           0.62      9461
     macro avg       0.62      0.62      0.62      9461
  weighted avg       0.62      0.62      0.62      94

Since Hyper-parameter Tuning didn't improve accuracy hence applying different type of class balanacing techniques and see which gives the best accuracy using random forest as it was showing promising results

In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv("game_data.csv")
X = df.drop("action", axis=1)
y = df["action"]

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Calculate class distribution
class_counts = pd.Series(y_encoded).value_counts()
minority_classes = class_counts[class_counts < class_counts.mean()].index.tolist()

# Define resampling strategies
resampling_strategies = {
    "SMOTE": SMOTE(random_state=42),
    "ADASYN": ADASYN(sampling_strategy={label: class_counts.max() for label in minority_classes}, random_state=42),
    "SMOTEENN": SMOTEENN(random_state=42)
}

results = {}

for method_name, sampler in resampling_strategies.items():
    try:
        print(f"\n=== {method_name} ===")
        X_res, y_res = sampler.fit_resample(X, y_encoded)

        X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

        model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42, class_weight='balanced')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {acc:.4f}")
        print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
        results[method_name] = acc
    except ValueError as e:
        print(f"{method_name} failed: {e}")

# Summary
print("\n=== Summary of Accuracies ===")
for method, acc in results.items():
    print(f"{method}: {acc:.4f}")



=== SMOTE ===
Accuracy: 0.6191
Classification Report:
                 precision    recall  f1-score   support

 close_combo_1       0.79      0.75      0.77      1052
 close_combo_2       0.85      0.69      0.76      1036
close_uppercut       0.67      0.82      0.74      1082
crouch_special       0.68      0.57      0.62      1011
   far_combo_1       0.31      0.35      0.33      1020
   far_combo_2       0.33      0.33      0.33      1071
  far_uppercut       0.34      0.31      0.32      1059
     walk_left       0.70      0.76      0.73      1044
    walk_right       0.92      0.98      0.95      1086

      accuracy                           0.62      9461
     macro avg       0.62      0.62      0.62      9461
  weighted avg       0.62      0.62      0.62      9461


=== ADASYN ===
Accuracy: 0.6203
Classification Report:
                 precision    recall  f1-score   support

 close_combo_1       0.62      0.93      0.74      1063
 close_combo_2       0.87      0.69      0.

Using SMOTEENN gave us the best result hence would be using it to balance our classes and train our model

In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.combine import SMOTEENN
import joblib

# Load the dataset
df = pd.read_csv("game_data.csv")
X = df.drop("action", axis=1)
y = df["action"]

# Encode class labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Apply SMOTEENN (SMOTE + Edited Nearest Neighbors)
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X, y_encoded)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train Random Forest model
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print("=== SMOTEENN Final Model Evaluation ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save model and label encoder
joblib.dump(clf, "model.pkl")
joblib.dump(le, "label_encoder.pkl")
print(" Model saved as 'model.pkl'")
print(" Label encoder saved as 'label_encoder.pkl'")


=== SMOTEENN Final Model Evaluation ===
Accuracy: 0.9795

Classification Report:
                 precision    recall  f1-score   support

 close_combo_1       0.99      0.99      0.99       517
 close_combo_2       0.99      1.00      0.99       622
close_uppercut       0.99      0.99      0.99       504
crouch_special       0.98      0.97      0.97       347
   far_combo_1       0.82      0.79      0.80        52
   far_combo_2       0.94      0.89      0.92        92
  far_uppercut       0.72      0.74      0.73        38
     walk_left       0.95      0.98      0.97       360
    walk_right       1.00      1.00      1.00       687

      accuracy                           0.98      3219
     macro avg       0.93      0.93      0.93      3219
  weighted avg       0.98      0.98      0.98      3219


Confusion Matrix:
 [[512   3   2   0   0   0   0   0   0]
 [  1 620   1   0   0   0   0   0   0]
 [  3   4 497   0   0   0   0   0   0]
 [  0   0   0 335   0   0   0  10   2]
 [  0   0  