In [2]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [47]:
# Load the data
cheaters = np.load('csgo_dataset/cheaters/cheaters.npy')
legit = np.load('csgo_dataset/legit/legit.npy')

In [48]:
# Combine data
X = np.vstack((cheaters, legit))

In [49]:
# Extract labels (the fifth feature of the last timestep of each engagement) one label per player-engagement combo
y = X[:, :, -1, 4]

In [50]:
y.shape

(12000, 30)

In [51]:
# Remove labels from features
X = X[:, :, :, :4]

In [52]:
X.shape

(12000, 30, 192, 4)

In [54]:
def extract_features(X):
    """
    Extract features from the engagement data
    X shape: (192, 4) - single engagement
    """
    # Mean of each variable over time
    mean_features = np.mean(X, axis=0)
    
    # Standard deviation of each variable over time
    std_features = np.std(X, axis=0)
    
    # Max absolute change in yaw and pitch
    max_yaw_change = np.max(np.abs(np.diff(X[:, 0])))
    max_pitch_change = np.max(np.abs(np.diff(X[:, 1])))
    
    return np.hstack([mean_features, std_features, max_yaw_change, max_pitch_change])

# Process players
n_players = X.shape[0]
n_features_per_engagement = 10  # 4 means + 4 stds + 2 max changes
X_features = np.zeros((n_players, 30 * n_features_per_engagement))

# Extract features for each player's engagements
for player in range(n_players):
    player_features = []
    for engagement in range(30):
        # Process one engagement
        engagement_features = extract_features(X[player, engagement])
        player_features.extend(engagement_features)
    X_features[player] = np.array(player_features)
    
# Take first label for each player (since they're all the same)
y_player = y[:, 0]

In [56]:
print("Shapes before CV split:")
print("X_features shape:", X_features.shape)
print("y shape:", y_player.shape)
print("Class distribution:", np.unique(y_player, return_counts=True))

Shapes before CV split:
X_features shape: (12000, 300)
y shape: (12000,)
Class distribution: (array([0., 1.], dtype=float32), array([11601,   399]))


In [57]:
# Create pipeline
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])


In [59]:
# Perform cross-validation
for fold, (train_idx, val_idx) in enumerate(cv.split(X_features, y_player, groups=np.arange(n_players))):
    X_train, X_val = X_features[train_idx], X_features[val_idx]
    y_train, y_val = y_player[train_idx], y_player[val_idx]
    
    print(f"\nFold {fold + 1}:")
    print(f"Train set class distribution: {np.unique(y_train, return_counts=True)}")
    print(f"Validation set class distribution: {np.unique(y_val, return_counts=True)}")
    
    # Fit and predict
    rf_pipeline.fit(X_train, y_train)
    y_pred = rf_pipeline.predict(X_val)
    
    # Get classification report
    print(f"\nFold {fold + 1} Results:")
    print(classification_report(y_val, y_pred))
    
    report = classification_report(y_val, y_pred, output_dict=True)
    fold_reports.append(report)


Fold 1:
Train set class distribution: (array([0., 1.], dtype=float32), array([9284,  316]))
Validation set class distribution: (array([0., 1.], dtype=float32), array([2317,   83]))

Fold 1 Results:
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98      2317
         1.0       0.22      0.02      0.04        83

    accuracy                           0.96      2400
   macro avg       0.59      0.51      0.51      2400
weighted avg       0.94      0.96      0.95      2400


Fold 2:
Train set class distribution: (array([0., 1.], dtype=float32), array([9279,  321]))
Validation set class distribution: (array([0., 1.], dtype=float32), array([2322,   78]))

Fold 2 Results:
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98      2322
         1.0       0.45      0.06      0.11        78

    accuracy                           0.97      2400
   macro avg       0.71      0.53      0.55      2400
wei

In [76]:
# Calculate average metrics across folds
print("\nAveraged metrics across all folds:")
metrics = ['precision', 'recall', 'f1-score']
classes = ['0.0', '1.0']  # non-cheater and cheater

for metric in metrics:
    for cls in classes:
        values = [fold[cls][metric] * 100 for fold in fold_reports]
        mean_value = np.mean(values)
        std_value = np.std(values)
        print(f"Class {cls} {metric}: {mean_value:.1f} (+/- {std_value:.1f})")

# Calculate average accuracy
accuracies = [fold['accuracy'] * 100 for fold in fold_reports]
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print(f"Average accuracy: {mean_accuracy:.1f}% (+/- {std_accuracy:.1f}%)")


Averaged metrics across all folds:
Class 0.0 precision: 96.8 (+/- 0.1)
Class 1.0 precision: 28.4 (+/- 11.0)
Class 0.0 recall: 99.8 (+/- 0.1)
Class 1.0 recall: 2.8 (+/- 1.9)
Class 0.0 f1-score: 98.2 (+/- 0.1)
Class 1.0 f1-score: 5.0 (+/- 3.3)
Average accuracy: 96.5% (+/- 0.2%)


In [63]:
# Get feature importances from the last fold's model
feature_importances = rf_pipeline.named_steps['rf'].feature_importances_
top_n = 10
top_indices = np.argsort(feature_importances)[-top_n:]
print("\nTop 10 most important features:")
for idx in reversed(top_indices):
    engagement_num = idx // n_features_per_engagement
    feature_num = idx % n_features_per_engagement
    feature_type = ['mean_1', 'mean_2', 'mean_3', 'mean_4', 
                   'std_1', 'std_2', 'std_3', 'std_4',
                   'max_yaw_change', 'max_pitch_change'][feature_num]
    print(f"Engagement {engagement_num}, {feature_type}: {feature_importances[idx]:.4f}")


Top 10 most important features:
Engagement 1, mean_2: 0.0131
Engagement 16, mean_3: 0.0107
Engagement 22, std_1: 0.0097
Engagement 2, mean_3: 0.0096
Engagement 4, std_3: 0.0086
Engagement 25, std_4: 0.0084
Engagement 16, std_3: 0.0083
Engagement 23, std_1: 0.0083
Engagement 7, std_1: 0.0079
Engagement 10, mean_4: 0.0077
