# Classification Model with XGBoost

This notebook implements a classification model using XGBoost with hyperparameter tuning.

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, f1_score
import os

## Load Datasets

In [2]:
# Load datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Display the first few rows of the training data
print("Training data shape:", train_df.shape)
train_df.head()

Training data shape: (73446, 36)


Unnamed: 0,id,remaining_time,B_score,A_score,location,device_active,B_health,A_health,B_armor,A_armor,...,A_Sniper,A_Heavy,A_Rifle,B_total_grenades,A_total_grenades,player_footstep_volume,voice_line_frequency,map_anomaly_detected,chicken_deaths,target
0,1,157.25,0.0,0.0,de_nuke,False,500.0,500.0,400.0,300.0,...,0.0,0.0,0.0,0.0,1.0,,2.374536,0.0,0.0,B
1,2,,6.0,5.0,de_mirage,False,500.0,500.0,0.0,482.0,...,1.0,0.0,4.0,0.0,3.0,1.23256,7.624727,0.0,2.0,A
2,4,49.95,0.0,7.0,de_inferno,,462.0,,498.0,340.0,...,1.0,0.0,3.0,4.0,5.0,0.793854,2.1622,0.0,1.0,B
3,5,,0.0,0.0,de_inferno,False,500.0,500.0,300.0,300.0,...,0.0,0.0,0.0,0.0,2.0,0.90125,4.549794,0.0,2.0,B
4,6,,12.0,8.0,de_inferno,True,184.0,391.0,200.0,400.0,...,1.0,0.0,,2.0,1.0,0.845802,3.379706,0.0,0.0,A


## Feature Engineering on Training Data

In [3]:
# Map target variable to numeric values
label_map = {'A': 0, 'B': 1}
train_df['target'] = train_df['target'].map(label_map)
train_df = train_df.dropna(subset=['target'])

# Create new features
# Score difference between Team A and B
train_df['score_diff'] = train_df['A_score'] - train_df['B_score']

# Health difference between teams
train_df['health_diff'] = train_df['A_health'] - train_df['B_health']

# Armor difference between teams
train_df['armor_diff'] = train_df['A_armor'] - train_df['B_armor']

# Display the engineered features
train_df[['score_diff', 'health_diff', 'armor_diff']].describe()

Unnamed: 0,score_diff,health_diff,armor_diff
count,68434.0,60314.0,71685.0
mean,0.069994,-9.458699,-14.874772
std,4.152878,95.827128,205.233296
min,-14.0,-500.0,-500.0
25%,-3.0,-18.0,-107.0
50%,0.0,0.0,0.0
75%,3.0,0.0,97.0
max,15.0,475.0,500.0


## Preprocessing

In [4]:
# Categorical columns to encode
categorical_cols = ['location', 'device_active']
# Make sure these are strings
train_df[categorical_cols] = train_df[categorical_cols].astype(str)

# Label encoding for categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    label_encoders[col] = le
    
# Show encoded categorical columns
train_df[categorical_cols].head()

Unnamed: 0,location,device_active
0,4,0
1,3,0
2,2,2
3,2,0
4,2,1


In [5]:
# Prepare feature matrix and target vector
# Drop id and target columns from features
X = train_df.drop(['id', 'target'], axis=1)
y = train_df['target']

# Impute missing values (using mean strategy)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Train/Validation split
X_train, X_val, y_train, y_val = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)

Training set shape: (58756, 37)
Validation set shape: (14690, 37)


## Hyperparameter Tuning with GridSearchCV

In [6]:
# Define parameter grid for XGBoost
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Use F1-score as our evaluation metric
f1_scorer = make_scorer(f1_score)

# Initialize XGBClassifier
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)

In [7]:
# Grid Search (Note: This cell may take some time to execute)
grid_search = GridSearchCV(estimator=xgb_clf,
                           param_grid=param_grid,
                           scoring=f1_scorer,
                           cv=3,
                           verbose=1,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 8, 'n_estimators': 300, 'subsample': 1.0}


In [8]:
# Best model from grid search
best_model = grid_search.best_estimator_

# Evaluate on validation set
val_preds = best_model.predict(X_val)
val_f1 = f1_score(y_val, val_preds)
print("Validation F1-Score:", val_f1)

Validation F1-Score: 0.7901303799242956


## Process Test Data with Same Transformations

In [9]:
# Display the first few rows of the test data
print("Test data shape:", test_df.shape)
test_df.head()

Test data shape: (48964, 35)


Unnamed: 0,id,remaining_time,B_score,A_score,location,device_active,B_health,A_health,B_armor,A_armor,...,A_SMG,A_Sniper,A_Heavy,A_Rifle,B_total_grenades,A_total_grenades,player_footstep_volume,voice_line_frequency,map_anomaly_detected,chicken_deaths
0,47053,,9.0,13.0,de_train,False,500.0,500.0,500.0,500.0,...,0.0,1.0,,4.0,16.0,15.0,0.89962,1.188077,0.0,1.0
1,28740,114.92,12.0,13.0,de_inferno,,,500.0,,500.0,...,0.0,1.0,0.0,4.0,19.0,16.0,0.974823,5.625995,0.0,2.0
2,92746,94.95,10.0,15.0,de_dust2,False,500.0,500.0,481.0,500.0,...,0.0,1.0,0.0,4.0,12.0,12.0,1.084425,7.142952,0.0,
3,60470,74.8,15.0,15.0,de_mirage,False,,384.0,277.0,400.0,...,0.0,0.0,0.0,4.0,3.0,7.0,0.813555,3.760457,0.0,0.0
4,42953,54.93,1.0,3.0,de_vertigo,False,461.0,300.0,485.0,0.0,...,0.0,0.0,0.0,0.0,10.0,0.0,0.96272,3.647533,0.0,1.0


In [10]:
# Apply same feature engineering to test data
test_df['score_diff'] = test_df['A_score'] - test_df['B_score']
test_df['health_diff'] = test_df['A_health'] - test_df['B_health']
test_df['armor_diff'] = test_df['A_armor'] - test_df['B_armor']

# Encode categorical columns in test data
test_df[categorical_cols] = test_df[categorical_cols].astype(str)
for col in categorical_cols:
    test_df[col] = label_encoders[col].transform(test_df[col])

# Prepare test features and impute missing values
X_test = test_df.drop(['id'], axis=1)
X_test_imputed = imputer.transform(X_test)

## Final Predictions and Submission File

In [11]:
# Predict using the best model from grid search
test_preds_numeric = best_model.predict(X_test_imputed)

# Convert predictions back to labels ('A' or 'B')
inverse_label_map = {0: 'A', 1: 'B'}
test_preds = [inverse_label_map[pred] for pred in test_preds_numeric]

# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'target': test_preds
})

# Display the first few rows of the submission file
submission_df.head()

Unnamed: 0,id,target
0,47053,B
1,28740,B
2,92746,B
3,60470,A
4,42953,B


In [12]:
# Save submission file
submission_path = "classification_submission.csv"
submission_df.to_csv(submission_path, index=False)

# Verify that file is created
if os.path.exists(submission_path):
    print("Submission file created:", submission_path)
    print(submission_df.head())
else:
    print("Error: Submission file not found.")

Submission file created: classification_submission.csv
      id target
0  47053      B
1  28740      B
2  92746      B
3  60470      A
4  42953      B
