# Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Data exploration and preparation

## Loading the data

In [2]:
df_train = pd.read_csv('data/rocketskillshots_train.csv')
df_test = pd.read_csv('data/rocketskillshots_test.csv')

In [3]:
df_train.head()

Unnamed: 0,id,window_id,BallAcceleration,Time,DistanceWall,DistanceCeil,DistanceBall,PlayerSpeed,BallSpeed,up,...,slow_skew,goal_skew,left_skew,boost_skew,camera_skew,down_skew,right_skew,slide_skew,jump_skew,label
0,0,,0.0,2.205022,3817.38,2013.0,,150959.239888,145648.06166,0.0,...,5.656854,3.795046,0.0,1.428526,3.795046,0.0,-1.681134,1.428526,1.021592,6
1,0,0.0,1636.798772,0.0,3498.01,2012.98,,104267.426232,99035.849337,0.0,...,,,,,,,,,,6
2,0,1.0,3198.029397,0.138893,3494.08,2012.98,229.89678,124248.031988,102233.878734,0.0,...,,,,,,,,,,6
3,0,2.0,0.0,0.173617,3494.08,2012.98,,124248.031988,102968.35899,0.0,...,,,,,,,,,,6
4,0,3.0,9914.766242,0.31251,3500.08,2012.98,,115248.016009,112883.125231,0.0,...,,,,,,,,,,6


In [4]:
df_test.head()

Unnamed: 0,id,window_id,BallAcceleration,Time,DistanceWall,DistanceCeil,DistanceBall,PlayerSpeed,BallSpeed,up,...,accelerate_skew,slow_skew,goal_skew,left_skew,boost_skew,camera_skew,down_skew,right_skew,slide_skew,jump_skew
0,1,,-440.3819,2.639636,1615.84,1038.69,687.146769,139614.040931,109317.500731,0.0,...,1.519583,6.244998,1.160852,3.303881,1.519583,2.725964,0.0,0.0,0.160545,1.160852
1,1,0.0,0.0,0.0,39.78,1704.5,,141332.104937,,0.0,...,,,,,,,,,,
2,1,1.0,127059.24071,0.13895,36.84,1599.63,,130602.701278,160745.080242,0.0,...,,,,,,,,,,
3,1,2.0,-7205.559291,0.27795,36.03,,,135231.763876,153539.520951,0.0,...,,,,,,,,,,
4,1,3.0,-3283.645865,0.3474,,,,130547.576036,150255.875086,0.0,...,,,,,,,,,,


## Data exploration

First, let's check for missing values in the dataset.

In [5]:
df_train.isna().sum()

id                          0
window_id                 178
BallAcceleration           45
Time                        0
DistanceWall               93
DistanceCeil              116
DistanceBall             3112
PlayerSpeed                 0
BallSpeed                  36
up                          0
accelerate                  0
slow                        0
goal                        0
left                        0
boost                       0
camera                      0
down                        0
right                       0
slide                       0
jump                        0
BallAcceleration_skew    3959
Time_skew                3959
DistanceWall_skew        3959
DistanceCeil_skew        3959
DistanceBall_skew        3959
PlayerSpeed_skew         3959
BallSpeed_skew           3959
up_skew                  3959
accelerate_skew          3959
slow_skew                3959
goal_skew                3959
left_skew                3959
boost_skew               3959
camera_ske

From this overview, we see that all *\*\_skew* parameters (eg. BallAcceleration_skew, Time_skew, etc.) have a large number of missing values (only 178 non-null values per parameter).

Aside from these, the DistanceBall parameter has a very large number of missing values (3112 null values).

Most other parameters have either none or a small number of missing values:
- *window\_id*: 178 null values
- *BallAcceleration*: 45 null values
- *DistanceWall*: 93 null values
- *DistanceCeil*: 116 null values
- *DistanceBall*: 3112 null values (!)
- *BallSpeed*: 36 null values
- *\*\_skew*: 3959 null values (!)
- all other parameters: 0 null values

Usually, the missing values are somehow inferred (e.g. by setting them to the mean/median value, or to zero) as to not lose the data that is available, however, given such a large amount of missing values, the *DistanceBall* parameter and all *\*\_skew* parameters will be excluded from further analysis and model training.

In [6]:
df_train = df_train.loc[:,~df_train.columns.str.endswith('_skew')]
df_train.drop('DistanceBall', axis=1)

Unnamed: 0,id,window_id,BallAcceleration,Time,DistanceWall,DistanceCeil,PlayerSpeed,BallSpeed,up,accelerate,slow,goal,left,boost,camera,down,right,slide,jump,label
0,0,,0.000000,2.205022,3817.38,2013.00,150959.239888,145648.061660,0.0,0.0,0,0,0,0,0,0,1,0.0,0.0,6
1,0,0.0,1636.798772,0.000000,3498.01,2012.98,104267.426232,99035.849337,0.0,0.0,0,0,0,0,0,0,1,0.0,1.0,6
2,0,1.0,3198.029397,0.138893,3494.08,2012.98,124248.031988,102233.878734,0.0,0.0,0,0,0,1,0,0,1,0.0,1.0,6
3,0,2.0,0.000000,0.173617,3494.08,2012.98,124248.031988,102968.358990,0.0,0.0,0,0,0,1,0,0,0,0.0,0.0,6
4,0,3.0,9914.766242,0.312510,3500.08,2012.98,115248.016009,112883.125231,0.0,0.0,0,0,0,0,0,0,1,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4132,297,10.0,0.000000,2.057068,3038.61,1945.02,219368.460358,0.000000,0.0,0.0,0,0,0,0,0,0,1,1.0,0.0,-1
4133,297,11.0,0.000000,2.126637,3082.31,,220365.796688,0.000000,0.0,0.0,0,0,0,0,1,0,1,1.0,0.0,-1
4134,297,12.0,0.000000,2.196237,3126.01,1986.26,221520.167700,0.000000,0.0,1.0,0,0,0,1,0,0,1,1.0,0.0,-1
4135,297,13.0,0.000000,2.300599,3160.95,2005.86,221196.340110,0.000000,0.0,1.0,0,0,0,1,1,0,1,1.0,0.0,-1


In [7]:
df_test = df_test.loc[:,~df_test.columns.str.endswith('_skew')]
df_test.drop('DistanceBall', axis=1)

Unnamed: 0,id,window_id,BallAcceleration,Time,DistanceWall,DistanceCeil,PlayerSpeed,BallSpeed,up,accelerate,slow,goal,left,boost,camera,down,right,slide,jump
0,1,,-440.381900,2.639636,1615.84,1038.69,139614.040931,109317.500731,0.0,0.0,0,0,0,0,0,0,1,0.0,0.0
1,1,0.0,0.000000,0.000000,39.78,1704.50,141332.104937,,0.0,0.0,0,0,0,1,0,0,1,0.0,1.0
2,1,1.0,127059.240710,0.138950,36.84,1599.63,130602.701278,160745.080242,0.0,1.0,0,0,0,1,0,0,1,0.0,1.0
3,1,2.0,-7205.559291,0.277950,36.03,,135231.763876,153539.520951,0.0,1.0,0,0,0,1,0,0,1,0.0,0.0
4,1,3.0,-3283.645865,0.347400,,,130547.576036,150255.875086,0.0,0.0,0,0,0,0,0,0,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3048,293,6.0,0.000000,1.704700,1986.28,2012.99,94607.180436,0.000000,0.0,0.0,0,0,0,0,0,0,1,0.0,0.0
3049,293,7.0,0.000000,1.913447,1919.30,2012.99,120126.993449,0.000000,0.0,0.0,0,0,0,0,0,0,1,1.0,0.0
3050,293,8.0,0.000000,2.330900,1980.96,2012.99,156419.448676,0.000000,1.0,0.0,0,0,0,0,0,0,1,1.0,0.0
3051,293,9.0,0.000000,2.504900,2069.08,2012.99,173721.605775,0.000000,0.0,0.0,0,0,0,0,0,0,1,1.0,0.0


There are some other columns with missing values, they will be filled by setting the values to zero.

In [8]:
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

Next, let's get an overview of the values in the training dataset's columns.

In [9]:
for col in df_train.columns:
    print(f"{col} value counts:")
    print(df_train[col].value_counts())
    print("\n")

id value counts:
id
134    65
155    50
278    49
146    46
265    45
       ..
57      9
87      8
255     7
244     5
231     5
Name: count, Length: 178, dtype: int64


window_id value counts:
window_id
0.0     356
1.0     178
2.0     178
3.0     178
4.0     176
       ... 
59.0      1
60.0      1
61.0      1
62.0      1
63.0      1
Name: count, Length: 64, dtype: int64


BallAcceleration value counts:
BallAcceleration
 0.000000       763
-1393.000000      4
-57.023293        2
 2157.000000      2
-2503.000000      2
               ... 
 1981.432894      1
 716.754644       1
 3070.897867      1
 5204.160485      1
-6024.047940      1
Name: count, Length: 3296, dtype: int64


Time value counts:
Time
0.000000    178
0.382700     12
0.347900      9
0.034800      7
0.417500      6
           ... 
0.835022      1
1.009003      1
1.182963      1
1.704834      1
2.474537      1
Name: count, Length: 3540, dtype: int64


DistanceWall value counts:
DistanceWall
0.00       112
35.99       26
3

Here, we can easily see the distribution of values in the categorical parameters (from the parameter *up* onward). Most of these parameters have an unbalanced distribution. This could possibly mean that they can be used for easier differentiation of trickshots (i.e. if a parameter has one of the more rare values, it is likely that the trickshot belongs to a specific category).

The goal of the task is to predict the *label* column, which is a categorical property with 7 possible values.

In [10]:
df_train.describe()

Unnamed: 0,id,window_id,BallAcceleration,Time,DistanceWall,DistanceCeil,DistanceBall,PlayerSpeed,BallSpeed,up,...,slow,goal,left,boost,camera,down,right,slide,jump,label
count,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,...,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0,4137.0
mean,143.800822,12.346628,-3773.386687,2.021066,3695.554567,1647.599467,267.927304,150372.747353,122822.469763,0.044356,...,0.033358,0.119652,0.01692,0.254774,0.126904,0.012328,0.924583,0.263113,0.403312,3.35533
std,84.764944,10.187281,46138.034227,1.729769,13528.958862,602.30926,895.195107,49072.15042,72784.051268,0.205468,...,0.17959,0.324593,0.128989,0.435787,0.332905,0.110357,0.264095,0.440171,0.490437,2.588084
min,0.0,0.0,-298303.227932,0.0,0.0,0.0,0.0,27.037012,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
25%,70.0,4.0,-1814.0,0.731333,1052.11,1360.59,0.0,122054.835787,85399.2694,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
50%,146.0,10.0,0.0,1.598579,3106.08,1964.6,0.0,148003.167986,128429.573051,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0
75%,214.0,18.0,945.444734,2.8879,3754.28,2013.0,0.0,185378.187706,168433.068668,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,6.0
max,297.0,63.0,287269.750948,13.470363,223799.815054,4039.97,9194.156158,229999.958811,309832.16491,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0


This output shows us the distribution of values of continuous parameters.

# Model creation

In the process of finding the best prediction model for this task, several different machine learning approaches will be taken into consideration:
- Logistic regression
- Decision tree
- Random forest
- Multi-layer perceptron
- Model ensembles

## Logistic regression

In [11]:
# For training the model, we need to define the feature columns and the target column
feature_cols = list(df_train.columns)
feature_cols.remove('label')

X = df_train[feature_cols]
y = df_train['label']

# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Training a simple logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Inference on the validation set
y_pred = model.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

Validation accuracy: 0.3538647342995169
Confusion matrix:
 [[  9   0  10  10  47  41   0]
 [  2   1  10   1  73  12   0]
 [  4   0  47   9  28  49   0]
 [  6   3   5  10  27  17   0]
 [  9   0  21   6 148  11   0]
 [ 13   0  41   4   7  78   0]
 [  6   0  11   1   0  51   0]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


This model gave a very low score on the validation set and the confusion matrix showed poor differentiation of most classes. This could be due to the fact that multiple rows influence a single trickshot ID.

For this reason, an aggregate dataset will also be trained using logistic regression, to see if there is any improvement.

The rows of the train dataset will be aggregated by ID using majority voting for categorical columns and mean for continuous columns.

In [12]:
categorical_columns = ['window_id', 'up', 'down', 'left', 'right', 'jump', 'slide', 'boost', 'camera', 'label', 'goal', 'accelerate', 'slow']
continuous_columns = ['BallAcceleration', 'Time', 'DistanceWall', 'DistanceCeil', 'PlayerSpeed', 'BallSpeed']

# Aggregate the rows
df_train_aggregated = df_train.groupby('id').agg(lambda x:x.value_counts().index[0] if x.name in categorical_columns else x.mean())
df_train_aggregated.reset_index(inplace=True)

# Train the model on the aggregated dataset
X = df_train_aggregated[feature_cols]
y = df_train_aggregated['label']

# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Training a simple logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Inference on the validation set
y_pred = model.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

Validation accuracy: 0.3888888888888889
Confusion matrix:
 [[3 0 1 1 0 1 0]
 [1 0 1 0 0 0 0]
 [0 0 5 0 0 2 0]
 [2 0 0 2 0 0 0]
 [0 0 0 0 2 0 0]
 [0 0 9 0 0 2 0]
 [0 0 4 0 0 0 0]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


This model gave only slightly better results, which are still not suficient for any real use, so other models will be considered.

In [13]:
# Aggregate the rows
df_test_aggregated = df_test.groupby('id').agg(lambda x:x.value_counts().index[0] if x.name in categorical_columns else x.mean())
df_test_aggregated.reset_index(inplace=True)

# Make predictions on the test dataset
X_test = df_test_aggregated[feature_cols]
y_test = model.predict(X_test)

df_test_logistic_regression = pd.DataFrame({'ID': df_test_aggregated['id'], 'label': y_test})

# Save the predictions to a CSV file
df_test_logistic_regression.to_csv('data/rocketskillshots_test_logistic_regression.csv', index=False)

## Decision tree

In [14]:
# Splitting the data into training and validation sets
X = df_train[feature_cols]
y = df_train['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple decision tree model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Inference on the validation set
y_pred = model.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))


Validation accuracy: 0.9685990338164251
Confusion matrix:
 [[112   2   0   0   1   1   1]
 [  0  98   0   0   0   1   0]
 [  4   2 130   0   0   0   1]
 [  2   0   1  65   0   0   0]
 [  2   0   0   0 193   0   0]
 [  1   0   0   0   0 141   1]
 [  0   0   5   0   0   1  63]]


This model gave significantly better results than logistic regression, showing promise in that method. The accuracy is very high even without hyperparameter-tuning, and in the confusion matrix only a few misclassifications are seen in each class, showing little-to-no bias.

In [15]:
# Train the model on the entire train dataset
model = DecisionTreeClassifier()
model.fit(X, y)

In [16]:
# Make predictions on the test dataset
y_pred_test = model.predict(df_test[feature_cols])

# Create the submission dataframe
df_test_decision_tree = pd.DataFrame({'ID': df_test['id'], 'label': y_pred_test})

# Remove duplicate ID rows via majority voting
df_test_decision_tree = df_test_decision_tree.groupby('ID').agg(lambda x:x.value_counts().index[0])
df_test_decision_tree.reset_index(inplace=True)

# Save the predictions to a CSV file
df_test_decision_tree.to_csv('data/rocketskillshots_test_decision_tree.csv', index=False)

Because of the model's good performance, hyperparameter tuning will be performed with the goal of further increasing its score.

In [17]:
# Hyperparameter tuning for Decision Tree
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 10, 15, None],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7]
}

# Perform the grid search
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

# Train the best model
best_model = DecisionTreeClassifier(**best_params)
best_model.fit(X_train, y_train)

# Inference on the validation set
y_pred = best_model.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

Validation accuracy: 0.9625603864734299
Confusion matrix:
 [[111   2   1   0   1   1   1]
 [  0  95   3   0   0   0   1]
 [  4   2 130   0   0   0   1]
 [  2   0   1  65   0   0   0]
 [  2   1   0   0 192   0   0]
 [  1   0   0   0   0 141   1]
 [  0   0   5   0   0   1  63]]


In [18]:
# Make predictions on the test dataset
y_test = best_model.predict(df_test[feature_cols])

# Create the submission dataframe
df_test_decision_tree = pd.DataFrame({'ID': df_test['id'], 'label': y_test})

# Remove duplicate ID rows via majority voting
df_test_decision_tree = df_test_decision_tree.groupby('ID').agg(lambda x:x.value_counts().index[0])
df_test_decision_tree.reset_index(inplace=True)

# Save the predictions to a CSV file
df_test_decision_tree.to_csv('data/rocketskillshots_test_decision_tree_tuned.csv', index=False)

The score did not change significantly after hyperparameter tuning. However, an accuracy of over 96% is still very good for this type of machine learning model.

## Random forest

In [19]:
# Splitting the data into training and validation sets
X = df_train[feature_cols]
y = df_train['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Training a simple random forest model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Inference on the validation set
y_pred = model.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

Validation accuracy: 0.9396135265700483
Confusion matrix:
 [[ 97  10   3   0   5   1   1]
 [  1  92   1   0   5   0   0]
 [  1   2 130   0   2   2   0]
 [  0   0   2  64   0   2   0]
 [  0   1   0   0 194   0   0]
 [  0   0   1   0   2 135   5]
 [  0   0   0   0   0   3  66]]


The random forest model also gave very promising results while using the default parameters, and the confusion matrix is similar to that of the decision tree model.

In [20]:
# Train model on entire training dataset
model = RandomForestClassifier()
model.fit(X, y)

# Make predictions on the test dataset
X_test = df_test[feature_cols]
y_test = model.predict(X_test)

# Create the submission dataframe
df_test_random_forest = pd.DataFrame({'ID': df_test['id'], 'label': y_test})

# Remove duplicate ID rows via majority voting
df_test_random_forest = df_test_random_forest.groupby('ID').agg(lambda x:x.value_counts().index[0])
df_test_random_forest.reset_index(inplace=True)

# Save the predictions to a CSV file
df_test_random_forest.to_csv('data/rocketskillshots_test_random_forest.csv', index=False)

Hyperparameter tuning was also performed for the random forest classifier.

In [21]:
# Create the random grid
random_grid = {'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
               'max_features': ['auto', 'sqrt'],
               'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

# Use the random grid to search for best hyperparameters
model = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, search across 100 different combinations
model_tuned = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
model_tuned.fit(X, y)

model_tuned.best_params_
best_params = model_tuned.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


123 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
57 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Jan\anaconda3\envs\rocketleagueclassifier\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Jan\anaconda3\envs\rocketleagueclassifier\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\Jan\anaconda3\envs\rocketleagueclassifier\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Jan\anaconda3\envs\rocketleagueclassifier\Lib\site-packages\

Next, the tuned model is infered with the test dataset values.

In [22]:
# test the model on the validation set
y_pred = model_tuned.predict(X_val)

print(accuracy_score(y_val, y_pred))

best_model = RandomForestClassifier(**best_params)
best_model.fit(X, y)

# Make predictions on the test dataset
y_test = best_model.predict(X_test)

# Create the submission dataframe
df_test_random_forest = pd.DataFrame({'ID': df_test['id'], 'label': y_test})

# Remove duplicate ID rows via majority voting
df_test_random_forest = df_test_random_forest.groupby('ID').agg(lambda x:x.value_counts().index[0])
df_test_random_forest.reset_index(inplace=True)

# Save the predictions to a CSV file
df_test_random_forest.to_csv('data/rocketskillshots_test_random_forest_tuned.csv', index=False)

1.0


The random forest classifier was also trained on the aggregated dataset, as was done for logistic regression. It, however didn't give adequate results.

In [23]:
# Aggregate the rows of the train dataset by ID using the majority voting method for categorical columns and the mean for continuous columns.
categorical_columns = ['window_id', 'up', 'down', 'left', 'right', 'jump', 'slide', 'boost', 'camera', 'label', 'goal', 'accelerate', 'slow']
continuous_columns = ['BallAcceleration', 'Time', 'DistanceWall', 'DistanceCeil', 'PlayerSpeed', 'BallSpeed']

df_train_aggregated = df_train.groupby('id').agg(lambda x:x.value_counts().index[0] if x.name in categorical_columns else x.mean())
df_train_aggregated.reset_index(inplace=True)

# Train a random forest model on the aggregated dataset
X = df_train_aggregated[feature_cols]
y = df_train_aggregated['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

Validation accuracy: 0.6388888888888888
Confusion matrix:
 [[2 0 1 1 0 2 0]
 [0 0 0 0 2 0 0]
 [0 0 6 0 0 0 1]
 [0 0 0 4 0 0 0]
 [0 0 0 0 2 0 0]
 [0 0 2 0 0 7 2]
 [0 0 1 0 0 1 2]]


In the initial data preprocessing, the missing values (NaNs) were replaced with zeros. This however is only one of the possible ways of dealing with missing values.

So, next, the missing values were filled with the column's mean before training the random forest classifier again.

In [24]:
# Load the train and test datasets
df_train_interpolated = pd.read_csv('data/rocketskillshots_train.csv')
df_test_interpolated = pd.read_csv('data/rocketskillshots_test.csv')

# Drop the skew columns
df_train_interpolated = df_train_interpolated.loc[:,~df_train_interpolated.columns.str.endswith('_skew')]
df_test_interpolated = df_test_interpolated.loc[:,~df_test_interpolated.columns.str.endswith('_skew')]

# Drop the DistanceBall column
df_train_interpolated.drop('DistanceBall', axis=1, inplace=True)
df_test_interpolated.drop('DistanceBall', axis=1, inplace=True)

# Fill missing values with the mean of the column
df_train_interpolated.fillna(df_train_interpolated.mean(), inplace=True)
df_test_interpolated.fillna(df_test_interpolated.mean(), inplace=True)

In [25]:
# Splitting the data into training and validation sets
feature_cols = list(df_train_interpolated.columns)
feature_cols.remove('label')

X = df_train_interpolated[feature_cols]
y = df_train_interpolated['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Training a simple random forest model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Inference on the validation set
y_pred = model.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

Validation accuracy: 0.9565217391304348
Confusion matrix:
 [[104   7   1   0   3   1   1]
 [  2  95   2   0   0   0   0]
 [  3   0 131   0   1   1   1]
 [  0   0   2  65   0   1   0]
 [  1   0   1   0 191   0   2]
 [  0   0   0   0   3 139   1]
 [  0   0   0   0   0   2  67]]


In [26]:
# Train model on entire training dataset
model = RandomForestClassifier()
model.fit(X, y)

# Make predictions on the test dataset
X_test = df_test_interpolated[feature_cols]
y_test = model.predict(X_test)

# Create the submission dataframe
df_test_random_forest = pd.DataFrame({'ID': df_test_interpolated['id'], 'label': y_test})

# Remove duplicate ID rows via majority voting
df_test_random_forest = df_test_random_forest.groupby('ID').agg(lambda x:x.value_counts().index[0])
df_test_random_forest.reset_index(inplace=True)

# Save the predictions to a CSV file
df_test_random_forest.to_csv('data/rocketskillshots_test_random_forest_interpolated.csv', index=False)

This way of dealing with missing values resulted in similar performance, both on the validation set here, and on the test dataset on Kaggle.

## Multi-layer perceptron

In [27]:
# Train-test split
X = df_train[feature_cols]
y = df_train['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = MLPClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

Validation accuracy: 0.357487922705314
Confusion matrix:
 [[ 37  13   2   6  39  19   1]
 [ 17  18   2   5  43   5   9]
 [ 22   6   5   4  49  29  22]
 [  6  11   0  29   6   9   7]
 [ 21  11   6   4 118  30   5]
 [ 10   0   0   1  26  72  34]
 [  6   1   0   0   5  40  17]]


The neural network gave a very low accuracy and the confusion matrix showed poor differentiation of classes.

It was also trained on the aggregate dataset (in the next cell), but the results were similar.

Consequently, this model was not further tuned.

In [28]:
# Train a neural network model on the aggregated dataset
X = df_train_aggregated[feature_cols]
y = df_train_aggregated['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = MLPClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

Validation accuracy: 0.2222222222222222
Confusion matrix:
 [[ 0  0  5  0  0  1  0]
 [ 0  0  2  0  0  0  0]
 [ 0  0  7  0  0  0  0]
 [ 1  0  1  1  1  0  0]
 [ 0  0  2  0  0  0  0]
 [ 0  0 11  0  0  0  0]
 [ 0  0  4  0  0  0  0]]


## Model ensembles

Model ensembles can be a good way to create a system capable of getting high accuracy.

Two model ensemble architectures were tested:
1. logistic regression model + decision tree classifier + random forest classifier,
2. three different random forest classifiers.

In [29]:
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier()

model = VotingClassifier(estimators=[('lr', model1), ('dt', model2), ('rf', model3)], voting='hard')

# Train-test split
X = df_train[feature_cols]
y = df_train['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print()
print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

# Train ensemble model on the entire train dataset
X = df_train[feature_cols]
y = df_train['label']

model.fit(X, y)

# Make predictions on the test dataset
X_test = df_test[feature_cols]
y_test = model.predict(X_test)

df_test_ensemble = pd.DataFrame({'ID': df_test['id'], 'label': y_test})

# Remove duplicate ID rows via majority voting
df_test_ensemble = df_test_ensemble.groupby('ID').agg(lambda x:x.value_counts().index[0])
df_test_ensemble.reset_index(inplace=True)

df_test_ensemble.to_csv('data/rocketskillshots_test_ensemble_1.csv', index=False)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Validation accuracy: 0.9420289855072463
Confusion matrix:
 [[109   0   2   0   5   1   0]
 [  2  93   0   0   4   0   0]
 [  4   5 126   0   1   1   0]
 [  2   0   2  63   0   1   0]
 [  3   0   1   0 191   0   0]
 [  2   0   3   0   0 137   1]
 [  2   0   6   0   0   0  61]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
# Try another ensemble model
from sklearn.ensemble import VotingClassifier

model1 = RandomForestClassifier(n_estimators=50, max_depth=20)
model2 = RandomForestClassifier(n_estimators=100, max_depth=10)
model3 = RandomForestClassifier(n_estimators=200, max_depth=30)

model = VotingClassifier(estimators=[('rf1', model1), ('rf2', model2), ('rf3', model3)], voting='hard')

# Train-test split
X = df_train[feature_cols]
y = df_train['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)

print()
print("Validation accuracy:", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

# Train ensemble model on the entire train dataset
X = df_train[feature_cols]
y = df_train['label']

model.fit(X, y)

# Make predictions on the test dataset
X_test = df_test[feature_cols]
y_test = model.predict(X_test)

df_test_ensemble = pd.DataFrame({'ID': df_test['id'], 'label': y_test})

# Remove duplicate ID rows via majority voting
df_test_ensemble = df_test_ensemble.groupby('ID').agg(lambda x:x.value_counts().index[0])
df_test_ensemble.reset_index(inplace=True)

df_test_ensemble.to_csv('data/rocketskillshots_test_ensemble_2.csv', index=False)


Validation accuracy: 0.9480676328502415
Confusion matrix:
 [[103   9   1   0   2   0   2]
 [  0  95   0   0   4   0   0]
 [  2   1 131   0   2   1   0]
 [  0   0   2  65   0   1   0]
 [  0   1   1   0 193   0   0]
 [  0   0   1   0   4 138   0]
 [  1   0   5   0   0   3  60]]


Both model ensembles gave relatively high accuracies on the validation sets, around 95%, which is similar to the random forest and decision tree classifiers.

# Conclusion

We were tasked with creating a machine learning model capable of predicting a trickshot in the videogame Rocket League.

The first step was exploring the dataset, getting summaries of the data, and dealing with missing values. This part was explained in detail at the beggining of the notebook.

After that, five different model architectures were trained and tested:
1. Logistic regression
2. Decision tree classifier
3. Random forest classifier
4. Multi-layer perceptron
5. Model ensembles

All models were trained using either 80/20 train-validation splits or using cross-validation. They were evaluated using accuracy and confusion matrices. 

Decision tree and random forest classifiers performed best on the validation datasets, so they were tuned and explored further, with the goal of increasing accuracy.

In the end, an accuracy of around 95% on the validation sets and around 90% on the Kaggle test dataset was achieved using those two methods.

As a result of this project, the fine-tuned random forest classifier was chosen as the optimal model for this task because of its high accuracy, both on the validation sets and on the test dataset.