In [170]:
import torch
from torch import nn
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [197]:
class Config():
    input_paths =  ['2023-wimbledon-1313-player1.csv',
                    '2023-wimbledon-1407-player1.csv',
                    '2023-wimbledon-1504-player1.csv',
                    ]
    match_id = []
    timestep = 10
    batch_size = 64
    feature_size = 12#16
    hidden_size = 10 # TODO: can change
    output_size = 4
    predict_len = 5
    num_layers = 2 # TODO: can change
    epochs = 10 # TODO: can change
    best_loss = 100
    learning_rate = 0.005 # TODO: can change
    model_name = 'GRU_for_tennis_momentum_swing_for_andrey_rublev'
    save_path = './trained_models/{}.pth'.format(model_name)

config = Config()

In [172]:
import csv

def delete_columns(input_file, output_file, columns_to_delete):
    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        
        for row in reader:
            # Exclude columns to delete (using 0-based index)
            filtered_row = [value for idx, value in enumerate(row) if idx not in columns_to_delete]
            writer.writerow(filtered_row)

# Example usage:
input_csv_file = config.input_paths[0]
output_csv_file = 'your_output_file.csv'
columns_to_delete = [10, 11, 14, 15]  # Columns are 0-based, so 11 becomes 10, 12 becomes 11, and so on.
for input_path in config.input_paths:
    delete_columns('./data_for_training/data/'+input_path, './data_for_training/data1/'+input_path, columns_to_delete)

In [198]:

def fetch_data(dataX, dataY, flag, timestep, feature_size, output_size, pred_len):
    myDataX = []
    myDataY = []
    for i in range(0, len(dataX) - timestep):
        target_row = dataY[i + timestep - 1].reshape(-1, output_size)
        res = []
        for j in range(pred_len):
            for k in range(output_size):
                if target_row[j][k] == 1:
                    if flag:
                        res.append(2 - k if k % 2 == 0 else 4 - k)
                    else:
                        res.append(k)
                    break
        if len(res) != pred_len:
            break
        myDataX.append(dataX[i: i + timestep])
        myDataY.append(res)
    myDataX = np.array(myDataX)
    myDataY = np.array(myDataY)

    train_size = int(np.round(0.8 * myDataX.shape[0]))
    trainX = myDataX[:train_size, :].reshape(-1, timestep, feature_size)
    testX = myDataX[train_size:, :].reshape(-1, timestep, feature_size)
    trainY = myDataY[:train_size, :].reshape(-1, pred_len)
    testY = myDataY[train_size:, :].reshape(-1, pred_len)
    return (trainX, trainY, testX, testY)

trainX = []
trainY = []
testX = []
testY = []
dfy = pd.read_csv('./data_for_training/momentum_condition.csv')
dataY = np.array(dfy)
# match_id: (start position, end position, 0 if player == player1 else)
# match_id = [(0, 300, 0), (3788, 4013, 0), (5707, 5896, 0), (6589, 6748, 0), (6950, 7284, 0)] # for carlos_alcaraz
# match_id = [(1972, 2185, 0), (4695, 4910, 0), (6179, 6372, 0), (6748, 6950, 0)] # for jannik sinner
match_id = [(2948, 3238, 0), (5105, 5436, 0), (6372, 6589, 0)] # for andrey rublev
match_no = 0

for input_path in config.input_paths:
    input_path = './data_for_training/data1/' + input_path
    dfx = pd.read_csv(input_path)
    dataX = np.array(dfx)
    l, r, flag = match_id[match_no]
    match_no += 1
    a, b, c, d = fetch_data(dataX, dataY[l: r], flag, config.timestep, config.feature_size, config.output_size, config.predict_len)
    
    for x in a:
        trainX.append(x)
    for y in b:
        trainY.append(y)
    for x in c:
        testX.append(x)
    for y in d:
        testY.append(y)
trainX = np.array(trainX).reshape(-1, config.timestep, config.feature_size)
testX = np.array(testX).reshape(-1, config.timestep, config.feature_size)
trainY = np.array(trainY).reshape(-1, config.predict_len)
testY = np.array(testY).reshape(-1, config.predict_len)

x_train_tensor = torch.from_numpy(trainX).to(torch.float32)
y_train_tensor = torch.from_numpy(trainY).to(torch.long)

x_test_tensor = torch.from_numpy(testX).to(torch.float32)
y_test_tensor = torch.from_numpy(testY).to(torch.long)

train_data = TensorDataset(x_train_tensor, y_train_tensor)
test_data = TensorDataset(x_test_tensor, y_test_tensor)

train_loader = DataLoader(train_data, config.batch_size, False)
test_loader = DataLoader(test_data, config.batch_size, False)

In [174]:
# GRU
class GRURNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(self.input_size, self.hidden_size, self.num_layers, batch_first=True)
        self.fc1 = nn.Linear(self.hidden_size, output_size)
        self.fc2 = nn.Linear(self.hidden_size, output_size)
        self.fc3 = nn.Linear(self.hidden_size, output_size)
        self.fc4 = nn.Linear(self.hidden_size, output_size)
        self.fc5 = nn.Linear(self.hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)
 
    def forward(self, input_seq):
        batch_size = input_seq.shape[0]
        h_0 = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        output, _ = self.gru(input_seq,h_0)
        pred1 = self.fc1(output)
        pred2 = self.fc2(output)
        pred3 = self.fc3(output)
        pred4 = self.fc4(output)
        pred5 = self.fc5(output)
        pred1, pred2, pred3, pred4, pred5 = pred1[:, -1, :], pred2[:, -1, :], pred3[:, -1, :], pred4[:, -1, :], pred5[:, -1, :]
        pred1, pred2, pred3, pred4, pred5 = self.softmax(pred1), self.softmax(pred2), self.softmax(pred3), self.softmax(pred4), self.softmax(pred5)
        pred = torch.stack([pred1, pred2, pred3, pred4, pred5], dim=1)
        return pred

In [175]:

model = GRURNN(config.feature_size, config.hidden_size, config.num_layers, config.output_size)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)

In [179]:
for epoch in range(config.epochs):
    model.train()
    running_loss = 0
    train_bar = tqdm(train_loader)
    for data in train_bar:
        x_train, y_train = data
        optimizer.zero_grad()
        y_train_pred = model(x_train)

        loss1 = loss_function(y_train_pred[:, 0, :], y_train[:, 0])
        loss2 = loss_function(y_train_pred[:, 1, :], y_train[:, 1])
        loss3 = loss_function(y_train_pred[:, 2, :], y_train[:, 2])
        loss4 = loss_function(y_train_pred[:, 3, :], y_train[:, 3])
        loss5 = loss_function(y_train_pred[:, 4, :], y_train[:, 4])
        # loss = max(loss1, loss2, loss3, loss4, loss5)
        loss = loss1 + loss2 + loss3 + loss4 + loss5
        # loss = 5 * loss1 + 2 * loss2 + loss3 + loss4 + loss5
        
        loss.backward()
        optimizer.step()
        # print("loss =", loss)
        
        running_loss += loss.item()
        train_bar.desc = 'train epoch[{}/{}] loss: {:.3f}'.format(epoch + 1, config.epochs, loss)
    
    # model.eval()
    # test_loss = 0
    # with torch.no_grad():
    #     test_bar = tqdm(test_loader)
    #     for data in test_bar:
    #         x_test, y_test = data
    #         y_test_pred = model(x_test)
    #         loss1 = loss_function(y_test_pred[:, 0, :], y_test[:, 0])
    #         loss2 = loss_function(y_test_pred[:, 1, :], y_test[:, 1])
    #         loss3 = loss_function(y_test_pred[:, 2, :], y_test[:, 2])
    #         loss4 = loss_function(y_test_pred[:, 3, :], y_test[:, 3])
    #         loss5 = loss_function(y_test_pred[:, 4, :], y_test[:, 4])
    #         # test_loss = max(loss1, loss2, loss3, loss4, loss5)
    #         test_loss = loss1 + loss2 + loss3 + loss4 + loss5
    #         # test_loss = 5 * loss1 + 3 * loss2 + loss3 + loss4 + loss5
    
    # if test_loss < config.best_loss:
    #     config.best_loss = test_loss
    #     torch.save(model.state_dict(), config.save_path)


train epoch[1/10] loss: 6.855: 100%|███████████████████████████████████████████████████| 10/10 [00:00<00:00, 197.60it/s]
train epoch[2/10] loss: 6.855: 100%|███████████████████████████████████████████████████| 10/10 [00:00<00:00, 210.54it/s]
train epoch[3/10] loss: 6.855: 100%|███████████████████████████████████████████████████| 10/10 [00:00<00:00, 170.18it/s]
train epoch[4/10] loss: 6.855: 100%|███████████████████████████████████████████████████| 10/10 [00:00<00:00, 254.26it/s]
train epoch[5/10] loss: 6.855: 100%|███████████████████████████████████████████████████| 10/10 [00:00<00:00, 220.24it/s]
train epoch[6/10] loss: 6.855: 100%|███████████████████████████████████████████████████| 10/10 [00:00<00:00, 204.39it/s]
train epoch[7/10] loss: 6.855: 100%|███████████████████████████████████████████████████| 10/10 [00:00<00:00, 159.49it/s]
train epoch[8/10] loss: 6.855: 100%|███████████████████████████████████████████████████| 10/10 [00:00<00:00, 227.41it/s]
train epoch[9/10] loss: 6.855: 1

In [180]:
model = GRURNN(config.feature_size, config.hidden_size, config.num_layers, config.output_size)
model.load_state_dict(torch.load(config.save_path))

model.eval()
with torch.no_grad():
    train_bar = tqdm(train_loader)

    predict_rate = [0 for _ in range(config.predict_len)]
    tot = [0 for _ in range(config.predict_len)]
    for data in train_bar:
        x_train, y_train = data
        y_train_pred = model(x_train)
        for i in range(config.predict_len):
            tot[i] += x_train.shape[0]
        for i in range(x_train.shape[0]):
            for j in range(config.predict_len):
                pred = 0
                for k in range(config.output_size):
                    if y_train_pred[i][j][k].item() > y_train_pred[i][j][pred].item():
                        pred = k
                predict_rate[j] += (1 if pred == y_train[i][j] else 0)
    for i in range(config.predict_len):
        print('train set predict rate for step {:d} is: {:f}'.format(i + 1, predict_rate[i] / tot[i]))
    test_bar = tqdm(test_loader)

    predict_rate = [0 for _ in range(config.predict_len)]
    tot = [0 for _ in range(config.predict_len)]
    for data in test_bar:
        x_test, y_test = data
        y_test_pred = model(x_test)
        for i in range(config.predict_len):
            tot[i] += x_test.shape[0]
        for i in range(x_test.shape[0]):
            for j in range(config.predict_len):
                pred = 0
                for k in range(config.output_size):
                    if y_test_pred[i][j][k].item() > y_test_pred[i][j][pred].item():
                        pred = k
                predict_rate[j] += (1 if pred == y_test[i][j] else 0)
    for i in range(config.predict_len):
        print('test set predict rate for step {:d} is: {:f}'.format(i + 1, predict_rate[i] / tot[i]))

100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 96.80it/s]


train set predict rate for step 1 is: 0.422047
train set predict rate for step 2 is: 0.390551
train set predict rate for step 3 is: 0.384252
train set predict rate for step 4 is: 0.387402
train set predict rate for step 5 is: 0.376378


100%|█████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 98.74it/s]

test set predict rate for step 1 is: 0.436709
test set predict rate for step 2 is: 0.392405
test set predict rate for step 3 is: 0.386076
test set predict rate for step 4 is: 0.449367
test set predict rate for step 5 is: 0.398734





In [178]:
torch.save(model.state_dict(), config.save_path)

In [183]:

def compute_prediction(model, output_file):
    model.eval()
    
    lst = []
    column = ['is_train_set', 'pred_y1', 'pred_y2', 'pred_y3', 'pred_y4', 'pred_y5', 'target_y1', 'target_y2', 'target_y3', 'target_y4', 'target_y5']
    with torch.no_grad():
        train_bar = tqdm(train_loader)

        predict_rate = [0 for _ in range(config.predict_len)]
        tot = [0 for _ in range(config.predict_len)]
        for data in train_bar:
            x_train, y_train = data
            y_train_pred = model(x_train)
            for i in range(config.predict_len):
                tot[i] += x_train.shape[0]
            for i in range(x_train.shape[0]):
                row = [1]
                for j in range(config.predict_len):
                    pred = 0
                    for k in range(config.output_size):
                        if y_train_pred[i][j][k].item() > y_train_pred[i][j][pred].item():
                            pred = k
                    predict_rate[j] += (1 if pred == y_train[i][j] else 0)
                    row.append(pred)
                for j in range(config.predict_len):
                    row.append(y_train[i][j].item())
                lst.append(row)
        for i in range(config.predict_len):
            print('train set predict rate for step {:d} is: {:f}'.format(i + 1, predict_rate[i] / tot[i]))
        
        test_bar = tqdm(test_loader)

        predict_rate = [0 for _ in range(config.predict_len)]
        tot = [0 for _ in range(config.predict_len)]
        for data in test_bar:
            x_test, y_test = data
            y_test_pred = model(x_test)
            for i in range(config.predict_len):
                tot[i] += x_test.shape[0]
            for i in range(x_test.shape[0]):
                row = [0]
                for j in range(config.predict_len):
                    pred = 0
                    for k in range(config.output_size):
                        if y_test_pred[i][j][k].item() > y_test_pred[i][j][pred].item():
                            pred = k
                    predict_rate[j] += (1 if pred == y_test[i][j] else 0)
                    row.append(pred)
                for j in range(config.predict_len):
                    row.append(y_test[i][j].item())
                lst.append(row)
        for i in range(config.predict_len):
            print('test set predict rate for step {:d} is: {:f}'.format(i + 1, predict_rate[i] / tot[i]))
    df = pd.DataFrame(lst, columns = column)
    df.to_csv(output_file)

model_alcaraz = GRURNN(config.feature_size, config.hidden_size, config.num_layers, config.output_size)
model_sinner = GRURNN(config.feature_size, config.hidden_size, config.num_layers, config.output_size)
model_rublev = GRURNN(config.feature_size, config.hidden_size, config.num_layers, config.output_size)

model_alcaraz.load_state_dict(torch.load('./trained_models/GRU_for_tennis_momentum_swing_for_carlos_alcaraz.pth'))
model_sinner.load_state_dict(torch.load('./trained_models/GRU_for_tennis_momentum_swing_for_jannik_sinner.pth'))
model_rublev.load_state_dict(torch.load('./trained_models/GRU_for_tennis_momentum_swing_for_andrey_rublev.pth'))

<All keys matched successfully>

In [191]:
print('-----------alcaraz------------')
compute_prediction(model_alcaraz, './pred_res/model_alcaraz.csv')


-----------alcaraz------------


100%|███████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 89.74it/s]


train set predict rate for step 1 is: 0.439779
train set predict rate for step 2 is: 0.398895
train set predict rate for step 3 is: 0.401105
train set predict rate for step 4 is: 0.425414
train set predict rate for step 5 is: 0.416575


100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 80.93it/s]

test set predict rate for step 1 is: 0.405286
test set predict rate for step 2 is: 0.383260
test set predict rate for step 3 is: 0.356828
test set predict rate for step 4 is: 0.352423
test set predict rate for step 5 is: 0.352423





In [187]:
print('-----------sinner-------------')
compute_prediction(model_sinner, './pred_res/model_sinner.csv')


-----------sinner-------------


100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 93.92it/s]


train set predict rate for step 1 is: 0.522951
train set predict rate for step 2 is: 0.537705
train set predict rate for step 3 is: 0.521311
train set predict rate for step 4 is: 0.498361
train set predict rate for step 5 is: 0.483607


100%|█████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 45.77it/s]

test set predict rate for step 1 is: 0.424837
test set predict rate for step 2 is: 0.405229
test set predict rate for step 3 is: 0.379085
test set predict rate for step 4 is: 0.372549
test set predict rate for step 5 is: 0.339869





In [184]:
print('-----------rublev-------------')
compute_prediction(model_rublev, './pred_res/model_rublev.csv')

-----------rublev-------------


100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 81.39it/s]


train set predict rate for step 1 is: 0.422047
train set predict rate for step 2 is: 0.390551
train set predict rate for step 3 is: 0.384252
train set predict rate for step 4 is: 0.387402
train set predict rate for step 5 is: 0.376378


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 106.48it/s]

test set predict rate for step 1 is: 0.436709
test set predict rate for step 2 is: 0.392405
test set predict rate for step 3 is: 0.386076
test set predict rate for step 4 is: 0.449367
test set predict rate for step 5 is: 0.398734





In [192]:
def calcPredictRate(model, test_loader):
    model.eval()
    predict_rate = [0 for _ in range(config.predict_len)]
    tot = [0 for _ in range(config.predict_len)]
    with torch.no_grad():
        
        test_bar = tqdm(test_loader)

        for data in test_bar:
            x_test, y_test = data
            y_test_pred = model(x_test)
            for i in range(config.predict_len):
                tot[i] += x_test.shape[0]
            for i in range(x_test.shape[0]):
                for j in range(config.predict_len):
                    pred = 0
                    for k in range(config.output_size):
                        if y_test_pred[i][j][k].item() > y_test_pred[i][j][pred].item():
                            pred = k
                    predict_rate[j] += (1 if pred == y_test[i][j] else 0)
    return np.array([predict_rate[i] / tot[i] for i in range(config.predict_len)])

def calcFeatureImportance(model):
    accuracy = calcPredictRate(model, test_loader)
    importance = np.zeros(config.feature_size)
    for i in range(config.feature_size):
        ntestX = testX.copy()
        nfeature = ntestX[:, :, i].copy().reshape(-1)
        np.random.shuffle(nfeature)
        nfeature = nfeature.reshape(-1, config.timestep)
        for j in range(ntestX.shape[0]):
            for k in range(ntestX.shape[1]):
                ntestX[j][k][i] = nfeature[j][k]
        nx_test_tensor = torch.from_numpy(ntestX).to(torch.float32)
        ntest_data = TensorDataset(nx_test_tensor, y_test_tensor)
        ntest_loader = DataLoader(ntest_data, config.batch_size, False)
        naccuracy = calcPredictRate(model, ntest_loader)
        importance[i] = sum([(accuracy[j] - naccuracy[j]) * (accuracy[j] - naccuracy[j]) for j in range(config.predict_len)])

    tot = np.sum(importance)
    for i in range(importance.shape[0]):
        importance[i] = importance[i] / tot
    
    return importance

column_name = ['model_name'] + ['f' + str(i + 1) for i in range(config.feature_size)]
z = []

In [193]:
z.append(['alcaraz'] + calcFeatureImportance(model_alcaraz).tolist())

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 103.38it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 99.85it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 99.29it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 118.49it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 97.07it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 97.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 100.36it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 99.07it/s]
100%|███████████████████████████

In [196]:
z.append(['sinner'] + calcFeatureImportance(model_sinner).tolist())

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 116.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 108.45it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 105.85it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 101.86it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 111.32it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 139.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 101.39it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 121.50it/s]
100%|███████████████████████████

In [199]:
z.append(['rublev'] + calcFeatureImportance(model_rublev).tolist())

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 118.32it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 109.62it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 103.50it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 94.41it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 105.27it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 101.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 121.63it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 110.53it/s]
100%|███████████████████████████

In [200]:
print(z)
importance = pd.DataFrame(data=z, columns=column_name)
importance.to_csv('./trained_models/feature_importance_for_specific_players.csv')

[['alcaraz', 0.02459016393442617, 0.06557377049180332, 0.01844262295081967, 0.4856557377049182, 0.07172131147540987, 0.07991803278688532, 0.08196721311475412, 0.04303278688524589, 0.0819672131147542, 0.02049180327868845, 0.006147540983606608, 0.020491803278688343], ['sinner', 0.07922912205567448, 0.012847965738758054, 0.01605995717344752, 0.2944325481798716, 0.16059957173447548, 0.16809421841541752, 0.10064239828693802, 0.022483940042826528, 0.08565310492505356, 0.0032119914346894858, 0.020342612419700205, 0.03640256959314774], ['rublev', 0.05651672433679354, 0.0034602076124567453, 0.025374855824682872, 0.5351787773933102, 0.051903114186851285, 0.10957324106113032, 0.018454440599769282, 0.06459054209919253, 0.03114186851211069, 0.061130334486735806, 0.006920415224913491, 0.035755478662053065]]
