In [135]:
import pandas as pd
import numpy as np
import torch
import sklearn

cuda_available = torch.cuda.is_available()

In [3]:
season2018 = pd.read_csv("Season2018.csv")
season2019 = pd.read_csv("Season2019.csv")
season2020 = pd.read_csv("Season2020.csv")

In [4]:
np.random.seed(42)
season2018_test = np.random.permutation(season2018["fixture_id"])[:48]
season2019_test = np.random.permutation(season2019["fixture_id"])[:48]
season2020_test = np.random.permutation(season2020["fixture_id"])[:48]

In [5]:
test_fixtures = np.append(np.append(season2018_test, season2019_test), season2020_test)

In [6]:
df = pd.read_csv("SaudiLeague.csv")

In [7]:
df.head()

Unnamed: 0,fixture_id,team,formation,coach,players,isHome,rating,opponent,opponent_formation,opponent_coach,...,opponent_last_against_performance,pts_difference,pts_after,opponent_pts_after,top_scorrer,main_gk,main_captain,opponent_top_scorrer,opponent_main_gk,opponent_main_captain
0,151695,Al-Ettifaq,4-2-3-1,L. Ramos,"R. M'Bolhi, 'R. Arias', 'Hussein Al Sayed', 'S...",1,7.0,Al-Raed,4-2-3-1,B. Hasi,...,6.72,0,1,1,1,1,0,1,0,0
1,151695,Al-Raed,4-2-3-1,B. Hasi,"'A. Doukha', 'H. Belkaroui', 'Mohammed Al Amri...",0,6.98,Al-Ettifaq,4-2-3-1,L. Ramos,...,6.9,0,1,1,1,0,0,1,1,0
2,151696,Al-Hazm,4-2-3-1,D. Isăilă,"'M. Asselah', 'Alemão', 'Khaled Al Barakah', '...",1,6.97,Al Wehda Club,4-4-2,Fábio Carille,...,7.02,0,1,1,0,0,0,1,0,0
3,151696,Al Wehda Club,4-4-2,Fábio Carille,"'Mohamed Awad', 'Osama Hawsawi', 'Renato Chave...",0,6.77,Al-Hazm,4-2-3-1,D. Isăilă,...,6.81,0,1,1,1,0,0,0,0,0
4,151697,Ohod,4-2-3-1,F. Arce,"'Z. Laaroubi', 'Hussein Abdul Ghani', 'Apodi',...",1,6.64,Al-Nassr,4-2-3-1,D. Carreño,...,7.07,0,0,3,1,0,0,0,0,0


In [8]:
df.columns

Index(['fixture_id', 'team', 'formation', 'coach', 'players', 'isHome',
       'rating', 'opponent', 'opponent_formation', 'opponent_coach',
       'opponent_players', 'opponent_rating', 'season', 'date', 'venue',
       'status', 'round', 'team_goals', 'opponent_goals', 'pts',
       'opponent_pts', 'result', 'last_performance',
       'opponent_last_performance', 'last_against_performance',
       'opponent_last_against_performance', 'pts_difference', 'pts_after',
       'opponent_pts_after', 'top_scorrer', 'main_gk', 'main_captain',
       'opponent_top_scorrer', 'opponent_main_gk', 'opponent_main_captain'],
      dtype='object')

Let's perform all preprocessing steps then export the data into seperate CSVs

* What actions do we want to make on the data before modeling?
    1. Drop unnecessary columns 
    2. Scale numeric variables 
    3. Create dummy variables for categorical variables

##### 1.  dropping columns (columns that are not significant or can not be collected before the match)

In [34]:
df.columns

Index(['fixture_id', 'team', 'formation', 'coach', 'isHome', 'opponent',
       'opponent_formation', 'opponent_coach', 'season', 'round', 'team_goals',
       'opponent_goals', 'pts', 'opponent_pts', 'last_performance',
       'opponent_last_performance', 'last_against_performance',
       'opponent_last_against_performance', 'pts_difference', 'top_scorrer',
       'main_gk', 'main_captain', 'opponent_top_scorrer', 'opponent_main_gk',
       'opponent_main_captain'],
      dtype='object')

In [32]:
df.drop(columns=["players", "rating", "opponent_players", "opponent_rating", "date", "venue",
                'status', "result", "pts_after","opponent_pts_after"], inplace=True)

In [33]:
df.columns

Index(['fixture_id', 'team', 'formation', 'coach', 'isHome', 'opponent',
       'opponent_formation', 'opponent_coach', 'season', 'round', 'team_goals',
       'opponent_goals', 'pts', 'opponent_pts', 'last_performance',
       'opponent_last_performance', 'last_against_performance',
       'opponent_last_against_performance', 'pts_difference', 'top_scorrer',
       'main_gk', 'main_captain', 'opponent_top_scorrer', 'opponent_main_gk',
       'opponent_main_captain'],
      dtype='object')

##### 2. Numeric variables scaling (based on metrics from the training data)

In [59]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

#####  let's define the training data to use it for fitting the scaler 

In [57]:
train_data = df.loc[~df["fixture_id"].isin(test_fixtures)]

In [58]:
train_data.shape

(1152, 25)

In [60]:
train_data[["round", "pts", "opponent_pts", "last_performance",
            "opponent_last_performance",'last_against_performance', 'opponent_last_against_performance',
            'pts_difference',]] = min_max_scaler.fit_transform(train_data[["round", "pts", "opponent_pts",
                                                      "last_performance","opponent_last_performance",
                                                      'last_against_performance', 'pts_difference',
                                                      'opponent_last_against_performance']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [51]:
train_data.head()

Unnamed: 0,fixture_id,team,formation,coach,isHome,opponent,opponent_formation,opponent_coach,season,round,...,opponent_last_performance,last_against_performance,opponent_last_against_performance,pts_difference,top_scorrer,main_gk,main_captain,opponent_top_scorrer,opponent_main_gk,opponent_main_captain
0,151695,Al-Ettifaq,4-2-3-1,L. Ramos,1,Al-Raed,4-2-3-1,B. Hasi,2018/2019,0.0,...,0.55814,0.610465,0.5,0.505814,1,1,0,1,0,0
1,151695,Al-Raed,4-2-3-1,B. Hasi,0,Al-Ettifaq,4-2-3-1,L. Ramos,2018/2019,0.0,...,0.517442,0.505814,0.5,0.610465,1,0,0,1,1,0
2,151696,Al-Hazm,4-2-3-1,D. Isăilă,1,Al Wehda Club,4-4-2,Fábio Carille,2018/2019,0.0,...,0.587209,0.55814,0.5,0.680233,0,0,0,1,0,0
3,151696,Al Wehda Club,4-4-2,Fábio Carille,0,Al-Hazm,4-2-3-1,D. Isăilă,2018/2019,0.0,...,0.563953,0.680233,0.5,0.55814,1,0,0,0,0,0
4,151697,Ohod,4-2-3-1,F. Arce,1,Al-Nassr,4-2-3-1,D. Carreño,2018/2019,0.0,...,0.674419,0.389535,0.5,0.709302,1,0,0,0,0,0


In [61]:
df[["round", "pts", "opponent_pts", "last_performance",
            "opponent_last_performance",'last_against_performance', 'opponent_last_against_performance',
            'pts_difference']] = min_max_scaler.transform(df[["round", "pts", "opponent_pts",
                                                      "last_performance","opponent_last_performance",
                                                      'last_against_performance', 'pts_difference',
                                                      'opponent_last_against_performance']])

In [62]:
df.head()

Unnamed: 0,fixture_id,team,formation,coach,isHome,opponent,opponent_formation,opponent_coach,season,round,...,opponent_last_performance,last_against_performance,opponent_last_against_performance,pts_difference,top_scorrer,main_gk,main_captain,opponent_top_scorrer,opponent_main_gk,opponent_main_captain
0,151695,Al-Ettifaq,4-2-3-1,L. Ramos,1,Al-Raed,4-2-3-1,B. Hasi,2018/2019,0.0,...,0.55814,0.610465,0.5,0.505814,1,1,0,1,0,0
1,151695,Al-Raed,4-2-3-1,B. Hasi,0,Al-Ettifaq,4-2-3-1,L. Ramos,2018/2019,0.0,...,0.517442,0.505814,0.5,0.610465,1,0,0,1,1,0
2,151696,Al-Hazm,4-2-3-1,D. Isăilă,1,Al Wehda Club,4-4-2,Fábio Carille,2018/2019,0.0,...,0.587209,0.55814,0.5,0.680233,0,0,0,1,0,0
3,151696,Al Wehda Club,4-4-2,Fábio Carille,0,Al-Hazm,4-2-3-1,D. Isăilă,2018/2019,0.0,...,0.563953,0.680233,0.5,0.55814,1,0,0,0,0,0
4,151697,Ohod,4-2-3-1,F. Arce,1,Al-Nassr,4-2-3-1,D. Carreño,2018/2019,0.0,...,0.674419,0.389535,0.5,0.709302,1,0,0,0,0,0


##### 3. dummy variables

In [63]:
df.columns

Index(['fixture_id', 'team', 'formation', 'coach', 'isHome', 'opponent',
       'opponent_formation', 'opponent_coach', 'season', 'round', 'team_goals',
       'opponent_goals', 'pts', 'opponent_pts', 'last_performance',
       'opponent_last_performance', 'last_against_performance',
       'opponent_last_against_performance', 'pts_difference', 'top_scorrer',
       'main_gk', 'main_captain', 'opponent_top_scorrer', 'opponent_main_gk',
       'opponent_main_captain'],
      dtype='object')

In [66]:
df.shape

(1440, 25)

In [64]:
dummy = pd.get_dummies(df[['team', 'formation', "coach", "opponent",
                           "opponent_formation", "opponent_coach","season"]])

In [65]:
dummy

Unnamed: 0,team_Abha,team_Al Baten,team_Al Shabab,team_Al Taawon,team_Al Wehda Club,team_Al-Adalah,team_Al-Ahli Jeddah,team_Al-Ain,team_Al-Ettifaq,team_Al-Faisaly FC,...,opponent_coach_V. Milojević,opponent_coach_Vítor Campelos,opponent_coach_Y. Al Mannai,opponent_coach_Y. Ferrera,opponent_coach_Yousef Alghadeer,opponent_coach_Youssef Anbar,opponent_coach_Z. Mamić,season_2018/2019,season_2019/2020,season_2020/2021
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1437,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [68]:
df = pd.concat([df, dummy], axis=1)

In [70]:
df.drop(columns=['team', 'formation', "coach", "opponent",
                           "opponent_formation", "opponent_coach","season"], inplace=True)

In [71]:
df.shape

(1440, 259)

In [73]:
train_data = df.loc[~df["fixture_id"].isin(test_fixtures)]
test_data =  df.loc[df["fixture_id"].isin(test_fixtures)]

In [75]:
train_data.to_csv("train.csv", index=False)
test_data.to_csv("test.csv", index=False)

We have to write custom Dataset class so that pytorch's Dataloader can accpet it

In [77]:
df.columns[:20]

Index(['fixture_id', 'isHome', 'round', 'team_goals', 'opponent_goals', 'pts',
       'opponent_pts', 'last_performance', 'opponent_last_performance',
       'last_against_performance', 'opponent_last_against_performance',
       'pts_difference', 'top_scorrer', 'main_gk', 'main_captain',
       'opponent_top_scorrer', 'opponent_main_gk', 'opponent_main_captain',
       'team_Abha', 'team_Al Baten'],
      dtype='object')

In [142]:
from torch.utils.data import  Dataset
class SaudiLeagueDataset(torch.utils.data.Dataset):
    
    def __init__(self, file_path):
        
        self.data = pd.read_csv(file_path)
        self.data.drop("fixture_id", axis=1, inplace=True)
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        # load image as ndarray type (Height * Width * Channels)
        # be carefull for converting dtype to np.uint8 [Unsigned integer (0 to 255)]
        # in this example, i don't use ToTensor() method of torchvision.transforms
        # so you can convert numpy ndarray shape to tensor in PyTorch (H, W, C) --> (C, H, W)
        
        outcome = self.data.loc[index, ["team_goals", "opponent_goals"]].values.astype(np.float32)

        match = self.data.drop(columns=["team_goals", "opponent_goals"]).iloc[index].values.astype(np.float32)

            
        return match, outcome

In [143]:
train_dataset = SaudiLeagueDataset("train.csv")
test_dataset = SaudiLeagueDataset("test.csv")

In [144]:
train_dataset[0]

(array([1.        , 0.        , 0.        , 0.        , 0.51744187,
        0.55813956, 0.6104651 , 0.5       , 0.50581396, 1.        ,
        1.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

### perfect!

In [145]:
len(train_dataset)

1152

In [146]:
len(test_dataset)

288

In [219]:
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

train_size = len(train_data)
indices = list(range(train_size))
np.random.shuffle(indices)

split = int(np.floor(train_size * 0.15)) #get 15% of the training data
train_idx, valid_idx = indices[split:], indices[:split]

batch_size = 2

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

loaders = {'train':DataLoader(train_dataset, batch_size=batch_size,
    sampler=train_sampler, num_workers=0),
                   'valid': DataLoader(train_dataset, batch_size=batch_size, 
    sampler=valid_sampler, num_workers=0),
                   'test': DataLoader(test_dataset, batch_size=batch_size, 
                                          num_workers=0, shuffle=True, pin_memory=True)}

In [220]:
for match, outcome in loaders["valid"]:
    print(match)
    print(outcome)
    break

tensor([[0.0000, 0.5862, 0.5507, 0.3043, 0.5640, 0.5581, 0.5407, 0.7024, 0.4651,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0

In [221]:
train_dataset[0][0].shape

(256,)

In [222]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 16)
        self.fc5 = nn.Linear(16, 8)
        self.fc6 = nn.Linear(8, 2)
        
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = F.relu(self.fc3(x))
        x = self.dropout(x)
        x = F.relu(self.fc4(x))
        x = self.dropout(x)
        x = F.relu(self.fc5(x))
        x = self.dropout(x)
        x = self.fc6(x)

        return x

model = Net()
print(model)

if cuda_available:
    model.cuda()

Net(
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=16, bias=True)
  (fc5): Linear(in_features=16, out_features=8, bias=True)
  (fc6): Linear(in_features=8, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [223]:
criterion = torch.nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)


As we train we'll keep an eye on the validation loss

In [224]:
# number of epochs to train the model
n_epochs = 50 # you may increase this number to train a final model

valid_loss_min = np.Inf # track change in validation loss

for epoch in range(1, n_epochs+1):

    # keep track of training and validation loss
    train_loss = 0.0
    valid_loss = 0.0
    
    ###################
    # train the model #
    ###################
    model.train()
    for matches, outcomes in loaders["train"]:
        # move tensors to GPU if CUDA is available
        if cuda_available:
            matches, outcomes = matches.cuda(), outcomes.cuda()
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(matches)
        # calculate the batch loss
        loss = criterion(output, outcomes)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update training loss
        train_loss += loss.item()*data.size(0)
        
    ######################    
    # validate the model #
    ######################
    model.eval()
    for matches, outcomes in loaders["valid"]:
        # move tensors to GPU if CUDA is available
        if cuda_available:
            matches, outcomes = matches.cuda(), outcomes.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(matches)
        # calculate the batch loss
        loss = criterion(output, outcomes)
        # update average validation loss 
        valid_loss += loss.item()*data.size(0)
    
    # calculate average losses
    train_loss = train_loss/len(loaders["train"].dataset)
    valid_loss = valid_loss/len(loaders["valid"].dataset)
        
    # print training/validation statistics 
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch, train_loss, valid_loss))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss

Epoch: 1 	Training Loss: 9.318134 	Validation Loss: 1.449656
Validation loss decreased (inf --> 1.449656).  Saving model ...
Epoch: 2 	Training Loss: 8.375775 	Validation Loss: 1.382202
Validation loss decreased (1.449656 --> 1.382202).  Saving model ...
Epoch: 3 	Training Loss: 8.090817 	Validation Loss: 1.378681
Validation loss decreased (1.382202 --> 1.378681).  Saving model ...
Epoch: 4 	Training Loss: 7.914113 	Validation Loss: 1.362160
Validation loss decreased (1.378681 --> 1.362160).  Saving model ...
Epoch: 5 	Training Loss: 7.725298 	Validation Loss: 1.422006
Epoch: 6 	Training Loss: 7.522806 	Validation Loss: 1.386033
Epoch: 7 	Training Loss: 7.208323 	Validation Loss: 1.450074
Epoch: 8 	Training Loss: 6.960209 	Validation Loss: 1.516313
Epoch: 9 	Training Loss: 6.574895 	Validation Loss: 1.506049
Epoch: 10 	Training Loss: 6.356358 	Validation Loss: 1.612916
Epoch: 11 	Training Loss: 6.002570 	Validation Loss: 1.572099
Epoch: 12 	Training Loss: 5.977723 	Validation Loss: 1.6

KeyboardInterrupt: 

In [None]:
model.load_state_dict(torch.load('model.pt'))

In [None]:
test_loss = 0.0

model.eval()
# iterate over test data
for matches, outcomes in loaders["test"]:
    # move tensors to GPU if CUDA is available
    if cuda_available:
        matches, outcomes  = matches.cuda(), outcomes.cuda()
    # forward pass: compute predicted outputs by passing inputs to the model
    output = model(matches)
    # calculate the batch loss
    loss = criterion(output, outcomes)
    # update test loss 
    test_loss += loss.item()*data.size(0)
        
# average test loss
test_loss = test_loss/len(loaders["test"].dataset)
print('Test Loss: {:.6f}\n'.format(test_loss))


### from sklearn.ensemble import RandomForestRegressor

In [175]:
model_rf = RandomForestRegressor()

In [176]:
model_rf.fit(train_data.drop(columns=["fixture_id", "team_goals", 'opponent_goals']), train_data["team_goals"])

RandomForestRegressor()

In [177]:
preds = model_rf.predict(test_data.drop(columns=["fixture_id", "team_goals", 'opponent_goals']))

In [185]:
from sklearn.metrics import mean_squared_error

In [186]:
mean_squared_error(test_data.team_goals, preds)

1.6360503472222223