# Rocket League Challenge

The dataset consists of sequences of snapshots of the state of a Rocket League match, including position and velocity of all players and the ball, as well as extra information.

**Goal of the competition**: The goal of the competition is to predict -- from a given snapshot in the game -- for each team, the probability that they will score within the next 10 seconds of game time.

The data was taken from professional Rocket League matches. Each event consists of a chronological series of frames recorded at 10 frames per second. All events begin with a kickoff, and most end in one team scoring a goal, but some are truncated and end with no goal scored due to circumstances which can cause gameplay strategies to shift, for example 1) nearing end of regulation (where the game continues until the ball touches the ground) or 2) becoming non-competitive, eg one team winning by 3+ goals with little time remaining.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install pytorch_lightning
!pip install icecream
!pip install pprinter
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from icecream import ic
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Creating Datasets

In [5]:


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

files_name = []
import os
for dirname, _, filenames in os.walk('//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        files_name.append(os.path.join(dirname, filename))




//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/test_dtypes.csv
//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/sample_submission.csv
//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/test.csv
//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/train_0.csv
//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/train_1.csv
//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/train_2.csv
//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/train_3.csv
//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/train_4.csv
//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/train_5.csv
//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/t

In [6]:
# reading train dataset
# we will not use all the dataset but just a sample of each train dataset: 
import gc 
SAMPLE=0.2
import pandas as pd
path_to_data = os.path.dirname(files_name[0])
print(path_to_data)
# pfingt

dtypes_df = pd.read_csv('//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/train_dtypes.csv')
dtypes = {k: v for (k, v) in zip(dtypes_df.column, dtypes_df.dtype)}


cols = list(dtypes.keys())
train_df = pd.DataFrame({}, columns=cols)
for i in range(10):
    df_tmp = pd.read_csv(f'{path_to_data}/train_{i}.csv', dtype=dtypes)
    if SAMPLE < 1:
        df_tmp = df_tmp.sample(frac=SAMPLE, random_state=42)

    train_df = pd.concat([train_df,df_tmp])

    del df_tmp
    gc.collect()


  # train +   = pd.read_csv(i, dtype=dtypes)

//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022


# Data exploration

In [7]:
# reading test dataset
dtypes_df = pd.read_csv('//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/test_dtypes.csv')
dtypes = {k: v for (k, v) in zip(dtypes_df.column, dtypes_df.dtype)}
test = pd.read_csv('//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/test.csv', dtype=dtypes)

In [8]:
test.head()

Unnamed: 0,id,ball_pos_x,ball_pos_y,ball_pos_z,ball_vel_x,ball_vel_y,ball_vel_z,p0_pos_x,p0_pos_y,p0_pos_z,...,p5_vel_x,p5_vel_y,p5_vel_z,p5_boost,boost0_timer,boost1_timer,boost2_timer,boost3_timer,boost4_timer,boost5_timer
0,0,-56.270802,29.51,17.3486,24.499399,-1.3114,11.006801,-35.776199,73.136803,1.248,...,3.9484,-16.7108,0.0074,71.0,0.0,-3.263672,-6.132812,-6.875,-7.015625,-3.230469
1,1,2.8528,70.195999,8.949,-8.1522,-65.577202,18.5364,22.926001,87.5438,0.3396,...,-15.4968,-14.8766,0.005,66.6875,0.0,-1.615234,-5.96875,-5.503906,0.0,-6.511719
2,2,52.366402,-98.036797,14.249001,-2.0,-45.291401,-15.4234,51.288998,-102.055595,6.357,...,0.9058,-27.413601,0.0042,80.375,-0.031586,0.0,0.0,-3.128906,0.0,-3.677734
3,3,36.365402,53.961403,23.161798,24.829399,-10.3286,-0.918,16.424799,0.9328,0.3402,...,-5.0532,-15.106999,0.0052,96.0625,-6.429688,-4.089844,-7.832031,-5.761719,0.0,-3.923828
4,4,-23.2624,-53.391003,21.2096,-1.6676,-24.778799,3.4812,-19.681801,-93.913605,4.7832,...,22.200001,24.928001,0.0042,0.0,-9.882812,0.0,-0.445557,-0.491455,-7.828125,0.0


In [9]:
print('Train data shape:', train_df.shape)
print('Test data shape:', test.shape)

Train data shape: (4239608, 61)
Test data shape: (701143, 55)


In [10]:
missing_values_train = train_df.isna().sum().sum()
print('Missing values in train data: {0}'.format(missing_values_train[missing_values_train > 0]))

missing_values_test = test.isna().sum().sum()
print('Missing values in test data: {0}'.format(missing_values_test[missing_values_test > 0]))

Missing values in train data: [2569435]
Missing values in test data: [261422]


In [11]:
# handle missing values for now : 
train0_df = train_df.dropna()
test = test.fillna(value=test.mean())

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


In [12]:
missing_values_train = train0_df.isna().sum().sum()
print('Missing values in train data: {0}'.format(missing_values_train[missing_values_train > 0]))

missing_values_test = test.isna().sum().sum()
print('Missing values in test data: {0}'.format(missing_values_test[missing_values_test > 0]))

Missing values in train data: []
Missing values in test data: [37346]


In [13]:
input_cols = [
    'ball_pos_x', 'ball_pos_y', 'ball_pos_z', 'ball_vel_x', 'ball_vel_y', 'ball_vel_z', 
    'p0_pos_x', 'p0_pos_y', 'p0_pos_z', 'p0_vel_x', 'p0_vel_y', 'p0_vel_z', 
    'p1_pos_x', 'p1_pos_y', 'p1_pos_z', 'p1_vel_x', 'p1_vel_y', 'p1_vel_z',
    'p2_pos_x', 'p2_pos_y', 'p2_pos_z', 'p2_vel_x', 'p2_vel_y', 'p2_vel_z',
    'p3_pos_x', 'p3_pos_y', 'p3_pos_z', 'p3_vel_x', 'p3_vel_y', 'p3_vel_z',
    'p4_pos_x', 'p4_pos_y', 'p4_pos_z', 'p4_vel_x', 'p4_vel_y', 'p4_vel_z',
    'p5_pos_x', 'p5_pos_y', 'p5_pos_z', 'p5_vel_x', 'p5_vel_y', 'p5_vel_z',
    'p0_boost', 'p1_boost',  'p2_boost', 'p3_boost', 'p4_boost', 'p5_boost',
    'boost0_timer', 'boost1_timer', 'boost2_timer', 'boost3_timer', 'boost4_timer', 'boost5_timer'
]

In [14]:
output_cols = ['team_A_scoring_within_10sec', 'team_B_scoring_within_10sec']

In [15]:
def int_to_grid_coord(k, n):
    return (k // n) + 1, (k % n) + 1

Exploration is not possible on all data because of ram issues so we will get random samples of these data *

In [16]:
SAMPLE_EXPLO=0.0001
explo_df = train_df.sample(frac=SAMPLE_EXPLO)

In [17]:
explo_df.size

25864

In [18]:
## visualisation

import plotly.express as px
import plotly.graph_objects as go
from plotly import subplots

def plot_distributions(df, row_count, col_count, title, height):
    features = df.columns
    fig = subplots.make_subplots(
        rows=row_count, cols=col_count,
        subplot_titles=features
    )

    for k, col in enumerate(features):
        i, j = int_to_grid_coord(k, col_count)

        fig.add_trace(
            go.Histogram(
                x=df[col].astype('float32'),
                name=col
            ),
            row=i, col=j
        )

    fig.update_layout(
        title=title,
        height=row_count * height,
        showlegend=False
    )

    return fig

plot_distributions(explo_df[input_cols],9,6,"first visualisation of inputs",300)

# Feature Reduction

In [19]:
train = train_df.copy(deep=True)  # for easier exploration & for having a dataset checkpoint

train.keys()

Index(['game_num', 'event_id', 'event_time', 'ball_pos_x', 'ball_pos_y',
       'ball_pos_z', 'ball_vel_x', 'ball_vel_y', 'ball_vel_z', 'p0_pos_x',
       'p0_pos_y', 'p0_pos_z', 'p0_vel_x', 'p0_vel_y', 'p0_vel_z', 'p0_boost',
       'p1_pos_x', 'p1_pos_y', 'p1_pos_z', 'p1_vel_x', 'p1_vel_y', 'p1_vel_z',
       'p1_boost', 'p2_pos_x', 'p2_pos_y', 'p2_pos_z', 'p2_vel_x', 'p2_vel_y',
       'p2_vel_z', 'p2_boost', 'p3_pos_x', 'p3_pos_y', 'p3_pos_z', 'p3_vel_x',
       'p3_vel_y', 'p3_vel_z', 'p3_boost', 'p4_pos_x', 'p4_pos_y', 'p4_pos_z',
       'p4_vel_x', 'p4_vel_y', 'p4_vel_z', 'p4_boost', 'p5_pos_x', 'p5_pos_y',
       'p5_pos_z', 'p5_vel_x', 'p5_vel_y', 'p5_vel_z', 'p5_boost',
       'boost0_timer', 'boost1_timer', 'boost2_timer', 'boost3_timer',
       'boost4_timer', 'boost5_timer', 'player_scoring_next',
       'team_scoring_next', 'team_A_scoring_within_10sec',
       'team_B_scoring_within_10sec'],
      dtype='object')

### **Let's define some new features :**

- distance from the ball 
- absolute speed of the ball 
- minimum distance from a player to the goal 
- elevated before a goal or not ? 
- velocity magnitude of the players

#### Absolute speed of the ball

In [20]:
# absolute speed of the ball :

array1 = train['ball_vel_x'].values*train['ball_vel_x'].values + train['ball_vel_y'].values*train['ball_vel_y'].values + train['ball_vel_z'].values*train['ball_vel_z'].values


train["abs_ball_speed"]=[np.sqrt(i) for i in array1]

train.keys()


Index(['game_num', 'event_id', 'event_time', 'ball_pos_x', 'ball_pos_y',
       'ball_pos_z', 'ball_vel_x', 'ball_vel_y', 'ball_vel_z', 'p0_pos_x',
       'p0_pos_y', 'p0_pos_z', 'p0_vel_x', 'p0_vel_y', 'p0_vel_z', 'p0_boost',
       'p1_pos_x', 'p1_pos_y', 'p1_pos_z', 'p1_vel_x', 'p1_vel_y', 'p1_vel_z',
       'p1_boost', 'p2_pos_x', 'p2_pos_y', 'p2_pos_z', 'p2_vel_x', 'p2_vel_y',
       'p2_vel_z', 'p2_boost', 'p3_pos_x', 'p3_pos_y', 'p3_pos_z', 'p3_vel_x',
       'p3_vel_y', 'p3_vel_z', 'p3_boost', 'p4_pos_x', 'p4_pos_y', 'p4_pos_z',
       'p4_vel_x', 'p4_vel_y', 'p4_vel_z', 'p4_boost', 'p5_pos_x', 'p5_pos_y',
       'p5_pos_z', 'p5_vel_x', 'p5_vel_y', 'p5_vel_z', 'p5_boost',
       'boost0_timer', 'boost1_timer', 'boost2_timer', 'boost3_timer',
       'boost4_timer', 'boost5_timer', 'player_scoring_next',
       'team_scoring_next', 'team_A_scoring_within_10sec',
       'team_B_scoring_within_10sec', 'abs_ball_speed'],
      dtype='object')

In [21]:
train["abs_ball_speed"]

440953     27.916628
176360     23.706104
282629     28.322266
1765627    39.604359
1274297     9.061370
             ...    
1930896    32.447304
1393704    19.497972
274074     45.880203
824495     36.650852
1129841    42.290390
Name: abs_ball_speed, Length: 4239608, dtype: float32

In [22]:

#checking data for a given player
scorer_key = []
for i in train.keys():
  if "p3" in i:
    scorer_key.append(i)
print(scorer_key)

train[scorer_key]


['p3_pos_x', 'p3_pos_y', 'p3_pos_z', 'p3_vel_x', 'p3_vel_y', 'p3_vel_z', 'p3_boost']


Unnamed: 0,p3_pos_x,p3_pos_y,p3_pos_z,p3_vel_x,p3_vel_y,p3_vel_z,p3_boost
440953,5.090800,88.712997,0.340200,10.846200,-18.751200,0.0050,73.312500
176360,37.023800,77.138397,20.601599,12.437800,-17.912600,11.7516,65.250000
282629,-25.050600,-3.820200,0.340200,1.238600,-20.384001,0.0052,79.625000
1765627,61.479599,-15.574599,0.339800,-17.164000,41.382004,0.0096,89.000000
1274297,81.566803,71.893402,12.096999,-0.067800,-2.471600,33.4324,89.500000
...,...,...,...,...,...,...,...
1930896,37.681599,88.709808,0.340400,-4.387800,-20.267200,0.0030,33.718750
1393704,11.837399,98.623001,0.340200,-23.418201,7.431200,0.0052,87.812500
274074,20.112400,60.547997,0.340800,45.811001,4.070000,0.0074,87.812500
824495,8.249001,82.865402,18.618401,21.754400,-19.930601,-0.4798,16.421875


In [23]:
# grouping by player and ball cat : 

vel_groups = {
    f"{el}_vel": [f'{el}_vel_x', f'{el}_vel_y', f'{el}_vel_z']
    for el in ['ball'] + [f'p{i}' for i in range(6)]
}
pos_groups = {
    f"{el}_pos": [f'{el}_pos_x', f'{el}_pos_y', f'{el}_pos_z']
    for el in ['ball'] + [f'p{i}' for i in range(6)]
}
pos_groups


{'ball_pos': ['ball_pos_x', 'ball_pos_y', 'ball_pos_z'],
 'p0_pos': ['p0_pos_x', 'p0_pos_y', 'p0_pos_z'],
 'p1_pos': ['p1_pos_x', 'p1_pos_y', 'p1_pos_z'],
 'p2_pos': ['p2_pos_x', 'p2_pos_y', 'p2_pos_z'],
 'p3_pos': ['p3_pos_x', 'p3_pos_y', 'p3_pos_z'],
 'p4_pos': ['p4_pos_x', 'p4_pos_y', 'p4_pos_z'],
 'p5_pos': ['p5_pos_x', 'p5_pos_y', 'p5_pos_z']}

#### Distance from the ball

In [24]:
#distance from the ball: 
def euclidian_dist(x):
    return np.linalg.norm(x, axis=1)

# euclidian_dist(train[pos_groups["p0_pos"]].values - train[pos_groups["ball_pos"]].values)
# # train['p0_pos_x']
# # print(euclidian_dist(train['ball_pos_x'].values-train['p0_pos_x'].values))

for col, vec in pos_groups.items():
  print(vec)
  train[col + '_dist_ball'] = euclidian_dist(train[vec].values - train[pos_groups["ball_pos"]].values)






['ball_pos_x', 'ball_pos_y', 'ball_pos_z']
['p0_pos_x', 'p0_pos_y', 'p0_pos_z']
['p1_pos_x', 'p1_pos_y', 'p1_pos_z']
['p2_pos_x', 'p2_pos_y', 'p2_pos_z']
['p3_pos_x', 'p3_pos_y', 'p3_pos_z']
['p4_pos_x', 'p4_pos_y', 'p4_pos_z']
['p5_pos_x', 'p5_pos_y', 'p5_pos_z']


In [25]:
game_1 = train[train["game_num"]==1]

# check at what distance from the ball was the player that scores before shooting. 
min((x) for x in game_1['p3_pos_dist_ball'])


3.6830060482025146

#### Velocity Magnitude

In [26]:

# velocity magnitude
for col, vec in vel_groups.items():
    train[col] = euclidian_dist(train[vec])


#### Minimum distance from the goal 

In [27]:
# was there a goal in cage ? 
# minimum distance from the goal
# we are going to assume that there was no goal if the closest palyer from the ball is the one who ends up scoring. 
#Is there a 'goal keeper', i.e. a player defending in goal, or is it an open goal? The right feature for this would be the minimum distance of a player to the goal.

#lets compute minimum distance from the ball at time 


# pas encore fait



In [28]:
# who is the closest to the ball ? 


#### Elevation of the scoring player

In [29]:
# to do 

# Feature Reduction 

We might want to use algorithms for feature reductions ? 

In [30]:
# # Standarise the Data
# X_org = image_matrix.copy()
# sc = StandardScaler()
# X = sc.fit_transform(X_org)
# # this is the size of our encoded representations
# encoding_dim = reduced_pixel 
# # this is our input placeholder
# input_img = Input(shape=(img.width,))
# # "encoded" is the encoded representation of the input
# encoded = Dense(encoding_dim, activation='linear')(input_img)
# # "decoded" is the lossy reconstruction of the input
# decoded = Dense(img.width, activation=None)(encoded)
# # this model maps an input to its reconstruction
# autoencoder = Model(input_img, decoded)
# #Encoder
# encoder = Model(input_img, encoded)
# # create a placeholder for an encoded (32-dimensional) input
# encoded_input = Input(shape=(encoding_dim,))
# # retrieve the last layer of the autoencoder model
# decoder_layer = autoencoder.layers[-1]
# # create the decoder model
# decoder = Model(encoded_input, decoder_layer(encoded_input))
# autoencoder.compile(optimizer='adadelta', loss='mean_squared_error')
# autoencoder.fit(X, X,
#                 epochs=500,
#                 batch_size=16,
#                 shuffle=True)
# encoded_imgs = encoder.predict(X)
# decoded_imgs = decoder.predict(encoded_imgs)

#  data cleaning

In [31]:
train.keys()

Index(['game_num', 'event_id', 'event_time', 'ball_pos_x', 'ball_pos_y',
       'ball_pos_z', 'ball_vel_x', 'ball_vel_y', 'ball_vel_z', 'p0_pos_x',
       'p0_pos_y', 'p0_pos_z', 'p0_vel_x', 'p0_vel_y', 'p0_vel_z', 'p0_boost',
       'p1_pos_x', 'p1_pos_y', 'p1_pos_z', 'p1_vel_x', 'p1_vel_y', 'p1_vel_z',
       'p1_boost', 'p2_pos_x', 'p2_pos_y', 'p2_pos_z', 'p2_vel_x', 'p2_vel_y',
       'p2_vel_z', 'p2_boost', 'p3_pos_x', 'p3_pos_y', 'p3_pos_z', 'p3_vel_x',
       'p3_vel_y', 'p3_vel_z', 'p3_boost', 'p4_pos_x', 'p4_pos_y', 'p4_pos_z',
       'p4_vel_x', 'p4_vel_y', 'p4_vel_z', 'p4_boost', 'p5_pos_x', 'p5_pos_y',
       'p5_pos_z', 'p5_vel_x', 'p5_vel_y', 'p5_vel_z', 'p5_boost',
       'boost0_timer', 'boost1_timer', 'boost2_timer', 'boost3_timer',
       'boost4_timer', 'boost5_timer', 'player_scoring_next',
       'team_scoring_next', 'team_A_scoring_within_10sec',
       'team_B_scoring_within_10sec', 'abs_ball_speed', 'ball_pos_dist_ball',
       'p0_pos_dist_ball', 'p1_pos_dist_b

In [32]:
#Visualisation of new fe@atures : 
## visualisation
new_cols = ['abs_ball_speed', 'ball_pos_dist_ball',
       'p0_pos_dist_ball', 'p1_pos_dist_ball', 'p2_pos_dist_ball',
       'p3_pos_dist_ball', 'p4_pos_dist_ball', 'p5_pos_dist_ball', 'ball_vel',
       'p0_vel', 'p1_vel', 'p2_vel', 'p3_vel', 'p4_vel', 'p5_vel']


import plotly.express as px
import plotly.graph_objects as go
from plotly import subplots

def plot_distributions(df, row_count, col_count, title, height,frac = 0.00005):
    df = df.sample(frac = frac)
    features = df.columns
    fig = subplots.make_subplots(
        rows=row_count, cols=col_count,
        subplot_titles=features
    )

    for k, col in enumerate(features):
        i, j = int_to_grid_coord(k, col_count)

        fig.add_trace(
            go.Histogram(
                x=df[col].astype('float32'),
                name=col
            ),
            row=i, col=j
        )

    fig.update_layout(
        title=title,
        height=row_count * height,
        showlegend=False
    )

    return fig

plot_distributions(train[new_cols],5,3,"first visualisation of inputs",300)

We drop 'game_num', 'event_id', 'event_time" since they are not useful. 
we drop   'player_scoring_next', 'team_scoring_next''team_A_scoring_within_10sec',
                     'team_B_scoring_within_10sec' since they are data leakage/label
                     
FInally we also drop these features that we had compute since they are either not relevant or present twice in dataset "abs_ball_speed",'ball_pos_dist_ball


                   

In [33]:
def preprocess_data(df):
  df = df.dropna().copy()

  return ({ 'A': df['team_A_scoring_within_10sec'], 'B': df['team_B_scoring_within_10sec'] },
            df.drop(['game_num', 'event_id', 'event_time',
                     'player_scoring_next', 'team_scoring_next',
                     'team_A_scoring_within_10sec',
                     'team_B_scoring_within_10sec',"abs_ball_speed",'ball_pos_dist_ball',], axis=1))


In [34]:
test=test.drop(["id"],axis=1)

In [35]:
# drop some colums : 
y_train,X_train = preprocess_data(train)

##### train and test data

In [36]:
ic(y_train["A"].shape)
ic(y_train['B'].shape)
ic(X_train.shape)
ic(test.shape)

ic| y_train["A"].shape: (3108785,)
ic| y_train['B'].shape: (3108785,)
ic| X_train.shape: (3108785, 67)
ic| test.shape: (701143, 54)


(701143, 54)

# Using XGBoost 


Since these are tabular data we think to XGboosting

In [37]:

import xgboost as xgb
from sklearn.model_selection import cross_validate  # k-fold Cross Validation
from sklearn.preprocessing import LabelEncoder 

*We'll be predicting the probability of team A scoring and team B scoring with 2 separate models.*
XGBoost n'est pas le meilleur avec du suréchantillonage. Il faudra surement réduire le nombre de donnnées pour éviter les éccueils. 



#### model A 


In [38]:
xgb.XGBClassifier??

In [41]:
# training and cross validation

N_ESTIMATORS = 2000
MAX_DEPTH = 8
LEARNING_RATE = 0.01
N_SPLITS = 5
my_seed = 42

param={}
param['booster'] = 'dart'
param['max_depth']=MAX_DEPTH
param["learning_rate"]=LEARNING_RATE
param["n_estimators"]=N_ESTIMATORS
param['objective']="binary:logistic"
param["tree_method"]='gpu_hist'
param["subsample"]=0.1


model_a = xgb.XGBClassifier()
model_a.set_params(**param)


scoring = 'neg_log_loss'
FOLDS = 5

dart


In [None]:
import pprint
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def run_model(model,X,y,test): 
  scores = []
  test_predictions = []
  cv = KFold(n_splits=N_SPLITS, random_state=my_seed, shuffle=True)

  for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    train_X, val_X = X.iloc[train_idx], X.iloc[test_idx]
    train_y, val_y = y.iloc[train_idx], y.iloc[test_idx]


    model.fit(train_X,train_y)
    predictions = model.predict_proba(val_X)
    score = roc_auc_score(val_y, predictions)
    scores.append(score)
    print(f"Fold {fold + 1} \t\t AUC: {score}")

    test_predictions.append(model.predict_proba(test))
    del model 
    gc.collect()



## use a sample of data again : 


run_model(model_a,X_train,y_train["A"],test)

# Online Learning ?

## Dummy training with small CNN 

In [None]:
import torch
from torch.functional import norm
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,random_split,TensorDataset
from pathlib import Path
from torch import optim
import time
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from icecream import ic
import optuna
from optuna.integration import PyTorchLightningPruningCallback
import datetime
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')




In [None]:
train_X_tensor=torch.tensor(train_X.values)
train_X_tensor.shape
train_y_A_tensor=torch.tensor(train_y['A'].values)
train_y_B_tensor=torch.tensor(train_y['B'].values)

In [None]:
#### build dataset
from torch.utils.data import Dataset, DataLoader

class RocketDataset(Dataset):
  """
  This is a custom dataset class. It can get more complex than this, but simplified so you can understand what's happening here without
  getting bogged down by the preprocessing
  """
  def __init__(self, X, Y):
    self.X = X
    print(X.shape)
    self.Y = Y
    print(Y.shape)
    if len(self.X) != len(self.Y):
      raise Exception("The length of X does not match the length of Y")

  def __len__(self):
    return len(self.X)

  def __getitem__(self, index):
    # note that this isn't randomly selecting. It's a simple get a single item that represents an x and y
      _x = self.X[index]
      _y = self.Y[index]
  
      return _x, _y



loader_A = DataLoader(RocketDataset(train_X_tensor, train_y_A_tensor), batch_size=32, shuffle=True)
loader_B= DataLoader(RocketDataset(train_X_tensor, train_y_B_tensor), batch_size=32, shuffle=True)
loader_train = {'A':loader_A,'B':loader_B}

for i in range(0, 5):
  # next x and next y
  x, y = next(iter(loader_A))
  print(x)
  print(y)

In [None]:
ic(train_y["A"].shape)
ic(train_y['B'].shape)
ic(train_X.shape)
ic(test.shape)

In [None]:
BATCH_SIZE=32
class training_global(nn.Module): 
    def __init__(self,criterion,device,opt,weight_decay,regularization=None,ckpt_save_path=None):
        super().__init__()
        self.criterion = criterion
        self.device = device
        self.regularization = regularization
        self.optimizer = opt
        self.state={}
        self.ckpt_save_path= ckpt_save_path
        self.weight_decay = weight_decay
    
    def __train_epoch(self,traindata,epoch):
        epoch_train_loss = 0
        epoch_train_acc  = 0 
        epoch_test_loss  = 0
        epoch_test_acc   = 0 
        for idx, data in enumerate(traindata):
            
            self.opt.zero_grad()
            input,label = data
            
            input = input.reshape(-1,self.input_size)
            b_size,n_features = input.shape
            if b_size==BATCH_SIZE:
                output = self(input,'Train',epoch) # self c'est le modèle
  
                # output = output.type(torch.LongTensor)
                label = label.type(torch.LongTensor)

                loss = self.criterion(output,label)

                if self.regularization == 'L1':
                    l1_lambda = 0.007253
                    l1_penalty = sum(p.abs().sum() for p in self.parameters())
                    loss = loss + l1_lambda * l1_penalty
                if self.regularization =="L2":
                    l2_lambda = 0.07253
                    l2_penalty = sum(p.pow(2.0).sum() for p in self.parameters())
                    loss = loss + l2_lambda*l2_penalty

                n_correct = (torch.argmax(output,dim=1)==label).sum().item()
                total     = label.size(0)
            
                epoch_train_acc += n_correct/total
                loss.backward()
                self.opt.step()
                epoch_train_loss += loss.item()

        # with torch.no_grad():
        #     for idx,data in enumerate(testdata):
        #         input,label = data
        #         input = input.reshape(-1,self.input_size)
        #         b_size,n_features = input.shape
        #         if b_size==BATCH_SIZE:
        #             output = self(input,'Test',epoch) # self c'est le modèle
        #             loss = self.criterion(output,label)
        #             n_correct = (torch.argmax(output,dim=1)==label).sum().item()
        #             total     = label.size(0)
        #             epoch_test_acc += n_correct/total
        #             epoch_test_loss += loss.item()
        


        return epoch_train_loss/len(traindata),epoch_train_acc/len(traindata)

    def __validate(self, dataloader,epoch):
        epoch_loss = 0
        epoch_acc  = 0
        for idx, data in enumerate(dataloader):
            self.opt.zero_grad()
            input,label = data
            
            input = input.reshape(-1,self.input_size)
            b_size,n_features = input.shape
            if b_size==BATCH_SIZE:
                output = self(input,'val',epoch) # self c'est le modèle
                loss = self.criterion(output,label)
                n_correct = (torch.argmax(output,dim=1)==label).sum().item()
                total     = label.size(0)
                epoch_acc += n_correct/total
                epoch_loss += loss.item()

            
        return epoch_loss/len(dataloader), epoch_acc/len(dataloader)


    def fit(self, traindata,testdata=None,validation_data=None,batch_size=300, start_epoch=0, n_epochs=1000, lr=0.001, verbose=10,ckpt=None):
        ic(lr)
        
        parameters = self.parameters()
        if self.optimizer=="SGD":
            self.opt = optim.SGD(parameters,lr=lr,momentum=0.9)
        if self.optimizer=='Adam':
            self.opt = torch.optim.Adam(parameters, lr=lr, weight_decay=self.weight_decay, amsgrad=False)
        start_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        
      

        if ckpt:
            state = torch.load(ckpt)
            start_epoch = state['epoch']
            self.load_state_dict(state['state_dict'])
            for g in self.opt.param_groups:
                g['lr'] = state['lr']
            
        for epoch in range(start_epoch,n_epochs):
            epoch_train_loss,epoch_train_acc = self.__train_epoch(traindata,epoch)
            print(f'\n Epoch {epoch+1} \n',
                        f'Train Loss= {epoch_train_loss:.4f}\n',f'Train Acc={epoch_train_acc:.4f}\n')
            
            

            # if epoch % 10 ==0 : 
            #     self.__weights_histo(epoch) #using weight histograms to have the weights of each linear layer. 
            # if validation_data is not None:
            #     with torch.no_grad():
            #         val_loss, val_acc = self.__validate(validation_data,epoch)
            #     print('Epoch {:2d} loss_val: {:1.4f}  val_acc: {:1.4f} '.format(epoch+1, val_loss, val_acc))
  
            if self.ckpt_save_path:
                self.state['lr'] = lr
                self.state['epoch'] = epoch
                self.state['state_dict'] = self.state_dict()
                if not os.path.exists(self.ckpt_save_path):
                    os.mkdir(self.ckpt_save_path)
                torch.save(self.state, os.path.join(self.ckpt_save_path, f'ckpt_{start_time}_epoch{epoch}.ckpt'))

   

In [None]:


class Net(training_global):
    def __init__(self,input_size,hidden_dim,output_size,criterion,device,opt,dropout, regularization=None,ckpt_save_path='//content/drive/MyDrive/Colab/HEC_MAJEUR/AI_ADVANCED/tabular-playground-series-oct-2022/runs',weight_decay=0):
        super(Net,self).__init__(regularization = regularization,criterion =criterion,device = device,opt =  opt,ckpt_save_path= ckpt_save_path,weight_decay=weight_decay)
        self.input_size = input_size
        self.output_size = output_size
        self.linear1 = nn.Linear(input_size,hidden_dim)
        self.linear2 = nn.Linear(hidden_dim,hidden_dim)
        self.linear3 = nn.Linear(hidden_dim,output_size)
        self.activation = nn.ReLU()
        self.grads={}
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)
        

    def forward(self,x,mode,epoch):
        "model changed to add dropout, regularisation and batchnorm"
        x1 = self.linear1(x)
        x1 = self.dropout1(x1)
        x2 = self.linear2(x1)
        x2 = self.dropout2(x2)
        out   = self.activation(self.linear3(x2))
        return out 
            



In [None]:
criterion = torch.nn.CrossEntropyLoss()
regularization = "L1"
net = Net(input_size=train_X.shape[1],hidden_dim=100,output_size=2,device = device,criterion=criterion,regularization = None,dropout=0.3, opt='Adam',weight_decay = 0.009348818325128058)


      

In [None]:
test_predictions = {'A': [], 'B': [] }
for key in test_predictions:

  net.fit(loader[key])

# XGBoost for tabular data