# Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR

# Load data

In [2]:
train_data = pd.read_csv('../Datasets/train_V2.csv')

# Data Fields
DBNOs - Number of enemy players knocked.

assists - Number of enemy players this player damaged that were killed by teammates.

boosts - Number of boost items used.

damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.

headshotKills - Number of enemy players killed with headshots.

heals - Number of healing items used.

Id - Player’s Id

killPlace - Ranking in match of number of enemy players killed.

killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.

killStreaks - Max number of enemy players killed in a short amount of time.

kills - Number of enemy players killed.

longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.

matchDuration - Duration of match in seconds.

matchId - ID to identify match. There are no matches that are in both the training and testing set.

matchType - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.

rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.

revives - Number of times this player revived teammates.

rideDistance - Total distance traveled in vehicles measured in meters.

roadKills - Number of kills while in a vehicle.

swimDistance - Total distance traveled by swimming measured in meters.

teamKills - Number of times this player killed a teammate.

vehicleDestroys - Number of vehicles destroyed.

walkDistance - Total distance traveled on foot measured in meters.

weaponsAcquired - Number of weapons picked up.

winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.

groupId - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.

numGroups - Number of groups we have data for in the match.

maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.

winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

# Pre-processing data

**1.Delete sample with missing data(-1)**

**2.Delete irrelevant data such as ID**

**3.Do One-Hot Encoding for "matchType"**

**4.Delete inconsistent values such as rankPoints**

In [3]:
# View the data
train_data

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.00,0,0,0,60,...,0,0.0000,0,0.000,0,0,244.80,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.040,0,0,1434.00,5,0,0.6400
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.00,0,0,0,47,...,0,0.0000,0,0.000,0,0,161.80,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.90,0,0,0,75,...,0,0.0000,0,0.000,0,0,202.70,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.00,0,0,0,45,...,0,0.0000,0,0.000,0,0,49.75,2,0,0.1875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446961,afff7f652dbc10,d238e426f50de7,18492834ce5635,0,0,0.00,0,0,0,74,...,0,1292.0000,0,0.000,0,0,1019.00,3,1507,0.1786
4446962,f4197cf374e6c0,408cdb5c46b2ac,ee854b837376d9,0,1,44.15,0,0,0,69,...,0,0.0000,0,0.000,0,0,81.70,6,0,0.2935
4446963,e1948b1295c88a,e26ac84bdf7cef,6d0cd12784f1ab,0,0,59.06,0,0,0,66,...,0,0.0000,0,2.184,0,0,788.70,4,0,0.4815
4446964,cc032cdd73b7ac,c2223f35411394,c9c701d0ad758a,0,4,180.40,1,1,2,11,...,2,0.0000,0,0.000,0,0,2748.00,8,0,0.8000


## One-Hot Encoding specification
\begin{align*}solo &= 1000000000000000 \\
duo &= 0100000000000000\\
squad &= 0010000000000000\\
solo-fpp &= 0001000000000000\\
duo-fpp &= 0000100000000000\\
squad-fpp &= 0000010000000000\\
normal-solo-fpp &= 0000001000000000\\
crashfpp &= 0000000100000000\\
flaretpp &= 0000000010000000\\
normal-squad-fpp &= 0000000001000000\\
normal-duo-fpp &= 0000000000100000\\
normal-squad &= 0000000000010000\\
flarefpp &= 0000000000001000\\
crashtpp &= 0000000000000100\\
normal-duo &= 0000000000000010\\
normal-solo &= 0000000000000001
\end{align*}









In [4]:
def filterPUBG(train_data):
    
    #Delete irrelevant data: Id, GroupId, matchId
    #Delete inconsistent data (for now): rankPoints, killPoints, winPoints
    train_value = train_data.drop(["Id","groupId","matchId","rankPoints","killPoints","winPoints"],axis=1)

    #Delete the data with missing values (-1)
    train_value = train_value[train_value.select_dtypes(include=[np.number]).ge(0).all(1)]
    
    #Extract the target values
    y = train_value["winPlacePerc"]
    
    #Delete winPlacePerc
    filtered = train_value.drop(["winPlacePerc"],axis=1)
    
    #Do One-Hot Encoding for "matchType"
    X = filtered.replace(
        ["solo", "duo", "squad", 
         "solo-fpp", "duo-fpp", "squad-fpp",
         "normal-solo-fpp","crashfpp","flaretpp",
         "normal-squad-fpp","normal-duo-fpp","normal-squad",
         "flarefpp","crashtpp","normal-duo",
         "normal-solo"],
        ["1000000000000000", "0100000000000000", "0010000000000000", 
         "0001000000000000", "0000100000000000", "0000010000000000",
         "0000001000000000", "0000000100000000", "0000000010000000",
         "0000000001000000", "0000000000100000", "0000000000010000",
         "0000000000001000", "0000000000000100", "0000000000000010",
         "0000000000000001"]
    )
    
    return X,y,filtered

In [5]:
X,y,filtered = filterPUBG(train_data)
X

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,kills,killStreaks,longestKill,...,maxPlace,numGroups,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired
0,0,0,0.00,0,0,0,60,0,0,0.00,...,28,26,0,0.0000,0,0.000,0,0,244.80,1
1,0,0,91.47,0,0,0,57,0,0,0.00,...,26,25,0,0.0045,0,11.040,0,0,1434.00,5
2,1,0,68.00,0,0,0,47,0,0,0.00,...,50,47,0,0.0000,0,0.000,0,0,161.80,2
3,0,0,32.90,0,0,0,75,0,0,0.00,...,31,30,0,0.0000,0,0.000,0,0,202.70,3
4,0,0,100.00,0,0,0,45,1,1,58.53,...,97,95,0,0.0000,0,0.000,0,0,49.75,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446961,0,0,0.00,0,0,0,74,0,0,0.00,...,29,28,0,1292.0000,0,0.000,0,0,1019.00,3
4446962,0,1,44.15,0,0,0,69,0,0,0.00,...,93,93,0,0.0000,0,0.000,0,0,81.70,6
4446963,0,0,59.06,0,0,0,66,0,0,0.00,...,28,28,0,0.0000,0,2.184,0,0,788.70,4
4446964,0,4,180.40,1,1,2,11,2,1,98.50,...,26,25,2,0.0000,0,0.000,0,0,2748.00,8


In [6]:
#Randomly pick 80% samples as training set, 20% samples as test set
X_train,X_test,y_train,y_test = train_test_split(X.to_numpy(),y.to_numpy(),test_size = 0.2,random_state = 200)

In [9]:
print(X_train)

[[0 0 0.0 ... 0 1549.0 5]
 [0 1 253.0 ... 0 248.9 2]
 [0 2 173.0 ... 0 2356.0 4]
 ...
 [0 0 60.86 ... 0 17.35 1]
 [1 3 462.6 ... 0 2089.0 3]
 [0 0 0.0 ... 0 874.0 3]]


# Initial result

In [7]:
#From scikit-learn algorithm cheat-sheet
#>50 samples --> predicating a quantity --> larger than 100k samples
#So use SGD Regressor
sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3))
sgd.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor())])

In [8]:
preds = sgd.predict(X_test)
MAE = mean_absolute_error(y_test,preds)
print(MAE)

0.0929849279584357


## matchType specification
\begin{align*}solo &= ranked-solo \\
normal-solo &= unranked-solo\\
duo &= ranked-duo\\
normal-duo &= unranked-duo\\
squad &= ranked-squad\\
normal-squad &= unranked-squad\\
solo-fpp &= ranked-solo-fpp\\
normal-solo-fpp &= unranked-solo-fpp\\
duo-fpp &= ranked-duo-fpp\\
normal-duo-fpp &= unranked-duo-fpp\\
squad-fpp &= ranked-squad-fpp\\
normal-squad-fpp &= unranked-squad-fpp\\
crashfpp &= Crash-Carnage-fpp(event-mode)\\
crashtpp &= Crash-Carnage-tpp(event-mode)\\
flaretpp &= Metal-Rain-tpp(event-mode)\\
flarefpp &= Metal-Rain-fpp(event-mode)
\end{align*}

## View the proportion of each mode in the dataset

In [9]:
#Original dataset
matchTypes = pd.DataFrame({'counts': train_data.groupby(('matchType')).size(),
                             'Percentage': train_data.groupby(('matchType')).size() / len(train_data)})

In [10]:
matchTypes

Unnamed: 0_level_0,counts,Percentage
matchType,Unnamed: 1_level_1,Unnamed: 2_level_1
crashfpp,6287,0.001414
crashtpp,371,8.3e-05
duo,313591,0.070518
duo-fpp,996691,0.224128
flarefpp,718,0.000161
flaretpp,2505,0.000563
normal-duo,199,4.5e-05
normal-duo-fpp,5489,0.001234
normal-solo,326,7.3e-05
normal-solo-fpp,1682,0.000378


In [11]:
#Filtered dataset
X,y,filtered = filterPUBG(train_data)

In [12]:
matchTypes_ = pd.DataFrame({'counts': filtered.groupby(('matchType')).size(),
                             'Percentage': filtered.groupby(('matchType')).size() / len(filtered)})
matchTypes_

Unnamed: 0_level_0,counts,Percentage
matchType,Unnamed: 1_level_1,Unnamed: 2_level_1
crashfpp,6287,0.001414
crashtpp,371,8.3e-05
duo,313591,0.070518
duo-fpp,996691,0.224128
flarefpp,718,0.000161
flaretpp,2505,0.000563
normal-duo,199,4.5e-05
normal-duo-fpp,5489,0.001234
normal-solo,326,7.3e-05
normal-solo-fpp,1682,0.000378


In [13]:
#Need to discuss about crashtpp, normal-duo, normal-solo, normal-squad

# SGD regressor

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html

class sklearn.linear_model.SGDRegressor
(

    loss='squared_loss', //The loss function to be used. The possible values are ‘squared_loss’, ‘huber’, ‘epsilon_insensitive’, or ‘squared_epsilon_insensitive’

    *, //not sure about this one

    penalty='l2', //The penalty (aka regularization term) to be used. Defaults to ‘l2’ which is the standard regularizer for linear SVM models. ‘l1’ and ‘elasticnet’ might bring sparsity to the model (feature selection) not achievable with ‘l2’

    alpha=0.0001, //Constant that multiplies the regularization term. The higher the value, the stronger the regularization. Also used to compute the learning rate when set to learning_rate is set to ‘optimal’.

    l1_ratio=0.15, //The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. Only used if penalty is ‘elasticnet’. So it is NOT used in L2 regularization

    fit_intercept=True, //Whether the intercept should be estimated or not. If False, the data is assumed to be already centered.

    max_iter=1000, //The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the fit method, and not the partial_fit method.

    tol=0.001, //The stopping criterion. If it is not None, training will stop when (loss > best_loss - tol) for n_iter_no_change consecutive epochs.

    shuffle=True, //Whether or not the training data should be shuffled after each epoch.

    verbose=0, //The verbosity level.

    epsilon=0.1, //Epsilon in the epsilon-insensitive loss functions; only if loss is ‘huber’, ‘epsilon_insensitive’, or ‘squared_epsilon_insensitive’.  So it is NOT used in squared_loss.

    random_state=None, //Used for shuffling the data, when shuffle is set to True. Pass an int for reproducible output across multiple function calls. 

    learning_rate='invscaling', 
                           //‘constant’: eta = eta0

                           //‘optimal’: eta = 1.0 / (alpha * (t + t0)) where t0 is chosen by a heuristic proposed by Leon Bottou.

                           //‘invscaling’: eta = eta0 / pow(t, power_t)

                           //‘adaptive’: eta = eta0, as long as the training keeps decreasing. Each time n_iter_no_change consecutive epochs fail to decrease the training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5.

    eta0=0.01, //The initial learning rate for the ‘constant’, ‘invscaling’ or ‘adaptive’ schedules. 

    power_t=0.25, //The exponent for inverse scaling learning rate.

    early_stopping=False, //Whether to use early stopping to terminate training when validation score is not improving.
    
    validation_fraction=0.1, //The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True.

    n_iter_no_change=5, //Number of iterations with no improvement to wait before early stopping.

    warm_start=False, //When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.
    
    average=False, //When set to True, computes the averaged SGD weights accross all updates and stores the result in the coef_ attribute. If set to an int greater than 1, averaging will begin once the total number of samples seen reaches average.
       
)

## Apply cross-validation

In [14]:
eta0s = [0.1,0.01,0.001,0.0001]
for eta in eta0s:
    sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3,eta0=eta))
    scores = cross_val_score(sgd, X_train, y_train, cv=5)
    print("eta0: ", eta, "scores: ", scores)

eta0:  0.1 scores:  [0.82655261 0.82560644 0.82743653 0.8272595  0.81454323]
eta0:  0.01 scores:  [0.82942873 0.82913242 0.82965194 0.82900976 0.82964177]
eta0:  0.001 scores:  [0.82967246 0.82914793 0.82990016 0.82927716 0.82983354]
eta0:  0.0001 scores:  [0.82901597 0.82854253 0.82927543 0.82853198 0.82914911]


In [15]:
##Not sure about which Hyper-parameter is more important.

## Investigate  cross_val_score

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html?highlight=cross_val_score#sklearn.model_selection.cross_val_score

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor.score

There is a parameter called scoring:

    scoring: str or callable, default=None

    A str (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y) which should return only a single value.

    If None, the estimator’s default scorer (if available) is used.

SGDRegressor's default scorer is R^2 score:

    Return the coefficient of determination R^2 of the prediction.

    The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.

## Change the scoring to MAE then apply cross-validation again 

In [22]:
#Try max_iter = 1000
eta0s = [0.1,0.01,0.001,0.0001]
error = 0
for eta in eta0s:
    sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3,eta0=eta))
    scores = cross_val_score(sgd, X_train, y_train, cv=5,scoring = "neg_mean_absolute_error")
    print("eta0: ", eta, "scores: ", -scores," average: ",sum(-scores)/5)

eta0:  0.1 scores:  [0.09347237 0.09348396 0.09439204 0.09454859 0.09514487]  average:  0.09420836607051639
eta0:  0.01 scores:  [0.0929272  0.09295518 0.09303928 0.09300186 0.09288966]  average:  0.09296263756026216
eta0:  0.001 scores:  [0.09276496 0.09292791 0.09277639 0.0928977  0.09287261]  average:  0.09284791382376685
eta0:  0.0001 scores:  [0.09297381 0.09305983 0.09296249 0.09311636 0.09302999]  average:  0.0930284945591839


In [24]:
#Try max_iter = 2000
eta0s = [0.1,0.01,0.001,0.0001]
for eta in eta0s:
    sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=2000, tol=1e-3,eta0=eta))
    scores = cross_val_score(sgd, X_train, y_train, cv=5,scoring = "neg_mean_absolute_error")
    print("eta0: ", eta, "scores: ", -scores," average: ",sum(-scores)/5)

eta0:  0.1 scores:  [0.09337347 0.09461451 0.11903918 0.09590951 0.09331909]  average:  0.0992511523536833
eta0:  0.01 scores:  [0.09271423 0.09314917 0.09284716 0.09299294 0.09303561]  average:  0.09294782264255796
eta0:  0.001 scores:  [0.09289242 0.09292184 0.09274406 0.09300078 0.09289947]  average:  0.09289171451434135
eta0:  0.0001 scores:  [0.09298131 0.09307905 0.09293143 0.09313365 0.09300081]  average:  0.09302525035525908


It seems that when eta0 = 0.001 the regressor will perform a bit better.

In [25]:
#Try alpha
alphas = [0.000001,0.00005,0.0001,0.00015,0.0002,0.00025,0.0005,0.001,0.01]
for alpha_ in alphas:
    sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=2000, tol=1e-3,eta0=0.001,alpha=alpha_))
    scores = cross_val_score(sgd, X_train, y_train, cv=5,scoring = "neg_mean_absolute_error")
    print("alpha: ", alpha_, "scores: ", -scores," average: ",sum(-scores)/5)

alpha:  1e-06 scores:  [0.09275668 0.09287451 0.09279146 0.09292681 0.09283932]  average:  0.09283775628682625
alpha:  5e-05 scores:  [0.09277116 0.092903   0.0928251  0.09294457 0.09286643]  average:  0.09286205048044788
alpha:  0.0001 scores:  [0.0927717  0.09289647 0.09288666 0.09292952 0.09280557]  average:  0.09285798442135645
alpha:  0.00015 scores:  [0.09284416 0.09292879 0.09282329 0.09298804 0.0928309 ]  average:  0.09288303542191385
alpha:  0.0002 scores:  [0.09285666 0.09291746 0.09283925 0.09298863 0.09283987]  average:  0.09288837298930099
alpha:  0.00025 scores:  [0.09277082 0.09301445 0.09280954 0.09297799 0.09281642]  average:  0.09287784249231715
alpha:  0.0005 scores:  [0.09287744 0.09294873 0.09269053 0.09292426 0.09282172]  average:  0.09285253680882619
alpha:  0.001 scores:  [0.09287984 0.09300963 0.09291657 0.09300496 0.09281072]  average:  0.09292434387800644
alpha:  0.01 scores:  [0.09331087 0.0934096  0.0931762  0.0934284  0.09350053]  average:  0.0933651186027

## Try early stopping

Early stopping is based on the default scorer which is R^2 score, so it may not perfom good since MAE is needed.

In [26]:
#eta0
eta0s = [0.1,0.01,0.001,0.0001]
for eta in eta0s:
    sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=10000, tol=1e-3,eta0=eta,early_stopping = True))
    scores = cross_val_score(sgd, X_train, y_train, cv=5,scoring = "neg_mean_absolute_error")
    print("eta0: ", eta, "scores: ", -scores," average: ",sum(-scores)/5)

eta0:  0.1 scores:  [0.09538977 0.09430707 0.0946585  0.09309383 0.09468387]  average:  0.09442660845508352
eta0:  0.01 scores:  [0.09274493 0.0931895  0.09302399 0.09299061 0.09310731]  average:  0.09301127117764274
eta0:  0.001 scores:  [0.09283578 0.09289118 0.09276602 0.09286069 0.09275853]  average:  0.09282244205005898
eta0:  0.0001 scores:  [0.09301605 0.09300217 0.0928898  0.09304443 0.0929877 ]  average:  0.0929880284818632


In [27]:
#Alpha
alphas = [0.000001,0.00005,0.0001,0.00015,0.0002,0.00025,0.0005,0.001,0.01]
for alpha_ in alphas:
    sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=10000, tol=1e-3,eta0=0.001,alpha=alpha_, early_stopping=True))
    scores = cross_val_score(sgd, X_train, y_train, cv=5,scoring = "neg_mean_absolute_error")
    print("alpha: ", alpha_,"scores: ", -scores," average: ",sum(-scores)/5)

alpha:  1e-06 scores:  [0.0927708  0.09292276 0.09270619 0.0929284  0.09281537]  average:  0.0928287039351237
alpha:  5e-05 scores:  [0.09283786 0.09284199 0.0927457  0.09293961 0.092753  ]  average:  0.09282363152036885
alpha:  0.0001 scores:  [0.09273756 0.09291997 0.09276826 0.09295759 0.09288493]  average:  0.09285366343197693
alpha:  0.00015 scores:  [0.09275608 0.09297911 0.09276121 0.09288308 0.09287019]  average:  0.0928499365738242
alpha:  0.0002 scores:  [0.09279469 0.09291812 0.09277131 0.09286654 0.09283049]  average:  0.09283622833627704
alpha:  0.00025 scores:  [0.0927884  0.0928471  0.09273737 0.09291234 0.09280205]  average:  0.09281745368426572
alpha:  0.0005 scores:  [0.09282217 0.09288901 0.09275885 0.09288244 0.09283171]  average:  0.09283683686436378
alpha:  0.001 scores:  [0.09282138 0.09298407 0.09286746 0.09294257 0.09276316]  average:  0.09287572767390603
alpha:  0.01 scores:  [0.09340863 0.09345301 0.09319204 0.09349977 0.09327887]  average:  0.093366463639657

The difference is not significant, but eta0 = 0.001 and alpha = 0.000001 seem to be good

In [37]:
#Original sgd
sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3))
sgd.fit(X_train, y_train)
preds = sgd.predict(X_test)
MAE = mean_absolute_error(y_test,preds)
print(MAE)

0.09289768608374681


In [36]:
#Use the hyper-parameters to get the model
sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=10000, tol=1e-3,eta0=0.001,alpha=0.000001))
sgd.fit(X_train, y_train)
preds = sgd.predict(X_test)
MAE = mean_absolute_error(y_test,preds)
print(MAE)

0.09270853927040586


### How many patterns from the training set are seen during one iteration?

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor.score

Linear model fitted by minimizing a regularized empirical loss with SGD

SGD stands for Stochastic Gradient Descent: **the gradient of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate)**. *So there is only one pattern per iteration if I understand it correctly.*

The regularizer is a penalty added to the loss function that shrinks model parameters towards the zero vector using either the squared euclidean norm L2 or the absolute norm L1 or a combination of both (Elastic Net). If the parameter update crosses the 0.0 value because of the regularizer, the update is truncated to 0.0 to allow for learning sparse models and achieve online feature selection.

This implementation works with data represented as dense numpy arrays of floating point values for the features.

## Try to combine ranked(normal) data with unranked data

### New One-Hot Encoding specification
\begin{align*}solo &= 1000000000 \\
normal-solo &= 1000000000\\
duo &= 0100000000\\
normal-duo &= 0100000000\\
duo-fpp &= 0010000000\\
normal-duo-fpp &= 0010000000\\
squad &= 0001000000\\
normal-squad &= 0001000000\\
solo-fpp &= 0000100000\\
normal-solo-fpp &= 0000100000\\
squad-fpp &= 0000010000\\
normal-squad-fpp &= 0000010000\\
crashfpp &= 0000001000\\
flaretpp &= 0000000100\\
flarefpp &= 0000000010\\
crashtpp &= 0000000001
\end{align*}

In [3]:
def filterPUBG_combine_normal(train_data):
    
    #Delete irrelevant data: Id, GroupId, matchId
    #Delete inconsistent data (for now): rankPoints, killPoints, winPoints
    train_value = train_data.drop(["Id","groupId","matchId","rankPoints","killPoints","winPoints"],axis=1)

    #Delete the data with missing values (-1)
    train_value = train_value[train_value.select_dtypes(include=[np.number]).ge(0).all(1)]
    
    #Extract the target values
    y = train_value["winPlacePerc"]
    
    #Delete winPlacePerc
    filtered = train_value.drop(["winPlacePerc"],axis=1)
    
    #Do One-Hot Encoding for "matchType"
    X = filtered.replace(
        ["solo", "normal-solo","duo","normal-duo",
         "squad", "normal-squad","solo-fpp","normal-solo-fpp",
         "duo-fpp","normal-duo-fpp", "squad-fpp","normal-squad-fpp",
         "crashfpp","flaretpp","flarefpp","crashtpp"],
        ["1000000000", "1000000000", "0100000000", "0100000000", 
         "0001000000", "0001000000", "0000100000", "0000100000", 
         "0010000000", "0010000000", "0000010000", "0000010000",
         "0000001000", "0000000100", "0000000010", "0000000001"]
    )
    
    return X,y,filtered

In [4]:
#Get new coded data
X,y,filtered = filterPUBG_combine_normal(train_data)
X_train,X_test,y_train,y_test = train_test_split(X.to_numpy(),y.to_numpy(),test_size = 0.2,random_state = 200)

In [5]:
#I just assume eta0 = 0.001 from the last cross-validation and use 5000 iteration to save time
alphas = [0.000001,0.00005,0.0001,0.00015,0.0002,0.00025,0.0005,0.001,0.01]
for alpha_ in alphas:
    sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=5000, tol=1e-3,eta0=0.001,alpha=alpha_, early_stopping=True))
    scores = cross_val_score(sgd, X_train, y_train, cv=5,scoring = "neg_mean_absolute_error")
    print("alpha: ", alpha_,"scores: ", -scores," average: ",sum(-scores)/5)

alpha:  1e-06 scores:  [0.09278411 0.09293264 0.09279443 0.09294249 0.09288752]  average:  0.09286823750278232
alpha:  5e-05 scores:  [0.0928379  0.09292628 0.09271595 0.09300407 0.09288814]  average:  0.0928744695648924
alpha:  0.0001 scores:  [0.09279098 0.09296401 0.09272704 0.09291371 0.09280993]  average:  0.09284113494787968
alpha:  0.00015 scores:  [0.09281395 0.09291551 0.09276842 0.09291186 0.09281089]  average:  0.09284412866713913
alpha:  0.0002 scores:  [0.09279739 0.09297195 0.09278352 0.09293373 0.0928449 ]  average:  0.09286629860404563
alpha:  0.00025 scores:  [0.09280957 0.09286886 0.09283336 0.09289333 0.09284841]  average:  0.09285070651771914
alpha:  0.0005 scores:  [0.09277748 0.09294851 0.09279434 0.09290042 0.09287325]  average:  0.09285879964655516
alpha:  0.001 scores:  [0.0928609  0.09284105 0.09279449 0.09306133 0.09288084]  average:  0.09288772473948385
alpha:  0.01 scores:  [0.09325389 0.09337787 0.09326588 0.09338062 0.09331919]  average:  0.09331949012711

In [6]:
#Compare again without early_stopping
alphas = [0.000001,0.00005,0.0001,0.00015,0.0002,0.00025,0.0005,0.001,0.01]
for alpha_ in alphas:
    sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=5000, tol=1e-3,eta0=0.001,alpha=alpha_))
    scores = cross_val_score(sgd, X_train, y_train, cv=5,scoring = "neg_mean_absolute_error")
    print("alpha: ", alpha_,"scores: ", -scores," average: ",sum(-scores)/5)

alpha:  1e-06 scores:  [0.09280205 0.09297476 0.09277983 0.09291429 0.09282922]  average:  0.09286002869701118
alpha:  5e-05 scores:  [0.09281523 0.09297671 0.09279757 0.09291645 0.09289674]  average:  0.09288054213090167
alpha:  0.0001 scores:  [0.09279985 0.09302545 0.09287665 0.09296394 0.09282378]  average:  0.09289793573491542
alpha:  0.00015 scores:  [0.09286775 0.09293493 0.09306497 0.09293265 0.0928715 ]  average:  0.09293436196307604
alpha:  0.0002 scores:  [0.09290062 0.09292094 0.09283045 0.09303394 0.09286163]  average:  0.09290951594198238
alpha:  0.00025 scores:  [0.09276971 0.09297322 0.09284656 0.09296941 0.09289026]  average:  0.09288983312416178
alpha:  0.0005 scores:  [0.0928598  0.09290037 0.09276813 0.09294387 0.09286094]  average:  0.09286662019585014
alpha:  0.001 scores:  [0.09291572 0.09299943 0.09279015 0.09304168 0.09283415]  average:  0.09291622542937887
alpha:  0.01 scores:  [0.09332374 0.09338455 0.09327131 0.09329849 0.09330363]  average:  0.0933163421164

*Ignore the **early_stopping = False** group for now, need to discuss this*

In [13]:
#Original model
sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=10000, tol=1e-3,eta0=0.001))
sgd.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor(eta0=0.001, max_iter=10000))])

In [14]:
preds = sgd.predict(X_test)
MAE = mean_absolute_error(y_test,preds)
print(MAE)

0.0928535741733911


In [11]:
#After cross-validation, the only difference is actually the eta0
sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=10000, tol=1e-3,eta0=0.001,alpha=0.0001,early_stopping = True))
sgd.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor',
                 SGDRegressor(early_stopping=True, eta0=0.001,
                              max_iter=10000))])

In [12]:
#Get the prediction
preds = sgd.predict(X_test)
MAE = mean_absolute_error(y_test,preds)
print(MAE)

0.09273839952914875


### Remove the normal-X

### New One-Hot Encoding specification
\begin{align*}solo &= 1000000000 \\
duo &= 0100000000\\
duo-fpp &= 0010000000\\
squad &= 0001000000\\
solo-fpp &= 0000100000\\
squad-fpp &= 0000010000\\
crashfpp &= 0000001000\\
flaretpp &= 0000000100\\
flarefpp &= 0000000010\\
crashtpp &= 0000000001
\end{align*}

In [17]:
def filterPUBG_remove_normal(train_data):
    
    #Delete irrelevant data: Id, GroupId, matchId
    #Delete inconsistent data (for now): rankPoints, killPoints, winPoints
    train_value = train_data.drop(["Id","groupId","matchId","rankPoints","killPoints","winPoints"],axis=1)

    #Delete the data with missing values (-1)
    train_value = train_value[train_value.select_dtypes(include=[np.number]).ge(0).all(1)]
    
    #Delete normal-X
    train_value = train_value[~train_value.matchType.str.contains("normal")]
    
    #Extract the target values
    y = train_value["winPlacePerc"]
    
    #Delete winPlacePerc
    filtered = train_value.drop(["winPlacePerc"],axis=1)
    
    #Do One-Hot Encoding for "matchType"
    X = filtered.replace(
        ["solo", "duo",
         "squad", "solo-fpp",
         "duo-fpp","squad-fpp",
         "crashfpp","flaretpp",
         "flarefpp","crashtpp"],
        ["1000000000", "0100000000",
         "0001000000", "0000100000",
         "0010000000", "0000010000",
         "0000001000", "0000000100",
         "0000000010", "0000000001"]
    )
    
    return X,y,filtered

In [18]:
X,y,filtered = filterPUBG_remove_normal(train_data)
X_train,X_test,y_train,y_test = train_test_split(X.to_numpy(),y.to_numpy(),test_size = 0.2,random_state = 200)

In [19]:
#4446966 - 4421579 = 25387 rows are deleted
filtered

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,kills,killStreaks,longestKill,...,maxPlace,numGroups,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired
0,0,0,0.00,0,0,0,60,0,0,0.00,...,28,26,0,0.0000,0,0.000,0,0,244.80,1
1,0,0,91.47,0,0,0,57,0,0,0.00,...,26,25,0,0.0045,0,11.040,0,0,1434.00,5
2,1,0,68.00,0,0,0,47,0,0,0.00,...,50,47,0,0.0000,0,0.000,0,0,161.80,2
3,0,0,32.90,0,0,0,75,0,0,0.00,...,31,30,0,0.0000,0,0.000,0,0,202.70,3
4,0,0,100.00,0,0,0,45,1,1,58.53,...,97,95,0,0.0000,0,0.000,0,0,49.75,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446961,0,0,0.00,0,0,0,74,0,0,0.00,...,29,28,0,1292.0000,0,0.000,0,0,1019.00,3
4446962,0,1,44.15,0,0,0,69,0,0,0.00,...,93,93,0,0.0000,0,0.000,0,0,81.70,6
4446963,0,0,59.06,0,0,0,66,0,0,0.00,...,28,28,0,0.0000,0,2.184,0,0,788.70,4
4446964,0,4,180.40,1,1,2,11,2,1,98.50,...,26,25,2,0.0000,0,0.000,0,0,2748.00,8


In [20]:
alphas = [0.000001,0.00005,0.0001,0.00015,0.0002,0.00025,0.0005,0.001,0.01]
for alpha_ in alphas:
    sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=5000, tol=1e-3,eta0=0.001,alpha=alpha_))
    scores = cross_val_score(sgd, X_train, y_train, cv=5,scoring = "neg_mean_absolute_error")
    print("alpha: ", alpha_,"scores: ", -scores," average: ",sum(-scores)/5)

alpha:  1e-06 scores:  [0.09159203 0.09159169 0.09175164 0.091519   0.0917943 ]  average:  0.09164973257139183
alpha:  5e-05 scores:  [0.09162998 0.09155337 0.09182918 0.09145488 0.09186412]  average:  0.0916663063024197
alpha:  0.0001 scores:  [0.09159462 0.0914891  0.09178319 0.09153129 0.09179235]  average:  0.0916381103690549
alpha:  0.00015 scores:  [0.09161135 0.09153686 0.09182444 0.09153107 0.09185975]  average:  0.09167269447550258
alpha:  0.0002 scores:  [0.09165878 0.09157327 0.09175307 0.09152312 0.09183158]  average:  0.09166796122969405
alpha:  0.00025 scores:  [0.09168104 0.09153857 0.09179693 0.0915603  0.09180053]  average:  0.09167547398038622
alpha:  0.0005 scores:  [0.09162798 0.091606   0.0918801  0.09154544 0.09187296]  average:  0.09170649301571955
alpha:  0.001 scores:  [0.09164199 0.09157115 0.09187482 0.09153661 0.09184997]  average:  0.09169490947284416
alpha:  0.01 scores:  [0.09216428 0.09210004 0.09224914 0.092026   0.09234894]  average:  0.092177679329760

In [21]:
#Original model
sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=10000, tol=1e-3))
sgd.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor(max_iter=10000))])

In [22]:
#Get the prediction
preds = sgd.predict(X_test)
MAE = mean_absolute_error(y_test,preds)
print(MAE)

0.0916485812533149


In [23]:
#After Cross-validation, the only difference is actually the eta0
sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=10000, tol=1e-3,eta0=0.001,alpha=0.0001,early_stopping = True))
sgd.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor',
                 SGDRegressor(early_stopping=True, eta0=0.001,
                              max_iter=10000))])

In [24]:
#Get the prediction
preds = sgd.predict(X_test)
MAE = mean_absolute_error(y_test,preds)
print(MAE)

0.0917030455950137
