# Import

In [46]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load data

In [9]:
train_data = pd.read_csv('../Datasets/train_V2.csv')

# Data Fields
DBNOs - Number of enemy players knocked.

assists - Number of enemy players this player damaged that were killed by teammates.

boosts - Number of boost items used.

damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.

headshotKills - Number of enemy players killed with headshots.

heals - Number of healing items used.

Id - Player’s Id

killPlace - Ranking in match of number of enemy players killed.

killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.

killStreaks - Max number of enemy players killed in a short amount of time.

kills - Number of enemy players killed.

longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.

matchDuration - Duration of match in seconds.

matchId - ID to identify match. There are no matches that are in both the training and testing set.

matchType - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.

rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.

revives - Number of times this player revived teammates.

rideDistance - Total distance traveled in vehicles measured in meters.

roadKills - Number of kills while in a vehicle.

swimDistance - Total distance traveled by swimming measured in meters.

teamKills - Number of times this player killed a teammate.

vehicleDestroys - Number of vehicles destroyed.

walkDistance - Total distance traveled on foot measured in meters.

weaponsAcquired - Number of weapons picked up.

winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.

groupId - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.

numGroups - Number of groups we have data for in the match.

maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.

winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

# Pre-processing data

**1.Delete sample with missing data(-1)**

**2.Delete irrelevant data such as ID**

**3.Do One-Hot Encoding for "matchType"**

**4.Delete inconsistent values such as rankPoints**

In [10]:
# View the data
train_data

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.00,0,0,0,60,...,0,0.0000,0,0.000,0,0,244.80,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.040,0,0,1434.00,5,0,0.6400
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.00,0,0,0,47,...,0,0.0000,0,0.000,0,0,161.80,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.90,0,0,0,75,...,0,0.0000,0,0.000,0,0,202.70,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.00,0,0,0,45,...,0,0.0000,0,0.000,0,0,49.75,2,0,0.1875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446961,afff7f652dbc10,d238e426f50de7,18492834ce5635,0,0,0.00,0,0,0,74,...,0,1292.0000,0,0.000,0,0,1019.00,3,1507,0.1786
4446962,f4197cf374e6c0,408cdb5c46b2ac,ee854b837376d9,0,1,44.15,0,0,0,69,...,0,0.0000,0,0.000,0,0,81.70,6,0,0.2935
4446963,e1948b1295c88a,e26ac84bdf7cef,6d0cd12784f1ab,0,0,59.06,0,0,0,66,...,0,0.0000,0,2.184,0,0,788.70,4,0,0.4815
4446964,cc032cdd73b7ac,c2223f35411394,c9c701d0ad758a,0,4,180.40,1,1,2,11,...,2,0.0000,0,0.000,0,0,2748.00,8,0,0.8000


## One-Hot Encoding specification
\begin{align*}solo &= 1000000000000000 \\
duo &= 0100000000000000\\
squad &= 0010000000000000\\
solo-fpp &= 0001000000000000\\
duo-fpp &= 0000100000000000\\
squad-fpp &= 0000010000000000\\
normal-solo-fpp &= 0000001000000000\\
crashfpp &= 0000000100000000\\
flaretpp &= 0000000010000000\\
normal-squad-fpp &= 0000000001000000\\
normal-duo-fpp &= 0000000000100000\\
normal-squad &= 0000000000010000\\
flarefpp &= 0000000000001000\\
crashtpp &= 0000000000000100\\
normal-duo &= 0000000000000010\\
normal-solo &= 0000000000000001
\end{align*}









In [32]:
#Delete the data with missing values (-1)
train_value = train_data[train_data.select_dtypes(include=[np.number]).ge(0).all(1)]

#Extract target value, Use 2000000 samples (73%) as training set, 745155 samples (27%) as validation set
val_target_data = train_value["winPlacePerc"].tail(745155)
train_target_data = train_value["winPlacePerc"].head(2000000)

#Delete irrelevant data: Id, GroupId, matchId
#Delete inconsistent data (for now): rankPoints, killPoints, winPoints
train_value = train_value.drop(["Id","groupId","matchId","rankPoints","killPoints","winPoints","winPlacePerc"],axis=1)

#Do One-Hot Encoding for "matchType"
train_value = train_value.replace(
    ["solo", "duo", "squad", 
     "solo-fpp", "duo-fpp", "squad-fpp",
     "normal-solo-fpp","crashfpp","flaretpp",
     "normal-squad-fpp","normal-duo-fpp","normal-squad",
     "flarefpp","crashtpp","normal-duo",
     "normal-solo"],
    ["1000000000000000", "0100000000000000", "0010000000000000", 
     "0001000000000000", "0000100000000000", "0000010000000000",
     "0000001000000000", "0000000100000000", "0000000010000000",
     "0000000001000000", "0000000000100000", "0000000000010000",
     "0000000000001000", "0000000000000100", "0000000000000010",
     "0000000000000001"]
)

#Use 2000000 samples (73%) as training set, 745155 samples (27%) as validation set
val_data = train_value.tail(745155)
train_value = train_value.head(2000000)

In [33]:
#Convert pandas DataFrame to numpy array
train_value = train_value.to_numpy()
train_target_value = train_target_data.to_numpy()

val_value = val_data.to_numpy()
val_target_value = val_target_data.to_numpy()

In [47]:
#From scikit-learn algorithm cheat-sheet
#>50 samples --> predicating a quantity --> larger than 100k samples
#So use SGD Regressor
sgd = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3))
sgd.fit(train_value, train_target_value)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor())])

In [51]:
#Test with validation set
preds = sgd.predict(val_value)
MSE = mean_squared_error(val_target_data,preds)
print(MSE)

0.01615470703645259
