In [52]:
import pandas as pd 
import numpy as np 
import boto3
from boto3.session import Session 
pd.set_option('display.max_columns', 500)

In [7]:
import configparser
Config = configparser.ConfigParser()
Config.read_file(open('/Users/milesklingenberg/Documents/Personal/AWS/AWS_Keys'))

In [None]:
#I am going to pull in the data from AWS as I am storing it in S3 due to size. 

In [8]:
KEY = Config.get('AWS', 'AWS_ACCESS_KEY_ID')
SECRET = Config.get('AWS', 'Secret')

In [14]:
s3client = boto3.client('s3', 
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET)
file = s3client.get_object(Bucket = 'pubg-competition', Key = 'train_V2.csv')
file_test = s3client.get_object(Bucket = 'pubg-competition', Key = 'test_V2.csv')

In [11]:
pubg_train = pd.read_csv(file['Body'])
pubg_train = pd.DataFrame(pubg_train)

In [15]:
pubg_test = pd.read_csv(file_test['Body'])
pubg_test = pd.DataFrame(pubg_test)

In [40]:
pubg_train['istrain'] = True
pubg_test['istrain'] = False

pubg_train = pubg_train[0:5000]
pubg_test = pubg_test[0:5000]

pubg = pd.concat([pubg_train, pubg_test])

#The data is too big to train locally, so apart from spinning up a cluster, I am just going to cut the data down. 

In [42]:
pubg.head(2)

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,istrain
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0.0,0,0.0,0,0,244.8,1,1466,0.4444,True
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0.0045,0,11.04,0,0,1434.0,5,0,0.64,True


In [39]:
pubg.isna().sum()

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
istrain            0
dtype: int64

In [20]:
len(pubg_test)

1934174

In [None]:
#Those NA are just the length of our testing data which makes sense. 

In [30]:
#In video games, especially PUBG I know that a team will be ranked based of the best performing player. 
#For instance, there are teams of four, or could be potentially be teams of between 1 and 4. You can have group games, 
#or solo games, but regardless we will want to aggregate based on the group ID as opposed to the indivudal player.

pubg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6381140 entries, 0 to 1934173
Data columns (total 30 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Id               object 
 1   groupId          object 
 2   matchId          object 
 3   assists          int64  
 4   boosts           int64  
 5   damageDealt      float64
 6   DBNOs            int64  
 7   headshotKills    int64  
 8   heals            int64  
 9   killPlace        int64  
 10  killPoints       int64  
 11  kills            int64  
 12  killStreaks      int64  
 13  longestKill      float64
 14  matchDuration    int64  
 15  matchType        object 
 16  maxPlace         int64  
 17  numGroups        int64  
 18  rankPoints       int64  
 19  revives          int64  
 20  rideDistance     float64
 21  roadKills        int64  
 22  swimDistance     float64
 23  teamKills        int64  
 24  vehicleDestroys  int64  
 25  walkDistance     float64
 26  weaponsAcquired  int64  
 27  winPoints   

In [44]:
train = pubg[pubg['istrain']==True]

In [45]:
train.head(2)

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,istrain
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0.0,0,0.0,0,0,244.8,1,1466,0.4444,True
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0.0045,0,11.04,0,0,1434.0,5,0,0.64,True


In [94]:
train_agg = train.sort_values(by = 'winPlacePerc', ascending = True)
train_agg = train.groupby('groupId').agg({'assists':['sum'], 'boosts':['sum'], 'damageDealt':['sum'],
                              'DBNOs':['sum'], 'headshotKills':['sum'], 'heals':['sum'], 
                              'kills':['sum'], 'killStreaks':['sum'], 'longestKill':['sum'], 
                             'matchDuration':['mean'], 'maxPlace':['mean'], 'numGroups':['mean'], 
                             'rankPoints':['mean'], 'revives':['sum'], 'rideDistance':['sum'], 'roadKills':['sum'],
                             'swimDistance':['sum'], 'teamKills':['mean'], 'vehicleDestroys':['sum'], 
                             'walkDistance':['sum'], 'weaponsAcquired':['sum'], 'winPoints':['sum'],'winPlacePerc':['mean']}).reset_index()

In [101]:
train_agg.columns = ['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals', 'kills', 'killStreaks', 'longestKill', 'matchDuration', 'maxPlace', 'numGroups', 'rankPoints', 'rankPoints', 'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints', 'winPlacePerc']

In [103]:
train_agg.sort_values(by='winPlacePerc', ascending=True)

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,rankPoints.1,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
3205,a4e8ebd0ed06f6,0,0,0.00,0,0,0,0,0,0.00,1286,28,27,1373.0,0,0.0,0,0.0,0,0,86.17,2,0,0.0
3948,cb4442f14218f3,0,0,0.00,0,0,0,0,0,0.00,1423,47,46,1486.0,0,0.0,0,0.0,0,0,0.00,0,0,0.0
1279,3f4a7b1c246392,0,0,44.05,0,0,0,0,0,0.00,1467,48,44,1252.0,0,0.0,0,0.0,0,0,73.07,2,0,0.0
3970,cc264762b1879e,0,0,0.00,0,0,0,0,0,0.00,1955,28,28,1481.0,0,0.0,0,0.0,0,0,0.00,0,0,0.0
1285,3fa2950e698c4b,0,0,0.00,0,0,0,0,0,0.00,2001,27,26,1500.0,0,0.0,0,0.0,0,0,763.80,2,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4270,dbeb5d3b28c1d1,1,6,391.50,3,0,2,4,2,92.76,1848,42,42,1490.0,0,2062.0,0,0.0,0,0,3850.00,9,0,1.0
4267,dbcbe9223c0f90,1,1,57.98,0,0,1,1,1,27.14,1782,30,28,1538.0,0,1702.0,0,0.0,0,0,1054.00,6,0,1.0
4257,db0c253f8a8d1e,2,8,757.10,4,2,2,9,2,190.80,1919,28,26,1456.0,0,5733.0,0,0.0,0,1,2654.00,4,0,1.0
3891,c81ec9a336aaaf,0,7,217.50,1,0,4,0,0,0.00,1434,28,28,1375.0,2,0.0,0,0.0,0,0,2143.00,4,0,1.0
