In [52]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [53]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    #start_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    #end_mem = df.memory_usage().sum() / 1024**2
    #print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    #print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [54]:
# raw trian으로 통일

In [55]:
train = pd.read_csv('data/train_V2.csv')
train = reduce_mem_usage(train)

In [56]:
# killPlace Drop

train.drop(['killPlace'], axis=1, inplace=True)

## 담당 칼럼 전처리

### 석민

### 윤아

In [57]:
# headshotKills
train = train.drop(train[train.headshotKills > 20].index)

# longestKills
train = train.drop(train[(train.longestKill == 0) & (train.kills != 0)].index)

# totalDistance
train['totalDistance'] = train['rideDistance'] + train['walkDistance'] + train['swimDistance']

### 세연

### 상현

In [58]:
## matchType Feature_engineering

train['matchType2'] = pd.NaT
maplist = {
    'squad' : 'squad',
    'duo' : 'duo',
    'solo' : 'solo',
    'crash' : 'event',
    'flare' : 'event'
}

for i in maplist:
  train.matchType2[(train['matchType'].str.contains(i)==True)&(train['matchType2'].isnull())]=maplist.get(i)


# solo ⮕ 1, duo ⮕ 2, squad ⮕ 4, falre & crash ⮕ 0 으로 전환
def convert(x):
    if x == "squad":
        return 4
    elif x == "duo":
        return 2
    elif x == "solo":
        return 1
    else : 
        return 0
    
train['matchType2'] = train['matchType2'].map(convert)


# 기존의 matchType drop
train.drop(['matchType'], axis=1, inplace=True)

# matchType2 rename, reindex
train.rename(columns = {'matchType2':'matchType'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.matchType2[(train['matchType'].str.contains(i)==True)&(train['matchType2'].isnull())]=maplist.get(i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.matchType2[(train['matchType'].str.contains(i)==True)&(train['matchType2'].isnull())]=maplist.get(i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.matchType2[(train['matchType'].str.contains(i)==True)&(train['matchType2'].isnull())]=maplist.get(i)
A value is trying to be set on a copy of a slice from a

In [59]:
## vehicleDestroys Feature_engineering

# cat_vehicleDestroys 칼럼을 생성
train['cat_vehicleDestroys'] = pd.NaT

train.loc[train['vehicleDestroys'] == 0, 'cat_vehicleDestroys'] = 0
train.loc[(train['vehicleDestroys'] >= 1) & (train['vehicleDestroys'] <= 2), 'cat_vehicleDestroys'] = 1
train.loc[train['vehicleDestroys'] >= 3, 'cat_vehicleDestroys'] = 3

# destroy.drop(columns=['vehicleDestroys'])
train['cat_vehicleDestroys'] = train['cat_vehicleDestroys'].astype(int)


In [60]:
# weaponsAcquired

# cat_weaponsAcquired 칼럼을 생성
train['cat_weaponsAcquired'] = pd.NaT

train.loc[train['weaponsAcquired'] == 0, 'cat_weaponsAcquired'] = 0
train.loc[train['weaponsAcquired'] == 1, 'cat_weaponsAcquired'] = 1
train.loc[(train['weaponsAcquired'] >= 2) & (train['weaponsAcquired'] <= 3), 'cat_weaponsAcquired'] = 2
train.loc[(train['weaponsAcquired'] >= 4) & (train['weaponsAcquired'] <= 5), 'cat_weaponsAcquired'] = 4
train.loc[(train['weaponsAcquired'] >= 6) & (train['weaponsAcquired'] <= 7), 'cat_weaponsAcquired'] = 6
train.loc[(train['weaponsAcquired'] >= 8) & (train['weaponsAcquired'] <= 9), 'cat_weaponsAcquired'] = 8
train.loc[train['weaponsAcquired'] >= 10, 'cat_weaponsAcquired'] = 10

# destroy.drop(columns=['weaponsAcquired'])

train['cat_weaponsAcquired'] = train['cat_weaponsAcquired'].astype(int)


### 승범

In [61]:
# train.drop(['killStreaks'], axis=1, inplace=True)

## 겹치는 칼럼 공통 처리

In [62]:
# kills _ 윤아님 처리 이후 승범이 처리

train['killsWithoutMoving'] = ((train['kills'] > 0) & (train['totalDistance'] == 0))
train = train.drop(train[train.killsWithoutMoving == True].index)

train.loc[train.kills >= 8,'kills'] = 8

In [63]:
# Distance _ 윤아님 처리 이후 상현 처리

## 윤아님 처리 코드
# walkDistance
trian = train.drop(train[train.walkDistance > 12000].index)

# swimDistance
trian = trian.drop(trian[trian.swimDistance > 2000].index)

# rideDistance
trian = trian.drop(trian[trian.rideDistance > 20000].index)


## 상현 처리 코드
# walkDistance 피쳐
trian['cat_walkDistance'] = pd.NaT

trian.loc[trian['walkDistance'] < 500, 'cat_walkDistance'] = 1
trian.loc[(trian['walkDistance'] >= 500) & (trian['walkDistance'] < 1000), 'cat_walkDistance'] = 2
trian.loc[(trian['walkDistance'] >= 1000) & (trian['walkDistance'] < 2000), 'cat_walkDistance'] = 3
trian.loc[(trian['walkDistance'] >= 2000) & (trian['walkDistance'] < 5000), 'cat_walkDistance'] = 4
trian.loc[trian['walkDistance'] >= 5000, 'cat_walkDistance'] = 5

trian['cat_walkDistance'] = trian['cat_walkDistance'].astype(int)

## rideAndswim Feature_engineering

# sum_rideAndswim 칼럼을 생성
train['sum_rideAndswim'] = pd.NaT
train['sum_rideAndswim'] = train['rideDistance'] + train['swimDistance']

# binary_rideAndswim 칼럼을 생성
train['binary_rideAndswim'] = pd.NaT

train.loc[train['sum_rideAndswim'] == 0, 'binary_rideAndswim'] = 0
train.loc[train['sum_rideAndswim'] > 0, 'binary_rideAndswim'] = 1

train['binary_rideAndswim'] = train['binary_rideAndswim'].astype(int)

### 새로운 픽쳐를 만드는데 사용한 칼럼들 Drop

In [64]:
pd.set_option('display.max_columns', 40)

train.head(5)

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance,matchType,cat_vehicleDestroys,cat_weaponsAcquired,killsWithoutMoving,sum_rideAndswim,binary_rideAndswim
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,1241,0,0,0.0,1306,28,26,-1,0,0.0,0,0.0,0,0,244.75,1,1466,0.444336,244.75,4,0,1,False,0.0,0
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.5,0,0,0,0,0,0,0.0,1777,26,25,1484,0,0.004501,0,11.039062,0,0,1434.0,5,0,0.640137,1445.0,4,0,4,False,11.046875,1
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,0,0,0,0.0,1318,50,47,1491,0,0.0,0,0.0,0,0,161.75,2,0,0.775391,161.75,2,0,2,False,0.0,0
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.90625,0,0,0,0,0,0,0.0,1436,31,30,1408,0,0.0,0,0.0,0,0,202.75,3,0,0.166748,202.75,4,0,2,False,0.0,0
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,0,1,1,58.53125,1424,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875,49.75,1,0,2,False,0.0,0


In [66]:
train = train.drop(columns=['rankPoints', 'roadKills', 'matchDuration', 'teamKills', 'killPoints', 'winPoints', 'maxPlace','sum_rideAndswim','weaponsAcquired','vehicleDestroys','walkDistance', 'swimDistance', 'rideDistance','killsWithoutMoving'])

In [67]:
pd.set_option('display.max_columns', 40)

train.head(5)

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,killStreaks,longestKill,numGroups,revives,winPlacePerc,totalDistance,matchType,cat_vehicleDestroys,cat_weaponsAcquired,binary_rideAndswim
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,0,0,0.0,26,0,0.444336,244.75,4,0,1,0
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.5,0,0,0,0,0,0.0,25,0,0.640137,1445.0,4,0,4,1
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,0,0,0.0,47,0,0.775391,161.75,2,0,2,0
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.90625,0,0,0,0,0,0.0,30,0,0.166748,202.75,4,0,2,0
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,1,1,58.53125,95,0,0.1875,49.75,1,0,2,0
