# How to do?

- killPlace의 경우 data leakage로 판단되기 때문에 column drop
- outlier row drop
- Make new feature : heals_boosts, killsWithoutMoving, totalDistance, killsNorm, damageDealtNorm
    - heals_boosts : df['heals'] + df['boosts']
    - killsWithoutMoving : ((df['killsNorm'] > 0) & (df['totalDistance'] == 0))
    - totalDistance : df['rideDistance'] + df['walkDistance'] + df['swimDistance']
    - killsNorm : df.kills * ((100 - df.playersJoined)/100 + 1)
    - damageDealtNorm : df.damageDealt * ((100 - df.playersJoined)/100 + 1)
- change to categorical feature : vehicleDestroys, weaponsAcquired, walkDistance, binary_rideAndswim
- LightGBM, RandomForest, SVR, H2O의 경우 Modeling 시간이 오래 걸리기 때문에 제외

# 공통처리

## import

In [9]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

## memoryReduce

In [2]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, *, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    For every column backed by a plain NumPy integer or float dtype, the
    column's minimum and maximum are compared against the limits of
    progressively wider dtypes, and the column is cast to the first dtype
    whose range contains both values (limits inclusive).

    * Signed integer columns are tried against int8, int16, int32, int64.
    * Unsigned integer columns are tried against uint8 .. uint64, so they
      stay integral instead of being turned into floats.
    * Float columns are tried against float16 (unless ``use_float16`` is
      False) and float32; when neither range fits (this includes columns
      whose min/max is NaN or infinite) the column is stored as float64.
    * Every other column (object, bool, datetime, categorical, pandas
      extension dtypes) and every empty column is left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: When True (the default) float columns may be stored
            as float16. The range check says nothing about precision:
            float16 keeps only about three significant decimal digits, so
            pass False when that loss matters. With False, a column that
            is already float16 is stored as float32.

    Returns:
        The same ``df`` object, after downcasting.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        # Restrict the work to NumPy integer ('i'/'u') and float ('f')
        # columns; casting bools or datetimes to floats would corrupt them.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue
        # min()/max() of an empty column is NaN, so there is nothing to fit.
        if series.empty:
            continue

        c_min = series.min()
        c_max = series.max()

        if col_type.kind in 'iu':
            # Compare as Python ints so mixed signed/unsigned limits never
            # overflow during the comparison itself.
            lowest, highest = int(c_min), int(c_max)
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= lowest and highest <= limits.max:
                    df[col] = series.astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = series.astype(candidate)
                    break
            else:
                # Out of float32 range, or NaN/inf extremes: keep full width.
                df[col] = series.astype(np.float64)

    return df

## Set dataFrame

In [3]:
# Load the training CSV and downcast its numeric columns in one step.
train = reduce_mem_usage(pd.read_csv('data/featured_train_2.csv'))

In [4]:
# Work on an independent (deep) copy so that `train` itself is never modified.
df = train.copy(deep=True)

## Missing data processing

In [7]:
# Section 1: column dtypes and overall frame information.
print(
    "* 데이터 타입, 데이터프레임 정보 확인",
    "----------------------------------------------------------------------",
    sep="\n",
)
df.info()

# Section 2: a blank separator line, the header, then the per-column null
# counts (left as the final expression so the notebook displays them).
print(
    "",
    "* 데이터 칼럼별 결측값 확인",
    "----------------------------------------------------------------------",
    sep="\n",
)
df.isnull().sum()

* 데이터 타입, 데이터프레임 정보 확인
----------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4446965 entries, 0 to 4446965
Data columns (total 20 columns):
 #   Column           Dtype  
---  ------           -----  
 0   assists          int8   
 1   boosts           int8   
 2   damageDealt      float16
 3   DBNOs            int8   
 4   headshotKills    int8   
 5   heals            int8   
 6   killPlace        int8   
 7   killPoints       int16  
 8   kills            int8   
 9   killStreaks      int8   
 10  longestKill      float16
 11  numGroups        int8   
 12  revives          int8   
 13  rideDistance     float16
 14  swimDistance     float16
 15  vehicleDestroys  int8   
 16  walkDistance     float16
 17  weaponsAcquired  int16  
 18  winPoints        int16  
 19  winPlacePerc     float16
dtypes: float16(6), int16(3), int8(11)
memory usage: 156.9 MB

* 데이터 칼럼별 결측값 확인
-----------------------------------------------------

assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
numGroups          0
revives            0
rideDistance       0
swimDistance       0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64

# Data Split

In [8]:
# The target is the `winPlacePerc` column; every remaining column is a feature.
y = df['winPlacePerc']
X = df.drop(columns=['winPlacePerc'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

# Final expression: show the shapes of the four splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3557572, 19), (889393, 19), (3557572,), (889393,))

# Modeling

## LinearRegression

In [10]:
# LinearRegression: default settings, fitted on the training split and scored
# by mean absolute error on the held-out split.

model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
pred = model_lr.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=pred)
print('LinearRegression MAE : %f' % mae)

LinearRegression MAE : 0.097497


## Lasso

In [11]:
# Lasso: default settings, fitted on the training split and scored by mean
# absolute error on the held-out split.

model_ls = Lasso()
model_ls.fit(X_train, y_train)
pred2 = model_ls.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=pred2)
print('Lasso MAE : %f' % mae)

Lasso MAE : 0.126115


## Ridge

In [12]:
# Ridge: default settings, fitted on the training split and scored by mean
# absolute error on the held-out split.

model_rg = Ridge()
model_rg.fit(X_train, y_train)
pred3 = model_rg.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=pred3)
print('Ridge MAE : %f' % mae)

Ridge MAE : 0.097497


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


## LightGBM

In [13]:
# LGBMRegressor: default settings, fitted on the training split and scored by
# mean absolute error on the held-out split.

model_lgb = LGBMRegressor()
model_lgb.fit(X_train, y_train)
pred4 = model_lgb.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=pred4)
print('LGBMRegressor MAE : %f' % mae)

LGBMRegressor MAE : 0.064037


## XGBoost

In [14]:
# XGBRegressor: default settings, fitted on the training split and scored by
# mean absolute error on the held-out split.

model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)
pred5 = model_xgb.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=pred5)
print('XGBRegressor MAE : %f' % mae)

XGBRegressor MAE : 0.062883
