# How to do it

- matchID, groupID 별로 묶음
- 묶은 값들의 med값을 찾기 위해 새로운 칼럼 생성

# 공통처리

## import

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

## memoryReduce

In [4]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, use_float16=True, verbose=False):
    """Downcast numeric columns of ``df`` in place to the smallest dtype that fits.

    For every column backed by a plain NumPy signed-integer, unsigned-integer
    or float dtype, the column's min and max are compared (inclusively)
    against the representable range of progressively wider dtypes of the same
    kind; the first one that fits is used. A column is only ever converted to
    a *narrower* dtype, never to a wider one.

    Columns of any other dtype (object, bool, datetime, timedelta, complex,
    pandas extension dtypes such as category or nullable integers) are left
    untouched. Columns whose min/max are NaN (empty or all-NaN) or infinite
    are also left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        When True, float columns may be stored as float16 if their range fits.
        Only the range is checked, not the precision: float16 keeps roughly
        three significant decimal digits, so values are rounded. Pass False
        to stop at float32.
    verbose : bool, default False
        When True, print the memory footprint before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    float_types = (np.float16, np.float32, np.float64)
    if not use_float16:
        float_types = float_types[1:]

    # dtype.kind -> (function returning the dtype's limits, candidates from narrow to wide)
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32, np.int64)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32, np.uint64)),
        'f': (np.finfo, float_types),
    }

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are eligible; everything
        # else (bool, object, datetime, category, ...) is skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        limits_of, candidates = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's limit still fits.
            # NaN min/max make both comparisons False, so such columns never match.
            if limits.min <= c_min and c_max <= limits.max:
                # Guard against widening (e.g. float16 data with use_float16=False).
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## Set dataFrame

In [6]:
# Read the engineered training features and downcast their dtypes in one step.
train = reduce_mem_usage(pd.read_csv('data/featured_train_4.csv'))

In [7]:
# Work on a deep copy so later edits to `df` do not touch `train`.
df = train.copy(deep=True)

## Missing data check

In [8]:
# Section 1: column dtypes and general frame information.
print("* 데이터 타입, 데이터프레임 정보 확인",
      "----------------------------------------------------------------------",
      sep="\n")
df.info()
print()

# Section 2: number of missing values per column (displayed as the cell result).
print("* 데이터 칼럼별 결측값 확인",
      "----------------------------------------------------------------------",
      sep="\n")
df.isna().sum()

* 데이터 타입, 데이터프레임 정보 확인
----------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026744 entries, 0 to 2026743
Data columns (total 47 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   assists_mean               float16
 1   boosts_mean                float16
 2   damageDealt_mean           float16
 3   DBNOs_mean                 float16
 4   headshotKills_mean         float16
 5   heals_mean                 float16
 6   killPlace_mean             float16
 7   killPoints_mean            float16
 8   kills_mean                 float16
 9   killStreaks_mean           float16
 10  longestKill_mean           float16
 11  maxPlace_mean              float16
 12  numGroups_mean             float16
 13  rankPoints_mean            float16
 14  revives_mean               float16
 15  rideDistance_mean          float16
 16  roadKills_mean             float16
 17  swimDistance_mean          f

assists_mean                 0
boosts_mean                  0
damageDealt_mean             0
DBNOs_mean                   0
headshotKills_mean           0
heals_mean                   0
killPlace_mean               0
killPoints_mean              0
kills_mean                   0
killStreaks_mean             0
longestKill_mean             0
maxPlace_mean                0
numGroups_mean               0
rankPoints_mean              0
revives_mean                 0
rideDistance_mean            0
roadKills_mean               0
swimDistance_mean            0
teamKills_mean               0
vehicleDestroys_mean         0
walkDistance_mean            0
weaponsAcquired_mean         0
winPoints_mean               0
assists_mean_rank            0
boosts_mean_rank             0
damageDealt_mean_rank        0
DBNOs_mean_rank              0
headshotKills_mean_rank      0
heals_mean_rank              0
killPlace_mean_rank          0
killPoints_mean_rank         0
kills_mean_rank              0
killStre

# Data Split

In [9]:
# Separate the regression target from the feature columns.
y = df['winPlacePerc']
X = df.drop(columns=['winPlacePerc'])

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)

# Display the shapes of the four pieces as the cell result.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1621395, 46), (405349, 46), (1621395,), (405349,))

# Modeling

## LinearRegression

In [10]:
# LinearRegression: fit on the training split, report MAE on the hold-out split.

model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
pred = model_lr.predict(X_test)
# scikit-learn metrics are documented as (y_true, y_pred); MAE is symmetric,
# so the value is unchanged, but the conventional order avoids confusion.
mae = mean_absolute_error(y_test, pred)
print('LinearRegression MAE : %f' % mae)

LinearRegression MAE : 0.058066


## Lasso

In [11]:
# Lasso: fit on the training split, report MAE on the hold-out split.
# NOTE(review): Lasso() runs with its default regularization strength; the MAE
# is markedly worse than the other linear models here, so the default alpha
# may be too strong for this target — consider tuning it.

model_ls = Lasso()
model_ls.fit(X_train, y_train)
pred2 = model_ls.predict(X_test)
# scikit-learn metrics are documented as (y_true, y_pred); MAE is symmetric,
# so the value is unchanged, but the conventional order avoids confusion.
mae = mean_absolute_error(y_test, pred2)
print('Lasso MAE : %f' % mae)

Lasso MAE : 0.114787


## Ridge

In [12]:
# Ridge: fit on the training split, report MAE on the hold-out split.

model_rg = Ridge()
model_rg.fit(X_train, y_train)
pred3 = model_rg.predict(X_test)
# scikit-learn metrics are documented as (y_true, y_pred); MAE is symmetric,
# so the value is unchanged, but the conventional order avoids confusion.
mae = mean_absolute_error(y_test, pred3)
print('Ridge MAE : %f' % mae)

Ridge MAE : 0.058067


## LightGBM

In [13]:
# LGBMRegressor: fit on the training split, report MAE on the hold-out split.

model_lgb = LGBMRegressor()
model_lgb.fit(X_train, y_train)
pred4 = model_lgb.predict(X_test)
# scikit-learn metrics are documented as (y_true, y_pred); MAE is symmetric,
# so the value is unchanged, but the conventional order avoids confusion.
mae = mean_absolute_error(y_test, pred4)
print('LGBMRegressor MAE : %f' % mae)

LGBMRegressor MAE : 0.046707


## XGBoost

In [14]:
# XGBRegressor: fit on the training split, report MAE on the hold-out split.

model_xgb = XGBRegressor()
model_xgb.fit(X_train, y_train)
pred5 = model_xgb.predict(X_test)
# scikit-learn metrics are documented as (y_true, y_pred); MAE is symmetric,
# so the value is unchanged, but the conventional order avoids confusion.
mae = mean_absolute_error(y_test, pred5)
print('XGBRegressor MAE : %f' % mae)

XGBRegressor MAE : 0.044343


## RandomForest

In [15]:
# RandomForestRegressor: fit on the training split, report MAE on the hold-out split.

# A random forest is stochastic (bootstrap samples, feature sub-sampling), so
# without a seed the reported MAE changes on every run. Reuse the seed used
# for the train/test split to make the score reproducible.
model_rfr = RandomForestRegressor(random_state=100)
model_rfr.fit(X_train, y_train)
pred6 = model_rfr.predict(X_test)
# scikit-learn metrics are documented as (y_true, y_pred); MAE is symmetric,
# so the value is unchanged, but the conventional order avoids confusion.
mae = mean_absolute_error(y_test, pred6)
print('RandomForestRegressor MAE : %f' % mae)

RandomForestRegressor MAE : 0.044171
