In [10]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [11]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    For every column whose dtype is a plain NumPy signed-integer, unsigned-integer
    or floating dtype, the column's min and max are compared (inclusively) with
    the representable range of progressively wider candidate dtypes, and the
    column is cast to the first candidate that can hold both.

    Columns of any other dtype (object/string, bool, category, datetime, pandas
    extension dtypes, ...) are left untouched.  Columns for which no candidate
    fits (e.g. empty or all-NaN columns, or float columns containing +/-inf)
    are also left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining.

    Notes
    -----
    The float16 downcast is range-based only: values that fit the float16
    range are still rounded to float16 precision, so the conversion is lossy
    for most non-trivial float data.
    """
    # dtype.kind -> (range-info function, candidates ordered narrowest first)
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32, np.int64)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32, np.uint64)),
        'f': (np.finfo, (np.float16, np.float32, np.float64)),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are eligible; this keeps bool,
        # unsigned and non-numeric columns from being pushed through a float cast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        limits_of, candidates = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            # NaN min/max (empty or all-NaN column) fails every comparison, so
            # such a column falls through the loop unchanged.
            if c_min >= limits.min and c_max <= limits.max:
                if col_type != candidate:
                    df[col] = df[col].astype(candidate)
                break

    return df

## 1차 전처리

In [12]:
# Read featured_train_1.csv and downcast its numeric columns right away.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
train_csv_path = '/Users/sanghyun/Desktop/Coding/PUBG_ML_Project/data/featured_train_1.csv'
train = reduce_mem_usage(pd.read_csv(train_csv_path))

In [13]:
# Independent (deep) copy of the loaded frame; the cells below derive their
# feature-set variants from `check` rather than from `train`.
check = train.copy(deep=True)

In [14]:
# Names for the four feature-set variants that are compared below.
# No copy is taken here: the next cell rebinds each name to the new frame
# returned by DataFrame.drop, so deep-copying the full frame four times only
# costs memory.  Do not mutate these in place before that drop happens.
includeAll = check    # keeps maxPlace and numGroups

dropMaxplace = check  # maxPlace removed, numGroups kept
dropNum = check       # maxPlace kept, numGroups removed

dropAll = check       # both maxPlace and numGroups removed

In [15]:
# Columns removed from every variant.
# NOTE(review): the original note on this cell said that matchType is dropped
# in all variants because it still needs feature engineering, but the code
# drops killPoints and winPoints - confirm which one is intended.
common_drop = ['killPoints', 'winPoints']

# Each variant additionally loses the column(s) its name refers to.
includeAll = includeAll.drop(columns=common_drop)
dropMaxplace = dropMaxplace.drop(columns=common_drop + ['maxPlace'])
dropNum = dropNum.drop(columns=common_drop + ['numGroups'])
dropAll = dropAll.drop(columns=common_drop + ['maxPlace', 'numGroups'])

In [16]:
# Variant "includeAll": separate the winPlacePerc target from the features and
# hold out 20% of the rows (fixed seed, so the split is repeatable).
y = includeAll['winPlacePerc']
X = includeAll.drop(columns=['winPlacePerc'])

(X_Alltrain, X_Alltest,
 y_Alltrain, y_Alltest) = train_test_split(X, y, random_state=100, test_size=0.2)

# Display the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (X_Alltrain, X_Alltest, y_Alltrain, y_Alltest))

((3557572, 18), (889393, 18), (3557572,), (889393,))

In [17]:
# Default-parameter LightGBM regressor on the "includeAll" split; the cell's
# output is the mean absolute error on the held-out rows.
model = LGBMRegressor().fit(X_Alltrain, y_Alltrain)
pred = model.predict(X_Alltest)
mean_absolute_error(y_true=y_Alltest, y_pred=pred)

0.0638663432806029

In [18]:
# Variant "dropMaxplace": separate the winPlacePerc target from the features
# and hold out 20% of the rows (same seed as the other variants).
y = dropMaxplace['winPlacePerc']
X = dropMaxplace.drop(columns=['winPlacePerc'])

(X_dropMtrain, X_dropMtest,
 y_dropMtrain, y_dropMtest) = train_test_split(X, y, random_state=100, test_size=0.2)

# Display the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (X_dropMtrain, X_dropMtest, y_dropMtrain, y_dropMtest))

((3557572, 17), (889393, 17), (3557572,), (889393,))

In [19]:
# Default-parameter LightGBM regressor on the "dropMaxplace" split; the cell's
# output is the mean absolute error on the held-out rows.
model = LGBMRegressor().fit(X_dropMtrain, y_dropMtrain)
pred = model.predict(X_dropMtest)
mean_absolute_error(y_true=y_dropMtest, y_pred=pred)

0.06404962181939954

In [20]:
# Variant "dropNum": separate the winPlacePerc target from the features and
# hold out 20% of the rows (same seed as the other variants).
y = dropNum['winPlacePerc']
X = dropNum.drop(columns=['winPlacePerc'])

(X_dropNumtrain, X_dropNumtest,
 y_dropNumtrain, y_dropNumtest) = train_test_split(X, y, random_state=100, test_size=0.2)

# Display the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (X_dropNumtrain, X_dropNumtest, y_dropNumtrain, y_dropNumtest))

((3557572, 17), (889393, 17), (3557572,), (889393,))

In [21]:
# Default-parameter LightGBM regressor on the "dropNum" split; the cell's
# output is the mean absolute error on the held-out rows.
model = LGBMRegressor().fit(X_dropNumtrain, y_dropNumtrain)
pred = model.predict(X_dropNumtest)
mean_absolute_error(y_true=y_dropNumtest, y_pred=pred)

0.06510741118619552

In [22]:
# Variant "dropAll": separate the winPlacePerc target from the features and
# hold out 20% of the rows (same seed as the other variants).
y = dropAll['winPlacePerc']
X = dropAll.drop(columns=['winPlacePerc'])

(X_dropAlltrain, X_dropAlltest,
 y_dropAlltrain, y_dropAlltest) = train_test_split(X, y, random_state=100, test_size=0.2)

# Display the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (X_dropAlltrain, X_dropAlltest, y_dropAlltrain, y_dropAlltest))

((3557572, 16), (889393, 16), (3557572,), (889393,))

In [23]:
# Default-parameter LightGBM regressor on the "dropAll" split; the cell's
# output is the mean absolute error on the held-out rows.
model = LGBMRegressor().fit(X_dropAlltrain, y_dropAlltrain)
pred = model.predict(X_dropAlltest)
mean_absolute_error(y_true=y_dropAlltest, y_pred=pred)

0.07113038445049037

## 2차 전처리

In [24]:
# Read featured_train_2.csv, then downcast its numeric columns.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
fdata = pd.read_csv('/Users/sanghyun/Desktop/Coding/PUBG_ML_Project/data/featured_train_2.csv')
fdata = reduce_mem_usage(fdata)

In [25]:
# Separate the winPlacePerc target from the features of `fdata` and hold out
# 20% of the rows (fixed seed, so the split is repeatable).
y = fdata['winPlacePerc']
X = fdata.drop(columns=['winPlacePerc'])

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=100, test_size=0.2)

In [26]:
# Default-parameter LightGBM regressor on the current X_train/y_train split;
# the cell's output is the mean absolute error on X_test/y_test.
model = LGBMRegressor().fit(X_train, y_train)
pred = model.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=pred)

0.06808530824631952

## 개인 테스트(cat_numGroups)

In [27]:
# `fdata` without the cat_numGroups column.  DataFrame.drop already returns a
# new frame and leaves `fdata` untouched, so no explicit copy is taken first.
# NOTE(review): this rebinds `dropNum`, which earlier in the notebook held a
# different frame (the featured_train_1 data without numGroups).
dropNum = fdata.drop(columns=['cat_numGroups'])

In [28]:
# Separate the winPlacePerc target from the features of the current `dropNum`
# frame and hold out 20% of the rows (fixed seed, so the split is repeatable).
y = dropNum['winPlacePerc']
X = dropNum.drop(columns=['winPlacePerc'])

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=100, test_size=0.2)

In [29]:
# Default-parameter LightGBM regressor on the current X_train/y_train split;
# the cell's output is the mean absolute error on X_test/y_test.
model = LGBMRegressor().fit(X_train, y_train)
pred = model.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=pred)

0.06904852944390456