In [2]:
# Pin the release that the recorded output shows being installed, so a fresh
# kernel reproduces this run; %pip installs into the running kernel's environment.
%pip install lightgbm==3.3.2

Collecting lightgbm
  Using cached lightgbm-3.3.2.tar.gz (1.5 MB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: lightgbm
  Building wheel for lightgbm (setup.py) ... [?25ldone
[?25h  Created wheel for lightgbm: filename=lightgbm-3.3.2-py3-none-any.whl size=1076856 sha256=c03876fb57c89fc8b51c25e4394f714bfaaba6c3e6c3b22bac6bd903ce6c4375
  Stored in directory: /Users/krc/Library/Caches/pip/wheels/ff/26/5f/2a30250ade19f331dfb9c629cc7b7325665878821437e4275c
Successfully built lightgbm
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.2


In [3]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.feature_selection import r_regression
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import mean_absolute_error

In [4]:
base_path = 'data/'

def load_dataset(csv_name):
    """Read ``base_path + csv_name`` with pandas and return the DataFrame.

    Args:
        csv_name: File name of the CSV, appended to ``base_path``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        Exception: If the path cannot be built or the file cannot be read.
            The underlying error is attached as ``__cause__``.
    """
    try:
        return pd.read_csv(base_path + csv_name)
    except Exception as err:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not converted into this message,
        # and chain the original error so the real cause stays visible.
        raise Exception("csv파일명을 입력하세요!") from err

In [12]:
def preprocess(df):
    """Run the active cleaning steps on a raw frame and return the result.

    Steps, in order: ``__delete_nan_data``, ``__remove_outlier`` and
    ``__select_features``.
    """
    # The match-type steps (__convert_match_type_column, __change_nan_points,
    # __one_hot_encode_data_frame) are currently not part of the pipeline.
    without_nan = __delete_nan_data(df)
    without_outliers = __remove_outlier(without_nan)
    return __select_features(without_outliers)

  
def __delete_nan_data(df):
    """Return a new frame without the rows that contain any missing value."""
    rows_without_missing = df.dropna(axis=0, how="any")
    return rows_without_missing

  
def __convert_match_type_column(prepro_df,encoding_feature):
    """Map every value of one column through ``preprocessing_match_type``.

    Args:
        prepro_df: DataFrame holding the column to convert.
        encoding_feature: Name of that column.

    Returns:
        A Series indexed like the column, holding the mapped values.
    """
    # ``Series.agg`` is intended for reductions; using it for element-wise work
    # is deprecated in pandas (it emits a FutureWarning on recent releases).
    # ``Series.map`` applies the function to each element explicitly.
    return prepro_df[encoding_feature].map(preprocessing_match_type)

  
def preprocessing_match_type(match_type):
    """Return ``match_type`` if it is one of the six standard modes, else "others"."""
    standard_modes = ("solo", "duo", "squad", "solo-fpp", "duo-fpp", "squad-fpp")
    return match_type if match_type in standard_modes else "others"

      
def __change_nan_points(df):
    """Overwrite placeholder values in the point columns, per match type.

    A value of 0 in ``killPoints``/``winPoints`` and -1 in ``rankPoints`` is
    treated as a placeholder. For every column and every value of
    ``df.match_types`` the placeholders are replaced with either

    * integers drawn by ``np.random.randint(mean - std, mean + std)``, where
      mean/std come from the non-placeholder rows of that match type, or
    * the plain mean, for ``killPoints``/``winPoints`` of the "others" type.

    The frame is modified in place and also returned. The drawn values depend
    on the global NumPy random state.

    Args:
        df: DataFrame with a ``match_types`` column and the three point columns.

    Returns:
        The same DataFrame object, with placeholders replaced.
    """
    kill_rank_win_points = ["killPoints", "rankPoints", "winPoints"]
    match_types_list = list(df.match_types.unique())
    for col in kill_rank_win_points:
        placeholder = -1 if col == "rankPoints" else 0
        is_placeholder = df[col] == placeholder
        for m_type in match_types_list:
            in_type = df.match_types == m_type
            to_fill = is_placeholder & in_type
            size = int(to_fill.sum())
            if size == 0:
                # Nothing to replace for this column / match type.
                continue
            reference = df.loc[~is_placeholder & in_type, col]
            mean = reference.mean()
            std = reference.std()
            if m_type != 'others' or col == "rankPoints":
                rand_points = np.random.randint(mean-std, mean+std, size=size)
            else:
                rand_points = np.array([mean]*size)
            # One .loc call on the frame itself. The chained form
            # ``df[col].loc[mask] = ...`` assigns into an intermediate Series
            # and is not written back to ``df`` under pandas copy-on-write.
            # NOTE(review): writing a fractional mean into an integer column
            # relies on pandas upcasting the column — confirm on the pandas
            # version in use.
            df.loc[to_fill, col] = rand_points
    return df

  
def __one_hot_encode_data_frame(df, encoding_feature):
    """Return ``df`` with ``encoding_feature`` replaced by its indicator columns."""
    columns_to_encode = [encoding_feature]
    encoded = pd.get_dummies(df, columns=columns_to_encode)
    return encoded


def __select_features(df):
    """Drop the identifier and unused columns, keeping the rest as features.

    Args:
        df: DataFrame containing every column named in ``deleted_columns``.

    Returns:
        A new DataFrame without those columns.

    Raises:
        KeyError: If any of the listed columns is missing from ``df``.
    """
    # Pass the labels directly. The previous version first selected these
    # columns into a sub-DataFrame (copying their data) only so that drop()
    # could read the labels back from it.
    deleted_columns = [
        "Id", "groupId", "matchId", "matchType",
        "killPoints", "matchDuration", "maxPlace", "numGroups",
        "rankPoints", "teamKills", "winPoints",
        "weaponsAcquired", "revives", "roadKills",
    ]
    return df.drop(columns=deleted_columns)


def __remove_outlier(df):
    """Keep only the rows that pass every hand-tuned outlier rule.

    Each rule divides ``winPlacePerc`` (usually shifted by an offset) by one
    stat column and requires the ratio to exceed a fixed bound. Rules are
    applied one after another; a row failing any rule is removed.
    """
    # walkDistance: ratio rule plus the requirement walkDistance > 1.
    walk_ratio_ok = ((df['winPlacePerc'] + 0.5) / df['walkDistance']) > 1/13000
    df = df[walk_ratio_ok & (df['walkDistance'] > 1)]

    # killPlace: no offset, shifted denominator, negative bound.
    df = df[(df['winPlacePerc'] / (df['killPlace'] - 103)) > -1/42]

    # (column, offset added to winPlacePerc, lower bound for the ratio)
    ratio_rules = (
        ('boosts', 0.51, 1/17),
        ('heals', 0.5, 1/50),
        ('kills', 0.8, 1/30),
        ('killStreaks', 1.7, 1/6),
        ('longestKill', 0.75, 1/600),
        ('rideDistance', 0.73, 1/24000),
        ('weaponsAcquired', 3.8, 1/20),
        ('headshotKills', 0.25, 1/40),
        ('assists', 2.8, 1/5),
        ('DBNOs', 1.25, 1/16),
    )
    for column, offset, lower_bound in ratio_rules:
        df = df[((df['winPlacePerc'] + offset) / df[column]) > lower_bound]
    return df

In [13]:
def learn_ann_model(df):
  """Fit a LightGBM regressor on ``df`` and print its validation error.

  The frame is split by ``__divide_training_data``; the model is fitted on the
  training part and then passed with the held-out part to ``__validate_model``.

  Returns:
    The fitted ``LGBMRegressor``.
  """
  X_train, X_val, y_train, y_val = __divide_training_data(df)

  hyper_params = dict(
    max_depth=11,
    num_leaves=150,
    colsample_bytree=1,
    learning_rate=0.1,
    n_estimators=500,
  )
  regressor = LGBMRegressor(**hyper_params)
  regressor.fit(X_train, y_train)

  __validate_model(regressor, X_val, y_val)
  return regressor


def __divide_training_data(df):
  """Split ``df`` into train/validation features and targets.

  The target is the ``winPlacePerc`` column; every other column is a feature.
  The split uses ``test_size=0.2`` and ``random_state=42``.

  Returns:
    The four-part result of ``train_test_split(features, target, ...)``.
  """
  target_column = 'winPlacePerc'
  features = df.drop(columns=[target_column])
  target = df[target_column]
  return train_test_split(features, target, test_size=0.2, random_state=42)


def __validate_model(model,X_val, y_val):
  """Print the mean absolute error of ``model``'s predictions on ``X_val`` against ``y_val``."""
  validation_mae = mean_absolute_error(y_val, model.predict(X_val))
  print(validation_mae)

In [14]:
# Load the training CSV (file name is resolved against base_path).
df_train = load_dataset("train_V2.csv")
#print(df_train)
#print(df_train.columns)
# Drop NaN rows, filter outliers and remove the unused columns.
df_train = preprocess(df_train)
#print(df_train)
#print(df_train.columns)
# Fit the regressor; the validation MAE is printed as a side effect.
model =learn_ann_model(df_train)

# NOTE(review): `data_io` is not defined in the visible cells — confirm before enabling.
#X_test = data_io.load_dataset("test_V2.csv")

0.07119903487502721


기존 아웃라이어 제거까지 적용 + killPlace 추가 시 MAE: 0.062273283827051176

