# In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression as linear
from sklearn.linear_model import Ridge as ridge
from sklearn.linear_model import Lasso as lasso
from lightgbm import LGBMRegressor as lgbm
from lightgbm import plot_importance
import gc
import os
# Print the full path of every file found under /kaggle/input (recursively),
# so the available input files are visible in the run log.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
def reduce_ram_usage(df):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    For every plain NumPy integer or float column, the column's min and max
    are compared against the representable range of progressively wider
    dtypes (int8 -> int64, float16 -> float64). The column is converted to
    the first dtype whose range contains both values, but only when that
    dtype is actually smaller than the current one.

    Columns of any other dtype (object, bool, category, datetime, pandas
    extension dtypes, ...) are left untouched, as are empty / all-NaN columns.

    NOTE: float downcasting only checks the value *range*, not the precision,
    so converting to float16/float32 can round the stored values.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        # Dispatch on the NumPy kind instead of the dtype's string prefix, so
        # unsigned ints and bools are not mistaken for floats and non-numeric
        # dtypes never reach the range comparisons below.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "u", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # min()/max() are NaN for an empty or all-NaN column: nothing to size.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if col_type.kind == "f":
            candidates, limits = float_candidates, np.finfo
        else:
            candidates, limits = int_candidates, np.iinfo
            # Plain Python ints compare safely against any iinfo bound.
            c_min, c_max = int(c_min), int(c_max)

        for candidate in candidates:
            info = limits(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            if info.min <= c_min and c_max <= info.max:
                # Never "downcast" to a dtype that is not actually smaller.
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('\nMemory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

#Print missing values
def checkNaN(df):
    """Print each column of ``df`` that contains missing values, with its count.

    Columns without any missing value are not listed. Nothing is returned and
    ``df`` is not modified.
    """
    print("Missing Value List")
    null_counts = ((name, df[name].isnull().sum()) for name in df.columns)
    for name, n_missing in null_counts:
        if not n_missing:
            continue
        print(f"{name} : {n_missing} ")
#Drop missing values
def dropNaN(df):
    """Remove, in place, every row of ``df`` that has a missing value.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same ``df`` object, without the rows that contained a missing
        value in any column.
    """
    print("Pre-Processing...")
    rows_before = len(df)
    # One pass over the frame; also well-defined when df has no columns.
    df.dropna(axis=0, how="any", inplace=True)
    # Report how many rows were removed (rows are what gets dropped here).
    print(f"{rows_before - len(df)} Rows Dropped.")
    return df

def dropOutlier (df):
    """Drop rows with missing values and rows flagged as outliers, in place.

    The filters run one after another on the already-filtered frame, so each
    quantile is computed on the rows that survived the previous steps:

    1. rows with a missing value in any column (not counted in the report);
    2. rows whose group size is above the 0.9999 quantile of group sizes;
    3. rows above the 0.99999 quantile of each listed numeric column;
    4. rows whose ``walkDistance`` is smaller than any of the key stats;
    5. rows of matches whose max ``kills`` exceeds the match's row count;
    6. rows with ``roadKills`` but a ``rideDistance`` of 0.

    Finally ``maxPlace`` is set to 1 where ``numGroups`` is 1.

    Args:
        df: DataFrame to filter. It is modified in place.

    Returns:
        The same ``df`` object after filtering.
    """
    removed = 0

    # Drops the given index labels in place and tallies them (shortens the lines below).
    def discard(labels):
        nonlocal removed
        df.drop(index=labels, inplace=True)
        removed += len(labels)

    print("Pre-Processing...")
    for name in df.columns.to_list():
        missing = df[name].isnull()
        df.drop(index=df[missing].index, inplace=True)

    print("Droping Outliers...")
    vip_features = ["assists","boosts","DBNOs","heals","kills","killStreaks","walkDistance", "revives", "roadKills", "vehicleDestroys"]
    other_numeric = ["damageDealt","longestKill", "rideDistance", "swimDistance","weaponsAcquired", "matchDuration"]

    # Groups with too many members (removed below).
    group_sizes = df.groupby('groupId').count()["Id"]
    oversized = group_sizes[group_sizes > group_sizes.quantile(0.9999)].index
    discard(df[df.groupId.isin(oversized)].index)

    # Extreme values of the numeric columns (above the 0.99999 quantile).
    for name in vip_features + other_numeric:
        too_high = df[name] > df[name].quantile(0.99999)
        discard(df[too_high].index)

    # More kills / item uses etc. than distance walked.
    for name in vip_features:
        discard(df[df["walkDistance"] < df[name]].index)

    # More kills recorded than there are players in the match.
    most_kills = df.groupby('matchId')['kills'].transform('max')
    head_count = df.groupby('matchId')['Id'].transform('count')
    discard(df[most_kills > head_count].index)

    # Road kills without ever riding a vehicle.
    roadkill_on_foot = (df['rideDistance'] == 0) & (df['roadKills'] > 0)
    discard(df[roadkill_on_foot].index)

    # Only one group in the match: adjust the worst possible placement.
    single_group = (df.maxPlace > 1) & (df.numGroups == 1)
    df.loc[single_group, "maxPlace"] = 1

    print(f"{removed} Columns has deleted!")

    gc.collect()

    return df

def encodeMatch (df):
    """Collapse ``matchType`` into four classes and append one-hot columns.

    ``df["matchType"]`` is overwritten in place with one of ``'normal'``,
    ``'solo'``, ``'duo'`` or ``'squad'``; a new frame with the dummy columns
    of that simplified value appended on the right is returned.

    Args:
        df: DataFrame with a string ``matchType`` column.

    Returns:
        A new DataFrame: ``df`` plus one indicator column per class present.
    """
    print("Encoding matchType...")

    def simplify(raw):
        # Anything mentioning normal/crash/flare is lumped into 'normal' first,
        # so e.g. a value containing both 'normal' and 'solo' ends up 'normal'.
        if ('normal' in raw) or ('crash' in raw) or ('flare' in raw):
            return 'normal'
        if 'solo' in raw:
            return 'solo'
        if 'duo' in raw:
            return 'duo'
        # Whatever is left is treated as a squad mode.
        return 'squad'

    df["matchType"] = df["matchType"].apply(simplify)

    one_hot = pd.get_dummies(df["matchType"])
    df = pd.concat([df, one_hot], axis=1)

    gc.collect()
    return df

def makeCols (df) :
    """Add group-level statistics and their within-match ranks as new columns.

    Steps:
      * ``killPlace`` is (re)computed as the descending rank of ``kills``
        within each ``matchId`` (written into the passed-in ``df``).
      * For every stat feature and every statistic (max/mean/median/min), a
        ``{feature}_{stat}`` column holds the statistic over the row's
        ``groupId``, and a ``{feature}_{stat}Place`` column holds the
        descending rank of that value within the row's ``matchId``.
      * The result is passed through ``reduce_ram_usage``.

    Args:
        df: DataFrame containing ``matchId``, ``groupId`` and the stat
            feature columns listed below.

    Returns:
        A new DataFrame with the additional columns appended on the right.
    """
    print("Making columns...")
    n_cols_before = len(df.columns)

    # killPlace recomputed from kills alone, so it carries no data leakage.
    df["killPlace"] = df.groupby("matchId")["kills"].transform('rank', ascending=False)

    stat_feature = ["assists",
                    "boosts",
                    "DBNOs",
                    "heals",
                    "kills",
                    "killStreaks",
                    "walkDistance",
                    "revives",
                    "roadKills",
                    "vehicleDestroys",
                    "longestKill",
                    "rideDistance",
                    "swimDistance",
                    "weaponsAcquired"]
    stat_list = ["max","mean","median","min"]

    # Per-group column stats, and the per-match rank of each group stat.
    # The new columns are collected and concatenated once: concatenating
    # inside the loop would copy the whole (growing) frame on every iteration.
    new_cols = []
    for col in stat_feature:
        by_group = df.groupby("groupId")[col]
        for stat in stat_list:
            group_stat = by_group.transform(stat).rename(f"{col}_{stat}")
            place = (group_stat.groupby(df["matchId"])
                     .transform('rank', ascending=False)
                     .rename(f"{col}_{stat}Place"))
            new_cols.append(group_stat)
            new_cols.append(place)
    df = pd.concat([df, *new_cols], axis=1)

    # Report the number of columns actually added rather than a hand-computed one.
    print(len(df.columns) - n_cols_before, f"columns Made! Now {len(df.columns)} column in DF.")
    df = reduce_ram_usage(df)
    return df

def LGBM_training_mae(target, features):
    """Fit a default LightGBM regressor and return its MAE on the same data.

    The model is trained on ``features``/``target`` and evaluated on those
    same rows, so the result is a training error, not a validation score.

    Args:
        target: Values to predict.
        features: Feature matrix used both for fitting and for prediction.

    Returns:
        The mean absolute error, rounded to 4 decimal places.
    """
    # The file imports LGBMRegressor under the alias ``lgbm``; the bare name
    # ``LGBMRegressor`` is not defined here.
    model = lgbm().fit(features, target)
    pred = model.predict(features)
    mae = np.round(mean_absolute_error(target, pred), 4)
    return mae

def main() :
    """Run the full pipeline: load data, build features, train, write submission.

    Reads the train/test CSVs, cleans and feature-engineers the training set,
    feature-engineers the test set without removing any rows, fits a default
    LightGBM regressor on all training rows and writes ``submission.csv``.
    """
    dropped_cols = ["Id","groupId", "matchType","matchId", "numGroups","damageDealt"]

    print("Data loading...")
    train = pd.read_csv("../input/pubg-finish-placement-prediction/train_V2.csv")
    test = pd.read_csv("../input/pubg-finish-placement-prediction/test_V2.csv")
    print("Data loaded!")

    # pre-processing train
    checkNaN(train)
    train = dropNaN(train)
    train = dropOutlier(train)
    train = encodeMatch(train)
    train = makeCols(train)
    train = train.drop(dropped_cols, axis=1)

    # pre-processing test
    # Rows must NOT be removed from the test set: every Id needs a prediction,
    # and dropping rows would make the result shorter than the submission file.
    # So dropNaN/dropOutlier are skipped here.
    # NOTE(review): this relies on the model tolerating missing values in the
    # test features - confirm against the LightGBM documentation.
    checkNaN(test)
    test_ids = test["Id"].to_numpy()
    # Same maxPlace adjustment dropOutlier applies to train (single-group matches).
    test.loc[(test.maxPlace>1)&(test.numGroups==1), "maxPlace"] = 1
    test = encodeMatch(test)
    test = makeCols(test)
    test = test.drop(dropped_cols, axis=1)

    X = train.drop(["winPlacePerc"], axis=1)
    y = train['winPlacePerc']

    # The file imports LGBMRegressor under the alias ``lgbm``.
    model = lgbm()
    model.fit(X, y)

    # Select the test columns in the training feature order; this fails loudly
    # (KeyError) if a training feature is missing from the test set.
    result = model.predict(test[X.columns])

    sample_submission = pd.read_csv("../input/pubg-finish-placement-prediction/sample_submission_V2.csv",index_col = "Id")
    # Align predictions to the submission rows by Id instead of by position.
    predictions = pd.Series(result, index=test_ids)
    sample_submission["winPlacePerc"] = predictions.reindex(sample_submission.index)
    sample_submission.to_csv("submission.csv")
    
# Run the pipeline only when this file is executed as a script.
if __name__=="__main__" :
    main()
