In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm_notebook

### 1. Deleting unused variables and gc.collect()

Python does not give memory back as soon as we stop needing a value: every object that is still bound to a name, including intermediate results we will never use again, keeps occupying RAM. So we have to keep two things in mind.

1. Delete variables you no longer need with `del`.

2. After deleting them with `del`, call `gc.collect()` to force a garbage-collection pass so that the memory is actually reclaimed.

In [None]:
# Load the full training CSV without a dtype map, so pandas infers every column's dtype.
data_df = pd.read_csv('./data/train_V2.csv')

In [None]:
# `gc` is not part of the notebook's import cell; without this import the
# gc.collect() call below fails with NameError on a fresh kernel.
import gc

# Make a throwaway copy so there is something large to release.
data_df_sample = data_df.copy()

# `del` only removes the name binding; the explicit collection pass that
# follows asks the garbage collector to reclaim unreachable objects right away.
del data_df_sample
gc.collect()

### 2. Presetting the datatypes

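When pandas reads a file it infers each column's data type on its own and defaults to 64-bit integers and floats, which wastes a lot of memory. If we know the value range of each column in advance, we can declare smaller data types up front and use memory much more effectively.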

In [None]:
# Explicit per-column dtypes for train_V2.csv, meant to be passed to
# pd.read_csv(dtype=...) so pandas does not fall back to 64-bit numbers.
# float16 keeps only about three significant decimal digits; switch a column
# to 'float32' if that rounding matters for it.
dtypes = {
    # Identifiers are hash strings in the V2 files (e.g. '7f96b2f878858a'),
    # so they cannot be parsed as unsigned integers.
    'Id'                : 'object',
    'groupId'           : 'object',
    'matchId'           : 'object',
    'assists'           : 'uint8',
    'boosts'            : 'uint8',
    'damageDealt'       : 'float16',
    'DBNOs'             : 'uint8',
    'headshotKills'     : 'uint8',
    'heals'             : 'uint8',
    'killPlace'         : 'uint8',
    # Rating-style scores reach into the thousands and do not fit in uint8.
    'killPoints'        : 'uint16',
    'kills'             : 'uint8',
    'killStreaks'       : 'uint8',
    'longestKill'       : 'float16',
    # Columns present in the V2 files that the map previously left to inference.
    'matchDuration'     : 'uint16',
    'matchType'         : 'category',
    'maxPlace'          : 'uint8',
    'numGroups'         : 'uint8',
    # Signed: the data uses -1 as a placeholder value.
    'rankPoints'        : 'int16',
    'revives'           : 'uint8',
    'rideDistance'      : 'float16',
    'roadKills'         : 'uint8',
    'swimDistance'      : 'float16',
    'teamKills'         : 'uint8',
    'vehicleDestroys'   : 'uint8',
    'walkDistance'      : 'float16',
    'weaponsAcquired'   : 'uint8',
    'winPoints'         : 'uint16',
    'winPlacePerc'      : 'float16',
}

In [None]:
# Set the column data types explicitly through pd.read_csv(dtype=...)
data_dtypes = pd.read_csv('./data/train_V2.csv', dtype=dtypes)
data_df = pd.read_csv('./data/train_V2.csv') # pandas picks the data types on its own

In [None]:
# Call the method: a bare `data_dtypes.info` only shows the bound-method repr
# instead of the per-column dtype and memory-usage summary.
data_dtypes.info()

In [None]:
# Call the method: a bare `data_df.info` only shows the bound-method repr
# instead of the per-column dtype and memory-usage summary.
data_df.info()

### If you would rather not declare the dtypes by hand, you can use a helper function shared by a Kaggler.

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` to the smallest dtype that holds their range.

    Signed integers stay signed, unsigned integers stay unsigned and floats
    stay floats; a column is never widened. Columns that are not plain NumPy
    integer/float columns (object, bool, datetime, category, nullable
    extension dtypes, ...) are left untouched.

    Note that float16 keeps only about three significant decimal digits, so
    float values may be rounded by the downcast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default False
        When True, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # dtype.kind -> (limits lookup, candidate dtypes ordered from smallest to largest)
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32)),
        'f': (np.finfo, (np.float16, np.float32)),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not np.dtype instances; bool/object/datetime
        # have a kind outside the table. Skip all of them.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        limits_of, candidates = candidates_by_kind[col_type.kind]

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            # NaN min/max (empty or all-NaN column) fail both comparisons, so
            # such columns keep their current dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast the numeric columns of the frame and rebind the name to the returned frame.
data_df = reduce_mem_usage(data_df)

### 3. Importing selected rows of a file (sampling)
If the data is as large as in this competition, you can work on a sample. While you are only checking that your code runs correctly, load a subset of the rows instead of all of them (this is what we call debug mode below).

In [None]:
# Read only the first 10,000 rows of the file (nrows), using the explicit dtype map.
train_dtypes = pd.read_csv('./data/train_V2.csv',nrows=10000 , dtype=dtypes)

### Is this random sampling? (No: `nrows` only reads the first N rows of the file, so it is not a random sample.)

### 4. Importing just selected columns
If you want to analyze just some specific feature, you can import just the selected columns.

In [None]:
# dtype map restricted to the kill-related columns we want to analyse.
# NOTE: this rebinds the notebook-level `dtypes` name, so later cells that
# pass dtype=dtypes see only these nine entries.
dtypes = {
    # Identifiers are hash strings in the V2 files, so they cannot be parsed
    # as unsigned integers.
    'Id'                : 'object',
    'groupId'           : 'object',
    'matchId'           : 'object',
    'killPlace'         : 'uint8',
    # Rating-style score that reaches into the thousands; does not fit in uint8.
    'killPoints'        : 'uint16',
    'kills'             : 'uint8',
    'killStreaks'       : 'uint8',
    'longestKill'       : 'float16',
    'winPlacePerc'      : 'float16',
}

# Load exactly the columns that have a declared dtype, so the column list and
# the dtype map cannot drift apart.
columns = list(dtypes)

example = pd.read_csv('./data/train_V2.csv', usecols=columns, dtype=dtypes)

### 5. Using debug mode
Many people build feature-engineering and prediction pipelines. However, if the data is large, creating a feature or training a model takes too long. In this case, we can save time and effort by drawing a sample in advance, as mentioned above.<br>
<br>
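When testing logic or code there is no need to load a lot of data, so load just a little and test the logic!<br>
While checking the code logic, run with debug=True (only the first 10,000 training rows are loaded).<br>
For the real model run, set debug=False so that the full data is loaded.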

In [None]:
# Debug switch: True loads only a small slice of the training data so the
# pipeline can be exercised quickly; False loads everything.
debug = True

# nrows=None makes read_csv load the whole file.
train_nrows = 10000 if debug else None

df_train = pd.read_csv('./data/train_V2.csv', nrows=train_nrows, dtype=dtypes)
# The test frame was previously read from the training file. The path below
# assumes the competition's test file sits next to it; adjust if it lives elsewhere.
# dtype entries for columns missing from the test file are ignored by read_csv.
df_test = pd.read_csv('./data/test_V2.csv', dtype=dtypes)

### 6. Lightgbm: prevent RAM spike (explode) at the init training
의사결정 트리에서 가장 좋은 성능을 내는 모델<br>
아직 잘 모르겠음....

### LightGBM Faster Training
### https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/56158

### Lightgbm: prevent RAM spike (explode) at the init training
### https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773