In [1]:
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pylab as plt
import seaborn as sns
from tqdm import tqdm
from sklearn import preprocessing
from scipy.stats import entropy, kurtosis

from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one (the default setting is "last_expr").
InteractiveShell.ast_node_interactivity = "all"
# Raise pandas' display limits so wide/long frames are shown without
# truncation; set_option accepts several (option, value) pairs in one call.
pd.set_option(
    'display.max_rows', 100,
    'display.max_columns', 100,
    'display.max_colwidth', 100,
    'display.width', 100,
    'display.max_info_rows', 100,
)

In [2]:
# Root directory that holds the competition data. The location can be
# overridden with the JET_DATA_DIR environment variable so the notebook runs
# on machines without a D: drive; the original location remains the default.
path = os.environ.get('JET_DATA_DIR', 'D:/data/')
# Later cells build file names with `path + '...'`, so make sure the root
# always ends with a path separator.
if not path.endswith(('/', '\\')):
    path += '/'

In [3]:
def reduce_mem(df, verbose=True):
    """Downcast the numeric columns of ``df`` to smaller dtypes.

    Integer columns (int16/int32/int64) are converted to the narrowest signed
    integer type whose range contains the column's min and max (bounds are
    inclusive, so e.g. a column spanning exactly [-128, 127] becomes int8).
    Float columns (float16/float32/float64) are converted to float32 when
    their min and max fit in the float32 range, otherwise to float64.
    float16 is never used as a target; the original code notes it should
    only be used when the frame is not written with ``to_feather``.

    Columns of any other dtype are left untouched. Columns whose min/max are
    NaN (e.g. empty or all-NaN columns) fail every range check: integer
    columns are then left as-is and float columns become float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        If true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Converting float64 to float32 keeps the value range but loses precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Ordered narrowest first so the first fitting type is the smallest one.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: the type's own limits are valid values.
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif c_min >= float32_info.min and c_max <= float32_info.max:
            df[col] = df[col].astype(np.float32)
        else:
            # Out of float32 range, infinite, or NaN min/max: keep full width.
            df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting size to avoid a 0/0 division.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction))
    return df

In [4]:
# Training particle table: read the CSV, downcast its numeric columns, and
# save the result as Feather.
train = reduce_mem(
    pd.read_csv(path+'jet_complex_data/jet_complex_data/complex_train_R04_particle.csv')
)
train.to_feather(path+'jet_complex_data/complex_train_R04_particle.feather')
# Drop the frame and force a collection before the next file is loaded.
del train
gc.collect()

Mem. usage decreased to 695.15 Mb (46.4% reduction)


7

In [8]:
# Test particle table: read the CSV, downcast its numeric columns, and save
# the result as Feather.
test = reduce_mem(
    pd.read_csv(path+'jet_complex_data/jet_complex_data/complex_test_R04_particle.csv')
)
test.to_feather(path+'jet_complex_data/complex_test_R04_particle.feather')
# Drop the frame and force a collection before the next file is loaded.
del test
gc.collect()

In [6]:
# Training jet table: read the CSV, downcast its numeric columns, and save
# the result as Feather.
train = reduce_mem(
    pd.read_csv(path+'jet_complex_data/jet_complex_data/complex_train_R04_jet.csv')
)
train.to_feather(path+'jet_complex_data/complex_train_R04_jet.feather')
# Drop the frame and force a collection before the next file is loaded.
del train
gc.collect()

Mem. usage decreased to 41.12 Mb (47.2% reduction)


20

In [7]:
# Test jet table: read the CSV, downcast its numeric columns, and save the
# result as Feather.
test = reduce_mem(
    pd.read_csv(path+'jet_complex_data/jet_complex_data/complex_test_R04_jet.csv')
)
test.to_feather(path+'jet_complex_data/complex_test_R04_jet.feather')
# Drop the frame and force a collection once the file has been written.
del test
gc.collect()

Mem. usage decreased to 18.98 Mb (42.2% reduction)


20