In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pylab as plt
import seaborn as sns
import gc
from tqdm import tqdm


from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Let pandas show up to 100 rows and 100 columns before truncating a frame's repr.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [2]:
# Root directory that contains `jet_complex_data/`. It defaults to the original
# location; set the JET_DATA_ROOT environment variable to run the notebook on
# another machine without editing this cell. `os.path.join(root, '')` makes sure
# the value ends with a separator, because the rest of the notebook builds file
# names with plain `path + '...'` concatenation.
path = os.path.join(os.environ.get('JET_DATA_ROOT') or 'E:/', '')

# Number of per-jet slots kept once the particle table has been pivoted wide.
num_max = 80
# Width of the wide table that is kept: the `jet_id` column (+ 1) followed by
# 4 feature columns for each of the `num_max` slots.
cut = num_max * 4 + 1

# Alternative loader, presumably for a Feather copy of the same table:
# train_jet = pd.read_feather(path+'jet_complex_data/complex_train_R04_jet.feather')

train_jet = pd.read_csv(path+'jet_complex_data/complex_train_R04_jet.csv')

In [3]:
# Encode the raw jet labels as consecutive integers 0..K-1, numbered in
# ascending order of the original label values.
label_map = {
    raw_label: code
    for code, raw_label in enumerate(np.sort(train_jet['label'].unique()))
}
train_jet['label'] = train_jet['label'].map(label_map)

In [4]:
def reduce_mem(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are touched; every other column (int8, unsigned, bool, object, ...) is
    left as it is.

    * Integer columns are cast to the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max. The bounds are inclusive,
      so a column whose max is exactly 127 still fits in int8.
    * Float columns are cast to float32 when their min and max lie inside the
      float32 range, otherwise to float64. float16 is never produced (the
      original note ties this to the frame being written with ``to_feather``).
      Narrowing float64 to float32 keeps the range but drops precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned, so the caller's object
        is modified.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)  # narrowest first
    float32_limits = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # An empty column yields NaN bounds; every comparison is then
            # False and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif float32_limits.min <= c_min and c_max <= float32_limits.max:
            df[col] = df[col].astype(np.float32)
        else:
            # Out of float32 range, or NaN/inf bounds: keep full precision.
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame that reports zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction))
    return df

In [5]:
def trans_data(df):
    """Pivot a long table into one row per ``jet_id``.

    Rows sharing a ``jet_id`` are numbered 1, 2, ... in their order of
    appearance, and every other column ``c`` is spread into ``c_1``, ``c_2``,
    ... according to that number. Output columns are grouped by slot number
    (all ``*_1`` columns, then all ``*_2`` columns, and so on). Jets with fewer
    rows than the longest jet get NaN in the unused slots.

    Returns a new DataFrame with ``jet_id`` as an ordinary column.
    """
    # 1-based position of each row inside its jet.
    slot = df.groupby('jet_id').cumcount() + 1

    # (jet_id, slot) becomes the row index; unstacking the slot level moves it
    # into the columns, which are then (original column, slot) pairs.
    wide = df.set_index(['jet_id', slot]).unstack()

    # Order the column pairs by slot so each slot's features sit together.
    wide = wide.sort_index(axis=1, level=1)

    # Flatten every (name, slot) pair into a single "name_slot" label.
    wide.columns = wide.columns.map(lambda pair: '{}_{}'.format(pair[0], pair[1]))
    return wide.reset_index()

In [6]:
# Particle-table columns that are dropped before the table is pivoted wide.
drop_list = ['particle_category', 'particle_mass']

In [7]:
# Build the wide training table (one row per jet) and cache it as Feather.
train_particle = pd.read_feather(path+'jet_complex_data/complex_train_R04_particle.feather')
train_particle = train_particle.drop(drop_list, axis=1)
train_trans = trans_data(train_particle)
# Keep the first `cut` columns only (`jet_id` plus the leading feature columns);
# slots that a jet does not fill are NaN after the pivot and are set to 0.
train_trans = train_trans[train_trans.columns.to_list()[:cut]].fillna(0)
# Attach each jet's label; the left join keeps every pivoted jet.
train_trans = pd.merge(train_trans, train_jet[['jet_id', 'label']], on='jet_id', how='left')
# The long table is no longer needed once the wide one exists.
del train_particle
# Optional downcast, currently disabled:
# train_trans = reduce_mem(train_trans)
# `null_counts` was renamed to `show_counts` in pandas 1.2 and removed in 2.0.
# Try the current keyword first and fall back for older installations, so the
# cell does not fail right before the result is written to disk.
try:
    train_trans.info(verbose=True, show_counts=True)
except TypeError:
    train_trans.info(verbose=True, null_counts=True)
train_trans.to_feather(path+'jet_complex_data/train_trans.feather')
del train_trans

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1134555 entries, 0 to 1134554
Data columns (total 322 columns):
jet_id                1134555 non-null object
particle_energy_1     1134555 non-null float32
particle_px_1         1134555 non-null float32
particle_py_1         1134555 non-null float32
particle_pz_1         1134555 non-null float32
particle_energy_2     1134555 non-null float32
particle_px_2         1134555 non-null float32
particle_py_2         1134555 non-null float32
particle_pz_2         1134555 non-null float32
particle_energy_3     1134555 non-null float32
particle_px_3         1134555 non-null float32
particle_py_3         1134555 non-null float32
particle_pz_3         1134555 non-null float32
particle_energy_4     1134555 non-null float32
particle_px_4         1134555 non-null float32
particle_py_4         1134555 non-null float32
particle_pz_4         1134555 non-null float32
particle_energy_5     1134555 non-null float32
particle_px_5         1134555 non-null fl

In [8]:
# Build the wide test table (one row per jet) and cache it as Feather.
test_particle = pd.read_feather(path+'jet_complex_data/complex_test_R04_particle.feather')
test_particle = test_particle.drop(drop_list, axis=1)
test_trans = trans_data(test_particle)
# Keep the first `cut` columns only (`jet_id` plus the leading feature columns);
# slots that a jet does not fill are NaN after the pivot and are set to 0.
test_trans = test_trans[test_trans.columns.to_list()[:cut]].fillna(0)
# Constant placeholder so the test table carries a `label` column as well.
test_trans['label'] = 0
# The long table is no longer needed once the wide one exists.
del test_particle
# Optional downcast, currently disabled:
# test_trans = reduce_mem(test_trans)
# `null_counts` was renamed to `show_counts` in pandas 1.2 and removed in 2.0.
# Try the current keyword first and fall back for older installations, so the
# cell does not fail right before the result is written to disk.
try:
    test_trans.info(verbose=True, show_counts=True)
except TypeError:
    test_trans.info(verbose=True, null_counts=True)
test_trans.to_feather(path+'jet_complex_data/test_trans.feather')
del test_trans

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537949 entries, 0 to 537948
Data columns (total 322 columns):
jet_id                537949 non-null object
particle_energy_1     537949 non-null float32
particle_px_1         537949 non-null float32
particle_py_1         537949 non-null float32
particle_pz_1         537949 non-null float32
particle_energy_2     537949 non-null float32
particle_px_2         537949 non-null float32
particle_py_2         537949 non-null float32
particle_pz_2         537949 non-null float32
particle_energy_3     537949 non-null float32
particle_px_3         537949 non-null float32
particle_py_3         537949 non-null float32
particle_pz_3         537949 non-null float32
particle_energy_4     537949 non-null float32
particle_px_4         537949 non-null float32
particle_py_4         537949 non-null float32
particle_pz_4         537949 non-null float32
particle_energy_5     537949 non-null float32
particle_px_5         537949 non-null float32
particle_py_5  