In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pylab as plt
import seaborn as sns
import gc
from tqdm import tqdm

from IPython.core.interactiveshell import InteractiveShell
# Echo every top-level expression of a cell, not just the last one
# (later cells rely on this to show a bare `len(...)` next to a print).
InteractiveShell.ast_node_interactivity = "all"

# Raise both pandas display limits to 100 so wide/long frames are not truncated early.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

In [2]:
# Directory holding the competition CSVs; every later read/write is relative to it.
path = 'D:/data/molecule_open_data/'

# Read, in order: training features, validation/test features, training targets.
train, test, label = (
    pd.read_csv(path + csv_name)
    for csv_name in ('candidate_train.csv', 'candidate_val.csv', 'train_answer.csv')
)

In [9]:
# Print shape, dtype counts and memory footprint for each loaded frame.
for loaded_frame in (train, test, label):
    loaded_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79267 entries, 0 to 79266
Columns: 3178 entries, 0 to id
dtypes: float64(21), int64(3156), object(1)
memory usage: 1.9+ GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26358 entries, 0 to 26357
Columns: 3178 entries, 0 to id
dtypes: float64(21), int64(3156), object(1)
memory usage: 639.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79267 entries, 0 to 79266
Data columns (total 7 columns):
id    79267 non-null object
p1    79267 non-null float64
p2    79267 non-null float64
p3    79267 non-null float64
p4    79267 non-null float64
p5    79267 non-null float64
p6    79267 non-null float64
dtypes: float64(6), object(1)
memory usage: 4.2+ MB


In [7]:
# Columns with exactly one distinct (non-null) value in train carry no signal.
drop_feat = [col for col in train.columns if train[col].nunique() == 1]
len(drop_feat)
print(drop_feat)

51

['0', '1', '2', '3', '4', '5', '19', '21', '26', '32', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78']


In [8]:
# Keep every train column that was not flagged as constant, preserving column order.
constant_cols = set(drop_feat)
used_feat = [name for name in train.columns if name not in constant_cols]
print(len(used_feat))

3127


In [9]:
# Restrict both frames to the retained columns. The frames are rebound one at a
# time so the wide original `train` can be released before `test` is subset.
train = train.loc[:, used_feat]
test = test.loc[:, used_feat]
# Reclaim the memory of the dropped wide frames right away.
gc.collect()

28624

In [10]:
# Attach the target columns from `label` to each training row, matching on `id`;
# a left join keeps every row of `train`.
train = train.merge(label, how='left', on='id')

In [11]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to narrower dtypes.

    Integer columns (int16/int32/int64) are cast to the smallest of
    int8, int16, int32, int64 whose *inclusive* range contains the column's
    minimum and maximum. Float columns (float16/float32/float64) are cast to
    float32 when their range fits, otherwise to float64. float16 is never
    produced: per the original author's note it is only usable when the frame
    is not written with ``to_feather``. Columns of any other dtype are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Ordered narrowest-first so the first fitting candidate is the smallest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in tqdm(df.columns):
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                # Inclusive bounds: a column whose extreme equals the dtype
                # limit (e.g. -128 or 127 for int8) still fits that dtype.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Anything inside the float32 range (which includes the float16
            # range) becomes float32; float16 is deliberately skipped.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out-of-range values (or an all-NaN / empty column, where the
                # comparisons are false) keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction))
    return df

In [12]:
# Downcast both frames (train first, then test); the helper returns the frame it was given.
train, test = (reduce_mem_usage(wide_frame) for wide_frame in (train, test))
# Free the wider column buffers that were just replaced.
gc.collect()

Mem. usage decreased to 244.10 Mb (87.1% reduction)
Mem. usage decreased to 80.36 Mb (87.2% reduction)


0

In [13]:
# Persist the reduced frames next to the source CSVs in Feather format.
for reduced_frame, feather_name in ((train, 'train.feather'), (test, 'test.feather')):
    reduced_frame.to_feather(path + feather_name)