In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import tqdm
import gc

%matplotlib inline

In [2]:
# If RAM is limited, set total_rows = 10000 or 100000; otherwise, set total_rows = None
total_rows = None
date_train = pd.read_csv('data/train_date.csv.zip', index_col=0, nrows=total_rows)

# Drop rows and columns if they are redundant
# date_train.dropna(axis=0, how='all', inplace=True)
date_train.dropna(axis=1, how='all', inplace=True)

# Load labels
response = pd.read_csv('data/train_numeric.csv.zip', index_col=0, 
                       usecols=[0, 969], nrows=total_rows)

  mask |= (ar1 == a)


In [3]:
date_test = pd.read_csv('data/test_date.csv.zip', index_col=0, nrows=total_rows)

  mask |= (ar1 == a)


In [4]:
date = pd.concat([date_train, date_test], keys=['train', 'test'])

In [5]:
date.sort_values('Id').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,L0_S0_D19,...,L3_S50_D4246,L3_S50_D4248,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263
Unnamed: 0_level_1,Id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
test,1,,,,,,,,,,,...,,,,,,,,,,
test,2,,,,,,,,,,,...,,,,,,,,,,
test,3,,,,,,,,,,,...,,,,,,,,,,
train,4,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,...,,,,,,,,,,
test,5,255.45,255.45,255.45,255.45,255.45,255.45,255.45,255.45,255.45,255.45,...,,,,,,,,,,


In [6]:
# date_all.to_csv("data/date_all.csv.gz", compression="gzip")

In [7]:
date.shape

(2367495, 1156)

In [8]:
date_train.shape

(1183747, 1156)

Restructure columns

Each column of date records a time stamp in the production line and tracks flows of parts. The columns follow a name convention of Line_Station_Feature.

The structure of production line, station, and feature is hierarchical, i.e., there is no feature belongs to two stations, and no station belongs to two lines. So we can restructure the columns.


In [9]:
# Extract station and feature names, and build a new pandas MultiIndex object
new_columns = pd.MultiIndex.from_tuples([tuple([int(a[1:]) 
                                          for a in x[3:].split('_')])
                                          for x in date.columns], 
                                          names=['station', 'feature'])
date.columns = new_columns

In [10]:
# Extract station names
stations = sorted([x for x in date.columns.levels[0]])

In [11]:
# Create station flow as strings
# date_station = date.groupby(level=0, axis=1).sum().apply(
#     lambda x: (x>0).astype(int).astype(str), raw=True).apply(
#     lambda x: ''.join(x.values), axis=1)
# date_station = pd.DataFrame(date_station).join(response)
# date_station.columns = ['station_flow', 'Response']

In [16]:
date.head()

Unnamed: 0_level_0,station,0,0,0,0,0,0,0,0,0,0,...,50,50,50,50,50,51,51,51,51,51
Unnamed: 0_level_1,feature,1,3,5,7,9,11,13,15,17,19,...,4246,4248,4250,4252,4254,4255,4257,4259,4261,4263
Unnamed: 0_level_2,Id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
train,4,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,...,,,,,,,,,,
train,6,,,,,,,,,,,...,,,,,,,,,,
train,7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,...,,,,,,,,,,
train,9,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,...,,,,,,,,,,
train,11,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,...,,,,,,,,,,


In [15]:
a = date.min(axis=1, level=0)

In [22]:
a.head()

Unnamed: 0_level_0,station,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
Unnamed: 0_level_1,Id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
train,4,82.24,82.24,82.24,,82.26,,,82.26,82.27,,...,,,,,,,,,,
train,6,,,,,,,,,,,...,,,,,,,,,,
train,7,1618.7,1618.7,1618.7,,,1618.72,1618.72,,1618.73,,...,,,,,,,,,,
train,9,1149.2,1149.2,1149.21,,1149.22,,,1149.22,1149.22,,...,,,,,,,,,,
train,11,602.64,602.64,,602.64,602.66,,,602.67,602.67,,...,,,,,,,,,,


In [23]:
b = a.fillna(0).astype(bool)

In [30]:
c = b.apply(lambda x: hash(tuple(x.values)), axis=1)

In [31]:
c

       Id     
train  4          -486890664552569235
       6         -4967969351282693796
       7          2003576129710373782
       9          2588167681413458674
       11        -5878613314420641950
       13         4278220292568782734
       14        -7957126925861168414
       16         5423229247760839957
       18         2588167681413458674
       23          403473567243099458
       26         2588167681413458674
       27        -7192200703032777600
       28        -5890130387930503760
       31          322684002963325913
       34        -8882253314188101660
       38         1284449656069353514
       41         9093101336628574256
       44        -8399627029869753458
       47        -7243726917410288786
       49        -3765320621560641642
       52           29858755121874359
       55        -7502289153662673564
       56         5170259863918461319
       57         2003576129710373782
       63        -4463937189231780334
       68        -67095744187144293

In [13]:
#date_station_hash = date.groupby(level=0, axis=1).sum().apply(
#    lambda x: (x>0).astype(int).astype(str), raw=True).apply(
#    lambda x: ''.join(x.values), axis=1).apply(
#    lambda x: hash(x)%2**26)

In [33]:
c['train'].to_csv('benchmark_features/benchmark_2/train_station_flow.csv.gz', compression='gzip')
c['test'].to_csv('benchmark_features/benchmark_2/test_station_flow.csv.gz', compression='gzip')

In [34]:
c['train'].shape

(1183747,)

In [35]:
c['train'].iloc[369:375]

Id
724    2155058236440751936
725    8935856773699965683
726    2974502267224677574
727   -4463937189231780334
729    3693628740865697842
730   -3441403407394341377
dtype: int64

In [16]:
c['train'].iloc[369:375]

Id
724    14388745
725    33156445
726    53919033
727    32738570
729    42042335
730    61621967
dtype: int64