In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import tqdm
import gc

%matplotlib inline

In [2]:
# If RAM is limited, set total_rows = 10000 or 100000; otherwise, set total_rows = None
total_rows = None
date_train = pd.read_csv('data/train_date.csv.zip', index_col=0, nrows=total_rows)

# Drop rows and columns if they are redundant
# date_train.dropna(axis=0, how='all', inplace=True)
date_train.dropna(axis=1, how='all', inplace=True)

# Load labels
response = pd.read_csv('data/train_numeric.csv.zip', index_col=0, 
                       usecols=[0, 969], nrows=total_rows)

  mask |= (ar1 == a)


In [3]:
date_test = pd.read_csv('data/test_date.csv.zip', index_col=0, nrows=total_rows)

  mask |= (ar1 == a)


In [4]:
date = pd.concat([date_train, date_test], keys=['train', 'test'])

In [5]:
date.sort_values('Id').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,L0_S0_D19,...,L3_S50_D4246,L3_S50_D4248,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263
Unnamed: 0_level_1,Id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
test,1,,,,,,,,,,,...,,,,,,,,,,
test,2,,,,,,,,,,,...,,,,,,,,,,
test,3,,,,,,,,,,,...,,,,,,,,,,
train,4,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,...,,,,,,,,,,
test,5,255.45,255.45,255.45,255.45,255.45,255.45,255.45,255.45,255.45,255.45,...,,,,,,,,,,


In [6]:
# date_all.to_csv("data/date_all.csv.gz", compression="gzip")

In [7]:
date.shape

(2367495, 1156)

In [8]:
date_train.shape

(1183747, 1156)

Restructure columns

Each column of date records a time stamp in the production line and tracks flows of parts. The columns follow a name convention of Line_Station_Feature.

The structure of production line, station, and feature is hierarchical, i.e., there is no feature belongs to two stations, and no station belongs to two lines. So we can restructure the columns.


In [9]:
# Extract station and feature names, and build a new pandas MultiIndex object
new_columns = pd.MultiIndex.from_tuples([tuple([int(a[1:]) 
                                          for a in x[3:].split('_')])
                                          for x in date.columns], 
                                          names=['station', 'feature'])
date.columns = new_columns

In [10]:
# Extract station names
stations = sorted([x for x in date.columns.levels[0]])

In [11]:
# Create station flow as strings
# date_station = date.groupby(level=0, axis=1).sum().apply(
#     lambda x: (x>0).astype(int).astype(str), raw=True).apply(
#     lambda x: ''.join(x.values), axis=1)
# date_station = pd.DataFrame(date_station).join(response)
# date_station.columns = ['station_flow', 'Response']

In [13]:
date_station_hash = date.groupby(level=0, axis=1).sum().apply(
    lambda x: (x>0).astype(int).astype(str), raw=True).apply(
    lambda x: ''.join(x.values), axis=1).apply(
    lambda x: hash(x)%2**26)

In [14]:
date_station_hash['train'].to_csv('train_station_flow.csv.gz', compression='gzip')
date_station_hash['test'].to_csv('test_station_flow.csv.gz', compression='gzip')

In [15]:
date_station_hash['train'].shape

(1183747,)

In [16]:
date_station_hash['train'].iloc[369:375]

Id
724    14388745
725    33156445
726    53919033
727    32738570
729    42042335
730    61621967
dtype: int64