In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import tqdm

%matplotlib inline

In [2]:
# If RAM is limited, set total_rows = 10000 or 100000; otherwise, set total_rows = None
total_rows = None
date = pd.read_csv('data/train_date.csv.zip', index_col=0, nrows=total_rows)

# Drop rows and columns if they are redundant
date.dropna(axis=0, how='all', inplace=True)
date.dropna(axis=1, how='all', inplace=True)

# Load labels
response = pd.read_csv('data/train_numeric.csv.zip', index_col=0, 
                       usecols=[0, 969], nrows=total_rows)

  mask |= (ar1 == a)


Restructure columns

Each column of date records a time stamp in the production line and tracks flows of parts. The columns follow a name convention of Line_Station_Feature.

The structure of production line, station, and feature is hierarchical, i.e., there is no feature belongs to two stations, and no station belongs to two lines. So we can restructure the columns.


In [3]:
# Extract station and feature names, and build a new pandas MultiIndex object
new_columns = pd.MultiIndex.from_tuples([tuple([int(a[1:]) 
                                          for a in x[3:].split('_')])
                                          for x in date.columns], 
                                          names=['station', 'feature'])
date.columns = new_columns

In [4]:
# Extract station names
stations = sorted([x for x in date.columns.levels[0]])

In [141]:
date_station = date.groupby(level=0, axis=1).sum().apply(lambda x: (x>0).astype(int).astype(str), raw=True).apply(lambda x: ''.join(x.values))

In [144]:
date_station = date_station.join(response)



In [145]:
date_station

Unnamed: 0_level_0,"(0, 1)","(0, 3)","(0, 5)","(0, 7)","(0, 9)","(0, 11)","(0, 13)","(0, 15)","(0, 17)","(0, 19)",...,"(50, 4250)","(50, 4252)","(50, 4254)","(51, 4255)","(51, 4257)","(51, 4259)","(51, 4261)","(51, 4263)","(flow, )",Response
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,...,,,,,,,,,1001110010101101000100100001000000110010100100...,0
6,,,,,,,,,,,...,,,,,,,,,0010000000010110000100010010001000000000001110...,0
7,1618.70,1618.70,1618.70,1618.70,1618.70,1618.70,1618.70,1618.70,1618.70,1618.70,...,,,,,,,,,1001110010101001010000100101000010110110110000...,0
9,1149.20,1149.20,1149.20,1149.20,1149.20,1149.20,1149.20,1149.20,1149.20,1149.20,...,,,,,,,,,0000000000011111000000100000000000000110110010...,0
11,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,...,,,,,,,,,1000100000000000000000000001001010110000001100...,0
13,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,1331.66,...,,,,,,,,,0100001001000000101001001000110101000001000001...,0
14,,,,,,,,,,,...,,,,,,,,,0100000001000000100000000000100101000001000001...,0
16,,,,,,,,,,,...,,,,,,,,,0100000000000000001001001000000001000000000001...,0
18,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,517.64,...,,,,,,,,,0100000001000000001000000000110001000000000000...,0
23,,,,,,,,,,,...,,,,,,,,,0000000000000000101001000000000000000000000001...,0
