## Train

- Input:
    - Numerical data with important features
    - Engineered time features
- Output:
    - submission
- Model:
    - XGBoost
    
Time features:
1. Together with test data
    1. ID difference to the previous and next row when sorted by date_start and ID, Time_analysis.ipynb.
    2. ID difference to the previous and next row when sorted by date_end and ID, Time_analysis.ipynb.
    3. ID difference to the previous and next row when sorted by line start time and ID, Time_analysis.ipynb.
    
2. Independent of test data
    4. Duration of whole production binned by 8 or 12 hours, e.g., Time_analysis.ipynb.
    5. The binned day of a week, hour of a week, and hour of a day for date_start and date_end, e.g., Time_analysis.ipynb
    6. Binned date_start, date_end, and duration on each station and line, e.g., station_time.ipynb and line_time.ipynb. 
        - **The bin edges should be saved for test data.**
    7. Station flows converted to a number. 
        - **This is actually a categorical feature, should make XGBoost tree deep.**
    8. Segments of production duration
    
3. Out of fold features
    9. Row distance to the previous error when sorted by ID
    10. Row distance to the previous error when sorted by date_start and ID
    11. Row distance to the previous error when sorted by date_end and ID
    12. The bayesian mean of (E) 
    13. The bayesian mean of (F) 
    14. The bayesian mean of (G)
    

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.cross_validation import cross_val_score, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sys import getsizeof
import time
import gc
import tqdm

%matplotlib inline

In [23]:
import pickle

def save_pickle(x, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)

def read_pickle(filename):
    with open(filename, 'rb') as handle:
        x = pickle.load(handle)
    return x

### Time features: Together with test data

Together with test data

- ID difference to the previous and next row when sorted by date_start and ID, Time_analysis.ipynb.
- ID difference to the previous and next row when sorted by date_end and ID, Time_analysis.ipynb.
- ID difference to the previous and next row when sorted by line start time and ID, Time_analysis.ipynb.



In [67]:
def calculate_station_time(df, bin_edges=None):
    '''
    Calculate start time, end time, and duration for parts on each station
    
    Output:
    station_time: the start, end, and duration time on each station
    station_time_binned: the binned data
    station_time_bins: contains bin edges to transform new data
    '''
    
    # list of station names
    stations = list(set([f.split('_')[1] for f in df.columns.tolist() if f!='Response']))
    # features in each station
    station_features = {s: [f for f in df.columns.tolist() if s in f] for s in stations }
    
    columns = df.columns.tolist()
    # samples in each feature
    feature_samples = {}
    print('Calculating feature samples:')
    for col in tqdm.tqdm_notebook(columns):
        feature_samples[col] = len(df.loc[df[col].notna(), col])    
    feature_samples = pd.Series(feature_samples)
    feature_samples.sort_values(ascending=False, inplace=True)
    feature_samples = feature_samples.reset_index()
    feature_samples.columns = ['feature', 'count']
    feature_samples['station'] = feature_samples['feature'].apply(lambda x: x.split('_')[1])
    
    # samples per station
    station_samples = feature_samples.groupby('station')['count'].max().sort_values(ascending=False)
    
    # start and end times and durations for each part on each station
    station_time = {}
    for s in tqdm.tqdm_notebook(stations):
        station_time[s+'_start'] = df[station_features[s]].min(axis=1)
        station_time[s+'_end'] = df[station_features[s]].max(axis=1)
        station_time[s+'_duration'] = station_time[s+'_end'] - station_time[s+'_start']
    
    # Read the useful columns, discard the rest
    station_columns = read_pickle('station_time_columns.pickle')
    station_time2 = {}
    for k in station_columns:
        station_time2[k] = station_time[k]
    station_time = station_time2
    del station_time2
    station_time = pd.DataFrame(station_time)
    
    # stores bin edges and labels for the categorical version of station_time
    station_time_bins = {}
    station_time_binned = station_time.copy()
    for f in tqdm.tqdm_notebook(station_columns):
        if not bin_edges:
            # if bins are not provided, use quantile cut
            bins = int(max(10, station_samples[f.split('_')[0]]/20000))
            station_time_binned[f], station_time_bins[f] = pd.qcut(station_time[f], retbins=True,
                q=bins, labels=False, duplicates='drop')
        else:
            # if bin edges are provided, use cut
            station_time_binned[f], station_time_bins[f] = pd.cut(station_time[f], retbins=True,
                bins=bin_edges[f], labels=False, duplicates='drop')
    
    return station_time, station_time_binned, station_time_bins

# Test
#df = pd.read_csv('data/train_date.csv.zip', index_col=0, nrows=1000)
#station_time, station_time_binned, station_time_bins = calculate_station_time(df)
#df = pd.read_csv('data/test_date.csv.zip', index_col=0, nrows=1000)
#station_time, station_time_binned, station_time_bins = calculate_station_time(df, bin_edges=station_time_bins)


In [4]:
# Load date data
x_train_date = pd.read_csv('data/train_date.csv.zip', index_col=0, dtype=np.float16)
x_test_date = pd.read_csv('data/test_date.csv.zip', index_col=0, dtype=np.float16)
print('Memory usage of x_date is {:.3f}G.'.format(x_train_date.memory_usage(deep=True).sum()*10**-9))
print('Memory usage of x_date is {:.3f}G.'.format(x_test_date.memory_usage(deep=True).sum()*10**-9))

  mask |= (ar1 == a)


Memory usage of x_date is 2.746G.
Memory usage of x_date is 2.746G.


In [5]:
# Concatenate train and test data together
x_all_date = pd.concat([x_train_date, x_test_date])

In [6]:
# Start and end time of parts
# first create a series, then add it to the dataframe, this is more memory efficient
date_start = x_all_date.min(axis=1).values
x_all_date['date_start'] = date_start
date_end = x_all_date.max(axis=1).values
x_all_date['date_end'] = date_end

del date_start, date_end
gc.collect()

7

The ID distance to the previous and next row when sorted by ID 

In [7]:
x_all_date.head()

Unnamed: 0_level_0,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,L0_S0_D19,...,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263,date_start,date_end
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4.0,82.25,82.25,82.25,82.25,82.25,82.25,82.25,82.25,82.25,82.25,...,,,,,,,,,82.25,87.3125
6.0,,,,,,,,,,,...,,,,,,,,,1313.0,1316.0
7.0,1619.0,1619.0,1619.0,1619.0,1619.0,1619.0,1619.0,1619.0,1619.0,1619.0,...,,,,,,,,,1619.0,1624.0
9.0,1149.0,1149.0,1149.0,1149.0,1149.0,1149.0,1149.0,1149.0,1149.0,1149.0,...,,,,,,,,,1149.0,1154.0
11.0,602.5,602.5,602.5,602.5,602.5,602.5,602.5,602.5,602.5,602.5,...,,,,,,,,,602.5,606.0


In [8]:
# Extract ID information
x_all_date.reset_index(inplace=True)

In [None]:
# ID difference for only train set when sorted by ID
diff1 = x_all_date['Id'].diff().fillna(9999999)
x_all_date['diff1'] = x_all_date['Id'].diff().fillna(9999999)
diff2 = x_all_date['Id'].diff(-1).fillna(9999999)
x_all_date['diff2'] = diff2

# ID difference for both train and test when sorted by time and ID
x_all_date.sort_values(by=['date_start', 'Id'], inplace=True)
diff3 = x_all_date['Id'].diff().fillna(9999999)
x_all_date['diff3'] = diff3
diff4 = x_all_date['Id'].diff(-1).fillna(9999999)
x_all_date['diff4'] = diff4

In [5]:
# Load numerical data
x_train_num = pd.read_csv('data/train_numeric.csv.zip', index_col=0, dtype=np.float16)
y_train = x_train_num['Response']
y_train = y_train.astype(int)
x_train_num.drop('Response', axis=1, inplace=True)

n_train, n_num = x_train_num.shape

# Sample numerical data, obtain feature importance
idx = np.random.randint(0, n_train, 200000)

x_sample = x_train_num.iloc[idx].values
y_sample = y_train.iloc[idx].values

# Train XGBoost
clf = XGBClassifier(base_score=0.0058, max_depth=6, n_jobs=6)
clf.fit(x_sample, y_sample, verbose=True)

# Plot feature importance
important_indices = np.where(clf.feature_importances_>0.001)[0]
plt.plot(sorted(clf.feature_importances_))
plt.plot(0.001*np.ones(len(clf.feature_importances_)))

# We have selected 242 important features
important_numerical_features = x_train_num.columns[important_indices]
print(important_numerical_features)
print(len(important_numerical_features))

# Save names of the important features.
important_numerical_features = pd.DataFrame(important_numerical_features)
important_numerical_features.to_csv('important_numerical_features_samples_2e5.csv')

# Remove other features for the time being to save memory.
x_train_num.drop([c for c in x_train_num.columns.values if c not in important_numerical_features.values.reshape(-1,)], 
                axis=1, inplace=True)

print('Memory usage of x_num is {:.3f}G.'.format(x_train_num.memory_usage(deep=True).sum()*10**-9))