In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

pd.options.display.max_columns = 50

import random
from  datetime import datetime, timedelta

import lightgbm as lgb

In [2]:
CAL_DTYPES={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
         "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "float32", 'snap_TX': 'float32', 'snap_WI': 'float32' }
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32", 
                "rm_diff_price_4":"float32", "rm_diff_price_12":"float32","rm_diff_price_50":"float32" }
PROC_PRICES_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16", 
                        "rm_diff_price_4":"float32", "rm_diff_price_12":"float32","rm_diff_price_50":"float32" }

In [3]:
h = 28 
max_lags = 70
tr_last = 1941
fday = datetime(2016,5, 23) 
fday

datetime.datetime(2016, 5, 23, 0, 0)

In [4]:
def create_dt(is_train = True, nrows = None, first_day = 1200, dept='HOBBIES_1'):
    prices = pd.read_csv("./raw_data/sell_prices.csv", dtype = PRICE_DTYPES)
    proc_price = pd.read_csv('./proc_data/prices_processed.csv', dtype = PROC_PRICES_DTYPES).drop('sell_price', axis=1)
    prices = prices.merge(proc_price, on=['store_id','item_id','wm_yr_wk'], how='left')
    del(proc_price)
    
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()
            
    cal = pd.read_csv("./raw_data/calendar.csv", dtype = CAL_DTYPES)
    proc_cal = pd.read_csv('./proc_data/processed_calendar.csv').drop(
    ['wm_yr_wk','wday','month','year','snap_CA','snap_TX','snap_WI'], axis=1).rename(columns={'day':'d'})
    cols_events_days = ['d_{}'.format(c) for c in list(np.arange(1910,1990))]
    ev1 = cal[cal['d'].isin(cols_events_days)]['event_name_1'].unique().tolist()
    ev2 = cal[cal['d'].isin(cols_events_days)]['event_name_2'].unique().tolist()
    evs = list(set(ev1+ev2))
    for c in list(set(proc_cal.columns.tolist()) - set(['d'])):
        proc_cal[c] = proc_cal[c].astype(int)
    cal = cal.merge(proc_cal, on='d', how='left')
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()
    events_to_mantain = ['event_name_1_{}'.format(c) for c in evs]+['event_name_2_{}'.format(c) for c in evs]
    events_to_mantain_ = [c for c in cal.columns if c in events_to_mantain]
    cal = cal[['date','wm_yr_wk','weekday','wday','month','year','d','event_name_1','event_type_1','event_name_2',
               'event_type_2','snap_CA','snap_TX','snap_WI','group_day']+events_to_mantain_]
    
    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols} 
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv("./raw_data/sales_train_evaluation.csv", 
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    dt = dt[dt['dept_id']==dept]
    
    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()
    
    if not is_train:
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan
    
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")
    
    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    
    return dt

In [5]:
def create_fea(dt):
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    
    
    date_features = {
        
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
#         "ime": "is_month_end",
#         "ims": "is_month_start",
    }
    
#     dt.drop(["d", "wm_yr_wk", "weekday"], axis=1, inplace = True)
    
    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [6]:
FIRST_DAY = 350

In [7]:
%%time

sub_p_total = pd.DataFrame()
for dept in ['HOBBIES_1', 'HOBBIES_2', 'HOUSEHOLD_1', 'HOUSEHOLD_2', 'FOODS_1', 'FOODS_2', 'FOODS_3']:
#for dept in ['HOBBIES_1']:

    df = create_dt(is_train=True, first_day= FIRST_DAY, dept=dept)
    print(df.shape)

    create_fea(df)
    print(df.shape)
    print(df.columns)

    for c in [c for c in df.columns.tolist() if 'rm_diff_price_' in c]:
        df[c].fillna(0, inplace=True)
    
    #df.dropna(inplace = True)
    df.shape

    cat_feats = (['item_id','store_id', 'cat_id', 'state_id'] 
                 + ["event_type_1", "event_type_2"]
                 + ['wday', 'month', 'snap_CA', 'snap_TX', 'snap_WI'])
    
    useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "dept_id", "sell_price",'event_name_1', 'event_name_2']
    train_cols = df.columns[~df.columns.isin(useless_cols)]

    #days_val = df['d'].unique().tolist()[-200:]
    days_val = random.choices(df['d'].unique().tolist(), k=500)
    X_train = df[df['d'].isin(days_val)==False][train_cols]
    y_train = df[df['d'].isin(days_val)==False]["sales"]
    X_val = df[df['d'].isin(days_val)==True][train_cols]
    y_val = df[df['d'].isin(days_val)==True]["sales"]

    train_data = lgb.Dataset(X_train, label = y_train, categorical_feature=cat_feats)
    valid_data = lgb.Dataset(X_val, label = y_val, categorical_feature=cat_feats)

    params = {
            "objective" : "poisson",
            "metric" :"poisson",
            "learning_rate" : 0.09,
            "sub_feature" : 0.9,
            "sub_row" : 0.75,
            "bagging_freq" : 1,
            "lambda_l2" : 0.1,
            'verbosity': 1,
            'num_iterations' : 2000,
            'num_leaves': 32,
            "min_data_in_leaf": 50,
    }

    m_lgb = lgb.train(params, train_data, valid_sets = [train_data, valid_data], 
                      verbose_eval=20, early_stopping_rounds=30) 
    
    feature_imp = pd.DataFrame({'Value':m_lgb.feature_importance(),'Feature':X_train.columns})
    feature_imp = feature_imp.sort_values(by='Value', ascending=False).reset_index(drop=True)
    
    display(feature_imp.head(20))

    te = create_dt(False, dept=dept)
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(day)
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        te.loc[te.date == day, "sales"] = m_lgb.predict(tst)
        del(tst)

    del(m_lgb)
    sub_p = pd.pivot_table(te, index='id', values='sales', columns='d').iloc[:,-28:].reset_index()
    del(te)
    sub_p_total = pd.concat([sub_p_total, sub_p])
    del(sub_p)

(5869396, 39)
(5869396, 48)
Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'group_day',
       'event_name_1_Cinco De Mayo', 'event_name_1_Father's day',
       'event_name_1_MemorialDay', 'event_name_1_Mother's day',
       'event_name_1_NBAFinalsEnd', 'event_name_1_NBAFinalsStart',
       'event_name_1_OrthodoxEaster', 'event_name_1_Pesach End',
       'event_name_1_Ramadan starts', 'event_name_2_Cinco De Mayo',
       'event_name_2_Father's day', 'event_name_2_OrthodoxEaster',
       'sell_price', 'rm_diff_price_1', 'rm_diff_price_2', 'rm_diff_price_4',
       'rm_diff_price_8', 'lag_7', 'lag_28', 'rmean_7_7', 'rmean_28_7',
       'rmean_7_28', 'rmean_28_28', 'week', 'quarter', 'mday'],
      dtype='object')




Training until validation scores don't improve for 30 rounds
[20]	training's poisson: 0.45416	valid_1's poisson: 0.460645
[40]	training's poisson: 0.378036	valid_1's poisson: 0.386472
[60]	training's poisson: 0.357185	valid_1's poisson: 0.366982
[80]	training's poisson: 0.349635	valid_1's poisson: 0.360501
[100]	training's poisson: 0.345468	valid_1's poisson: 0.357264
[120]	training's poisson: 0.342798	valid_1's poisson: 0.355558
[140]	training's poisson: 0.340673	valid_1's poisson: 0.354244
[160]	training's poisson: 0.338926	valid_1's poisson: 0.353234
[180]	training's poisson: 0.337443	valid_1's poisson: 0.352394
[200]	training's poisson: 0.336045	valid_1's poisson: 0.351785
[220]	training's poisson: 0.334884	valid_1's poisson: 0.351253
[240]	training's poisson: 0.333809	valid_1's poisson: 0.350721
[260]	training's poisson: 0.332834	valid_1's poisson: 0.350365
[280]	training's poisson: 0.331944	valid_1's poisson: 0.349994
[300]	training's poisson: 0.331112	valid_1's poisson: 0.349747

Unnamed: 0,Value,Feature
0,8622,item_id
1,2352,rmean_28_28
2,2126,rmean_7_28
3,2107,store_id
4,2017,month
5,1933,rmean_7_7
6,1754,rmean_28_7
7,1559,wday
8,1321,lag_7
9,1222,lag_28


2016-05-23 00:00:00
2016-05-24 00:00:00
2016-05-25 00:00:00
2016-05-26 00:00:00
2016-05-27 00:00:00
2016-05-28 00:00:00
2016-05-29 00:00:00
2016-05-30 00:00:00
2016-05-31 00:00:00
2016-06-01 00:00:00
2016-06-02 00:00:00
2016-06-03 00:00:00
2016-06-04 00:00:00
2016-06-05 00:00:00
2016-06-06 00:00:00
2016-06-07 00:00:00
2016-06-08 00:00:00
2016-06-09 00:00:00
2016-06-10 00:00:00
2016-06-11 00:00:00
2016-06-12 00:00:00
2016-06-13 00:00:00
2016-06-14 00:00:00
2016-06-15 00:00:00
2016-06-16 00:00:00
2016-06-17 00:00:00
2016-06-18 00:00:00
2016-06-19 00:00:00
(2125531, 39)
(2125531, 48)
Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'group_day',
       'event_name_1_Cinco De Mayo', 'event_name_1_Father's day',
       'event_name_1_MemorialDay', 'event_name_1_Mother's day',
       'even

Unnamed: 0,Value,Feature
0,11475,item_id
1,3623,rmean_28_28
2,3496,store_id
3,3418,rmean_7_28
4,3043,month
5,2210,rmean_7_7
6,2163,mday
7,2104,wday
8,1896,week
9,1877,rmean_28_7


2016-05-23 00:00:00
2016-05-24 00:00:00
2016-05-25 00:00:00
2016-05-26 00:00:00
2016-05-27 00:00:00
2016-05-28 00:00:00
2016-05-29 00:00:00
2016-05-30 00:00:00
2016-05-31 00:00:00
2016-06-01 00:00:00
2016-06-02 00:00:00
2016-06-03 00:00:00
2016-06-04 00:00:00
2016-06-05 00:00:00
2016-06-06 00:00:00
2016-06-07 00:00:00
2016-06-08 00:00:00
2016-06-09 00:00:00
2016-06-10 00:00:00
2016-06-11 00:00:00
2016-06-12 00:00:00
2016-06-13 00:00:00
2016-06-14 00:00:00
2016-06-15 00:00:00
2016-06-16 00:00:00
2016-06-17 00:00:00
2016-06-18 00:00:00
2016-06-19 00:00:00
(7391892, 39)
(7391892, 48)
Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'group_day',
       'event_name_1_Cinco De Mayo', 'event_name_1_Father's day',
       'event_name_1_MemorialDay', 'event_name_1_Mother's day',
       'even

Unnamed: 0,Value,Feature
0,23224,item_id
1,4412,month
2,4021,rmean_7_28
3,3986,store_id
4,3550,rmean_28_28
5,3439,rmean_7_7
6,2452,week
7,2401,rmean_28_7
8,2224,mday
9,1936,year


2016-05-23 00:00:00
2016-05-24 00:00:00
2016-05-25 00:00:00
2016-05-26 00:00:00
2016-05-27 00:00:00
2016-05-28 00:00:00
2016-05-29 00:00:00
2016-05-30 00:00:00
2016-05-31 00:00:00
2016-06-01 00:00:00
2016-06-02 00:00:00
2016-06-03 00:00:00
2016-06-04 00:00:00
2016-06-05 00:00:00
2016-06-06 00:00:00
2016-06-07 00:00:00
2016-06-08 00:00:00
2016-06-09 00:00:00
2016-06-10 00:00:00
2016-06-11 00:00:00
2016-06-12 00:00:00
2016-06-13 00:00:00
2016-06-14 00:00:00
2016-06-15 00:00:00
2016-06-16 00:00:00
2016-06-17 00:00:00
2016-06-18 00:00:00
2016-06-19 00:00:00
(7224773, 39)
(7224773, 48)
Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'group_day',
       'event_name_1_Cinco De Mayo', 'event_name_1_Father's day',
       'event_name_1_MemorialDay', 'event_name_1_Mother's day',
       'even

Unnamed: 0,Value,Feature
0,25172,item_id
1,4291,store_id
2,4276,month
3,4262,rmean_7_28
4,3382,rmean_28_28
5,2473,rmean_7_7
6,2237,week
7,1941,mday
8,1903,wday
9,1746,year


2016-05-23 00:00:00
2016-05-24 00:00:00
2016-05-25 00:00:00
2016-05-26 00:00:00
2016-05-27 00:00:00
2016-05-28 00:00:00
2016-05-29 00:00:00
2016-05-30 00:00:00
2016-05-31 00:00:00
2016-06-01 00:00:00
2016-06-02 00:00:00
2016-06-03 00:00:00
2016-06-04 00:00:00
2016-06-05 00:00:00
2016-06-06 00:00:00
2016-06-07 00:00:00
2016-06-08 00:00:00
2016-06-09 00:00:00
2016-06-10 00:00:00
2016-06-11 00:00:00
2016-06-12 00:00:00
2016-06-13 00:00:00
2016-06-14 00:00:00
2016-06-15 00:00:00
2016-06-16 00:00:00
2016-06-17 00:00:00
2016-06-18 00:00:00
2016-06-19 00:00:00
(3025995, 39)
(3025995, 48)
Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'group_day',
       'event_name_1_Cinco De Mayo', 'event_name_1_Father's day',
       'event_name_1_MemorialDay', 'event_name_1_Mother's day',
       'even

Unnamed: 0,Value,Feature
0,16364,item_id
1,4282,store_id
2,4066,month
3,3848,rmean_28_28
4,3842,rmean_7_28
5,3470,rmean_7_7
6,2513,week
7,2375,rmean_28_7
8,2088,mday
9,1708,wday


2016-05-23 00:00:00
2016-05-24 00:00:00
2016-05-25 00:00:00
2016-05-26 00:00:00
2016-05-27 00:00:00
2016-05-28 00:00:00
2016-05-29 00:00:00
2016-05-30 00:00:00
2016-05-31 00:00:00
2016-06-01 00:00:00
2016-06-02 00:00:00
2016-06-03 00:00:00
2016-06-04 00:00:00
2016-06-05 00:00:00
2016-06-06 00:00:00
2016-06-07 00:00:00
2016-06-08 00:00:00
2016-06-09 00:00:00
2016-06-10 00:00:00
2016-06-11 00:00:00
2016-06-12 00:00:00
2016-06-13 00:00:00
2016-06-14 00:00:00
2016-06-15 00:00:00
2016-06-16 00:00:00
2016-06-17 00:00:00
2016-06-18 00:00:00
2016-06-19 00:00:00
(5556345, 39)
(5556345, 48)
Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'group_day',
       'event_name_1_Cinco De Mayo', 'event_name_1_Father's day',
       'event_name_1_MemorialDay', 'event_name_1_Mother's day',
       'even

Unnamed: 0,Value,Feature
0,21670,item_id
1,4779,month
2,4203,store_id
3,4010,rmean_7_28
4,3404,rmean_28_28
5,3372,rmean_7_7
6,2732,week
7,2427,mday
8,2366,rmean_28_7
9,2349,year


2016-05-23 00:00:00
2016-05-24 00:00:00
2016-05-25 00:00:00
2016-05-26 00:00:00
2016-05-27 00:00:00
2016-05-28 00:00:00
2016-05-29 00:00:00
2016-05-30 00:00:00
2016-05-31 00:00:00
2016-06-01 00:00:00
2016-06-02 00:00:00
2016-06-03 00:00:00
2016-06-04 00:00:00
2016-06-05 00:00:00
2016-06-06 00:00:00
2016-06-07 00:00:00
2016-06-08 00:00:00
2016-06-09 00:00:00
2016-06-10 00:00:00
2016-06-11 00:00:00
2016-06-12 00:00:00
2016-06-13 00:00:00
2016-06-14 00:00:00
2016-06-15 00:00:00
2016-06-16 00:00:00
2016-06-17 00:00:00
2016-06-18 00:00:00
2016-06-19 00:00:00
(11324376, 39)
(11324376, 48)
Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'group_day',
       'event_name_1_Cinco De Mayo', 'event_name_1_Father's day',
       'event_name_1_MemorialDay', 'event_name_1_Mother's day',
       'ev

Unnamed: 0,Value,Feature
0,21287,item_id
1,4448,month
2,4078,rmean_7_7
3,3955,rmean_7_28
4,3612,rmean_28_28
5,3517,store_id
6,2701,rmean_28_7
7,2579,lag_7
8,2365,week
9,2343,mday


2016-05-23 00:00:00
2016-05-24 00:00:00
2016-05-25 00:00:00
2016-05-26 00:00:00
2016-05-27 00:00:00
2016-05-28 00:00:00
2016-05-29 00:00:00
2016-05-30 00:00:00
2016-05-31 00:00:00
2016-06-01 00:00:00
2016-06-02 00:00:00
2016-06-03 00:00:00
2016-06-04 00:00:00
2016-06-05 00:00:00
2016-06-06 00:00:00
2016-06-07 00:00:00
2016-06-08 00:00:00
2016-06-09 00:00:00
2016-06-10 00:00:00
2016-06-11 00:00:00
2016-06-12 00:00:00
2016-06-13 00:00:00
2016-06-14 00:00:00
2016-06-15 00:00:00
2016-06-16 00:00:00
2016-06-17 00:00:00
2016-06-18 00:00:00
2016-06-19 00:00:00
CPU times: user 7h 14min, sys: 30.6 s, total: 7h 14min 31s
Wall time: 2h 17min 51s


In [8]:
sub_p_total.shape

(30490, 29)

In [9]:
sub_p_total.tail()

d,id,d_1942,d_1943,d_1944,d_1945,d_1946,d_1947,d_1948,d_1949,d_1950,d_1951,d_1952,d_1953,d_1954,d_1955,d_1956,d_1957,d_1958,d_1959,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
8225,FOODS_3_827_TX_2_evaluation,0.452905,0.393299,0.117635,0.123102,0.56263,0.243517,0.342332,0.484096,0.411051,0.461842,0.456677,0.458377,0.452162,0.460642,0.412228,0.389354,0.388544,0.436008,0.458971,0.489249,0.421906,0.385017,0.306454,0.400966,0.396558,0.299902,0.349158,0.327489
8226,FOODS_3_827_TX_3_evaluation,1.531034,1.338524,1.186633,1.203981,1.610346,1.454457,1.209128,1.331635,1.114847,1.365433,1.245394,1.573512,1.695297,1.824165,1.404043,1.2706,1.331796,1.395284,1.514368,1.72466,1.50981,1.190785,0.953507,0.934842,0.916055,1.124167,1.349973,1.247266
8227,FOODS_3_827_WI_1_evaluation,3.145475,2.99561,2.530498,2.738707,3.971953,3.55074,3.033403,3.65272,2.946136,3.061416,2.921038,3.469461,4.463633,4.008035,2.821261,2.807999,2.79552,2.944696,3.633557,4.081847,3.427531,2.358791,2.091673,2.079195,2.116285,2.565604,3.120374,2.698086
8228,FOODS_3_827_WI_2_evaluation,1.802031,1.721998,1.538359,1.524689,1.903209,1.762669,1.358076,1.910003,1.690071,1.864871,2.271737,2.426931,2.404684,2.719897,2.282636,1.83564,2.013013,2.196629,1.972379,2.571123,2.168764,1.489603,1.497055,1.570503,1.292173,1.450243,1.756658,1.496431
8229,FOODS_3_827_WI_3_evaluation,1.19402,1.193381,0.914131,1.193602,1.493669,1.293505,0.91843,1.21431,0.979346,0.932469,1.214255,1.522702,1.538968,1.761719,1.315104,1.078254,1.168156,1.30583,1.231271,1.739784,1.445818,0.962981,1.013298,1.054786,0.931416,1.026663,1.381871,1.203128


In [10]:
sub = pd.read_csv('./raw_data/sample_submission.csv', usecols=['id'])

In [11]:
sub = sub.merge(sub_p_total, on='id', how='left')

In [12]:
sub = sub.dropna()

In [13]:
sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.columns = ['id'] + ['F' + str(c) for c in np.arange(1,29,1)]
sub.to_csv("./proc_data/partial_submission.csv",index=False)