# Diplodatos Kaggle Competition

We present this piece of code to create the baseline for the competition, and as an example of how to deal with this kind of problem. The main goals are that you:

1. Learn
1. Try different models and see which one best fits the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import xgboost as xgb
import scipy
import matplotlib.pyplot as plt

In [2]:
# Shell escapes (IPython "!" syntax) that install the packages used by this
# notebook.
# NOTE(review): this cell comes after the import cell; on a fresh runtime it
# would have to be run first for those imports to succeed.
!pip install pandas
!pip install numpy
!pip install xgboost
!pip install matplotlib
!pip install scipy
!pip install scikit-learn



In [5]:
# Colab-only cell: prompts for a file upload (the recorded output shows
# train.csv being saved); the call's return value is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [6]:
# Load the raw competition data from the /content directory.
# NOTE(review): the recorded upload output only mentions train.csv; make sure
# test.csv has been uploaded as well before running this cell.
vanilla_train_df = pd.read_csv('/content/train.csv')
vanilla_test_df = pd.read_csv('/content/test.csv')

In [24]:
# Weekday name -> ordinal number, Monday = 1 through Sunday = 7.
days = {
    name: number
    for number, name in enumerate(
        (
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
        ),
        start=1,
    )
}

# Original TripType code -> contiguous class index (0..37).  The codes are
# listed in ascending order and each code's position is its class index.
custom_trip_types = {
    code: index
    for index, code in enumerate((
        3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27,
        28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 999,
    ))
}

def get_original_trip_types(transformed_trip_types):
    """Translate class indices back to the original TripType codes.

    Args:
        transformed_trip_types: object exposing ``.map`` (e.g. a pandas
            Series) whose values are class indices from ``custom_trip_types``.

    Returns:
        The result of mapping every value through the inverse of
        ``custom_trip_types``.
    """
    index_to_code = {index: code for code, index in custom_trip_types.items()}
    return transformed_trip_types.map(index_to_code)

def transform_data(vanilla_train, vanilla_test):
    """Build one feature row per visit for the train and test data together.

    Both raw frames are concatenated before the department dummies are
    created, so the two returned tables share the same department columns.

    Args:
        vanilla_train: raw line-item frame; the columns read here are
            TripType, VisitNumber, Weekday, ScanCount and
            DepartmentDescription.
        vanilla_test: raw line-item frame with the same columns except
            TripType.

    Returns:
        ``(res_train_df, res_test_df)``, both indexed by VisitNumber, with
        columns Weekday, NumItems, Return, VisitNumber and one column per
        department.  ``res_train_df`` additionally has a TripType column
        holding the class index from ``custom_trip_types``.

    Notes:
        * The input frames are not modified.
        * NumItems is the largest single ScanCount of the visit (max, not
          sum); Return is 1.0 when any line of the visit has a negative
          ScanCount; each department column is the summed ScanCount of that
          department within the visit.
        * NOTE(review): rows are grouped by VisitNumber only, so a
          VisitNumber present in both inputs would be merged into one row.
    """
    # Work on copies so the marker column is not added to the caller's frames.
    train_df = vanilla_train.copy()
    test_df = vanilla_test.copy()
    train_df['is_train_set'] = 1
    test_df['is_train_set'] = 0

    # One label per visit, indexed by VisitNumber so that it lines up with the
    # VisitNumber-indexed feature table when assigned at the end.
    y = train_df.groupby("VisitNumber")["TripType"].max()
    y = y.map(custom_trip_types)

    # Drop TripType because test_df does not have it.
    train_df = train_df.drop("TripType", axis=1)
    data = pd.concat([train_df, test_df], ignore_index=True)

    # Department dummies weighted by ScanCount.  They are selected by name
    # rather than by column position, so extra input columns cannot shift them.
    dummies = pd.get_dummies(data["DepartmentDescription"])
    dept_counts = dummies.astype(int).mul(data["ScanCount"], axis=0)

    base = pd.DataFrame({
        "VisitNumber": data["VisitNumber"],
        "Weekday": data["Weekday"].map(days),
        "NumItems": data["ScanCount"],
        # 1.0 for returned items (negative ScanCount), 0.0 otherwise.
        "Return": (data["ScanCount"] < 0).astype(float),
        "is_train_set": data["is_train_set"],
    })
    data = pd.concat([base, dept_counts], axis=1)

    # Non-department features are aggregated with max, departments with sum.
    non_dept_features = [
        'Weekday', 'NumItems', 'Return', 'is_train_set', 'VisitNumber']
    features = non_dept_features + list(dummies.columns)
    group_methods = {
        f: ("max" if f in non_dept_features else "sum") for f in features
    }
    grouped = data.groupby("VisitNumber").agg(group_methods)
    data = grouped[features]

    # Split back into train and test using the marker column.
    is_train = data["is_train_set"] != 0
    res_train_df = data[is_train].drop(["is_train_set"], axis=1)
    res_test_df = data[~is_train].drop(["is_train_set"], axis=1)

    # Index-aligned assignment: both sides are indexed by VisitNumber.
    res_train_df['TripType'] = y

    return res_train_df, res_test_df

def transform_data_train(vanilla_train):
    """Build one labelled feature row per visit from the raw training data.

    Args:
        vanilla_train: raw line-item frame; the columns read here are
            TripType, VisitNumber, Weekday, ScanCount and
            DepartmentDescription.  The frame is not modified, so the
            function can be called repeatedly on the same input.

    Returns:
        DataFrame indexed by VisitNumber with columns TripType (class index
        from ``custom_trip_types``), Weekday, NumItems, Return and one column
        per department except '1-HR PHOTO'.

    Notes:
        * NumItems is the largest single ScanCount of the visit (max, not
          sum); Return is 1.0 when any line of the visit has a negative
          ScanCount; each department column is the summed ScanCount of that
          department within the visit.
        * '1-HR PHOTO' is left out, as before, so the feature columns match
          what ``transform_data_test`` returns.
    """
    scan_count = vanilla_train["ScanCount"]

    # Department dummies weighted by ScanCount, selected by name instead of by
    # column position so the result does not depend on the input's layout.
    dummies = pd.get_dummies(vanilla_train["DepartmentDescription"])
    dept_counts = dummies.astype(int).mul(scan_count, axis=0)

    base = pd.DataFrame({
        "TripType": vanilla_train["TripType"],
        "VisitNumber": vanilla_train["VisitNumber"],
        "Weekday": vanilla_train["Weekday"].map(days),
        "NumItems": scan_count,
        # 1.0 for returned items (negative ScanCount), 0.0 otherwise.
        "Return": (scan_count < 0).astype(float),
    })
    data = pd.concat([base, dept_counts], axis=1)

    # Non-department features are aggregated with max, departments with sum.
    non_dept_features = ['TripType', 'Weekday', 'NumItems', 'Return']
    features = non_dept_features + [
        dept for dept in dummies.columns if dept != '1-HR PHOTO'
    ]
    group_methods = {
        f: ("max" if f in non_dept_features else "sum") for f in features
    }
    grouped = data.groupby("VisitNumber").agg(group_methods)

    # Copy before assigning so the label column is written to an owned frame.
    data = grouped[features].copy()
    data["TripType"] = data["TripType"].map(custom_trip_types)

    return data

def transform_data_test(vanilla_test):
    """Build one feature row per visit from the raw (unlabelled) test data.

    Args:
        vanilla_test: raw line-item frame; the columns read here are
            VisitNumber, Weekday, ScanCount and DepartmentDescription.  The
            frame is not modified, so the function can be called repeatedly
            on the same input.

    Returns:
        DataFrame indexed by VisitNumber with columns Weekday, NumItems,
        Return and one column per department except '1-HR PHOTO'.

    Notes:
        * NumItems is the largest single ScanCount of the visit (max, not
          sum); Return is 1.0 when any line of the visit has a negative
          ScanCount; each department column is the summed ScanCount of that
          department within the visit.
        * '1-HR PHOTO' is excluded by name.  NOTE(review): the earlier
          positional slice ``iloc[:, 7:]`` presumably dropped it implicitly
          because the test file has one column fewer than the train file -
          confirm against the CSV schema.
    """
    scan_count = vanilla_test["ScanCount"]

    # Department dummies weighted by ScanCount, selected by name instead of by
    # column position so the result does not depend on the input's layout.
    dummies = pd.get_dummies(vanilla_test["DepartmentDescription"])
    dept_counts = dummies.astype(int).mul(scan_count, axis=0)

    base = pd.DataFrame({
        "VisitNumber": vanilla_test["VisitNumber"],
        "Weekday": vanilla_test["Weekday"].map(days),
        "NumItems": scan_count,
        # 1.0 for returned items (negative ScanCount), 0.0 otherwise.
        "Return": (scan_count < 0).astype(float),
    })
    data = pd.concat([base, dept_counts], axis=1)

    # Non-department features are aggregated with max, departments with sum.
    non_dept_features = ['Weekday', 'NumItems', 'Return']
    features = non_dept_features + [
        dept for dept in dummies.columns if dept != '1-HR PHOTO'
    ]
    group_methods = {
        f: ("max" if f in non_dept_features else "sum") for f in features
    }
    grouped = data.groupby("VisitNumber").agg(group_methods)

    return grouped[features]

Load the data...

In [25]:
# Build the per-visit feature tables for the labelled and the unlabelled data.
res_train_df = transform_data_train(vanilla_train_df)
res_test_df = transform_data_test(vanilla_test_df)

Create the model and evaluate it

In [26]:
# Split the labelled data into a training part (60%) and a hold-out part (40%).
# The hold-out part is used further down as the evaluation set that drives
# early stopping when the model is trained.
from sklearn.model_selection import train_test_split
# Fixed seed so the split, and therefore the reported scores, is reproducible.
train_df, test_df = train_test_split(
    res_train_df, test_size=.4, random_state=42)

In [27]:
# NOTE(review): `real_test_matrix` is reassigned to an xgb.DMatrix in a later
# cell, so this sparse-matrix assignment looks redundant.
real_test_matrix = scipy.sparse.csr_matrix(res_test_df.values)

In [28]:
# Inspect the column names of the training feature table.
res_train_df.columns

Index(['TripType', 'Weekday', 'NumItems', 'Return', 'ACCESSORIES',
       'AUTOMOTIVE', 'BAKERY', 'BATH AND SHOWER', 'BEAUTY', 'BEDDING',
       'BOOKS AND MAGAZINES', 'BOYS WEAR', 'BRAS & SHAPEWEAR',
       'CAMERAS AND SUPPLIES', 'CANDY, TOBACCO, COOKIES', 'CELEBRATION',
       'COMM BREAD', 'CONCEPT STORES', 'COOK AND DINE', 'DAIRY', 'DSD GROCERY',
       'ELECTRONICS', 'FABRICS AND CRAFTS', 'FINANCIAL SERVICES',
       'FROZEN FOODS', 'FURNITURE', 'GIRLS WEAR, 4-6X  AND 7-14',
       'GROCERY DRY GOODS', 'HARDWARE', 'HEALTH AND BEAUTY AIDS', 'HOME DECOR',
       'HOME MANAGEMENT', 'HORTICULTURE AND ACCESS',
       'HOUSEHOLD CHEMICALS/SUPP', 'HOUSEHOLD PAPER GOODS',
       'IMPULSE MERCHANDISE', 'INFANT APPAREL', 'INFANT CONSUMABLE HARDLINES',
       'JEWELRY AND SUNGLASSES', 'LADIES SOCKS', 'LADIESWEAR',
       'LARGE HOUSEHOLD GOODS', 'LAWN AND GARDEN', 'LIQUOR,WINE,BEER',
       'MEAT - FRESH & FROZEN', 'MEDIA AND GAMING', 'MENS WEAR', 'MENSWEAR',
       'OFFICE SUPPLIES', 'OPTI

In [29]:
# List the training columns that are missing from the test feature table.
[x for x in res_train_df.columns if x not in res_test_df.columns]

['TripType', 'HEALTH AND BEAUTY AIDS']

In [30]:
# Use exactly the feature columns that exist in BOTH tables, in the training
# table's order, so the three matrices always share the same column layout
# (instead of hard-coding which training-only columns to drop).
feature_columns = [
    column for column in res_train_df.columns
    if column != "TripType" and column in res_test_df.columns
]

# Training part and hold-out part of the labelled data, with class labels.
train_matrix = xgb.DMatrix(
    np.asarray(train_df[feature_columns]),
    label=np.asarray(train_df.TripType),
)
test_matrix = xgb.DMatrix(
    np.asarray(test_df[feature_columns]),
    label=np.asarray(test_df.TripType),
)
# Unlabelled competition data to predict on.
real_test_matrix = xgb.DMatrix(np.asarray(res_test_df[feature_columns]))

Setting Parameters

In [31]:
# Number of boosting rounds requested from xgb.train.
num_round = 200
# Multi-class model that outputs one probability per class (38 classes),
# evaluated with multi-class log loss.
param = dict(
    objective='multi:softprob',
    num_class=38,
    eval_metric='mlogloss',
    max_delta_step=5,
)
# Data sets whose metric is reported during training, with their display names.
watchlist = [
    (train_matrix, 'train'),
    (test_matrix, 'eval'),
]

In [32]:
# Number of columns in the training split (this still includes TripType).
len(train_df.columns)

71

Training the Model

In [33]:
# Train with the parameters above, reporting the metric for every watchlist
# entry; training stops early once the 'eval' metric has not improved for
# 3 consecutive rounds.
xgb_model = xgb.train(
    params=param,
    dtrain=train_matrix,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=3,
)

[0]	train-mlogloss:2.54171	eval-mlogloss:2.58223
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 3 rounds.
[1]	train-mlogloss:2.06187	eval-mlogloss:2.13229
[2]	train-mlogloss:1.81176	eval-mlogloss:1.90327
[3]	train-mlogloss:1.64416	eval-mlogloss:1.74989
[4]	train-mlogloss:1.51947	eval-mlogloss:1.63835
[5]	train-mlogloss:1.42127	eval-mlogloss:1.55089
[6]	train-mlogloss:1.34233	eval-mlogloss:1.48266
[7]	train-mlogloss:1.27776	eval-mlogloss:1.42736
[8]	train-mlogloss:1.22371	eval-mlogloss:1.38169
[9]	train-mlogloss:1.17774	eval-mlogloss:1.34376
[10]	train-mlogloss:1.13788	eval-mlogloss:1.31184
[11]	train-mlogloss:1.10343	eval-mlogloss:1.28493
[12]	train-mlogloss:1.07222	eval-mlogloss:1.26112
[13]	train-mlogloss:1.04533	eval-mlogloss:1.24123
[14]	train-mlogloss:1.02179	eval-mlogloss:1.22369
[15]	train-mlogloss:1.00017	eval-mlogloss:1.20832
[16]	train-mlogloss:0.980769	eval-mlogloss:1.195
[17]	train-

**And finally**, we predict the unknown label for the testing set

In [46]:
# Predict on the unlabelled data (the model's objective is 'multi:softprob').
# NOTE(review): the recorded AttributeError presumably came from running this
# while `real_test_matrix` was not an xgb.DMatrix - confirm.
test_predictions = xgb_model.predict(real_test_matrix)

AttributeError: ignored

In [35]:
# Display the prediction array.
test_predictions

array([[3.60607752e-04, 2.72978432e-05, 2.34291700e-04, ...,
        1.34299137e-02, 2.27973401e-03, 1.59524586e-02],
       [8.62195975e-06, 6.58233239e-06, 6.09846793e-05, ...,
        1.97522272e-03, 2.07171366e-02, 3.68040812e-04],
       [2.59405824e-05, 5.08079665e-06, 2.41589732e-05, ...,
        6.71098940e-04, 9.87405510e-05, 2.10534432e-03],
       ...,
       [2.63834834e-01, 3.65626802e-05, 4.38308256e-04, ...,
        2.17765337e-03, 4.67205857e-04, 1.58838555e-02],
       [5.96900122e-07, 8.44294729e-04, 3.97432502e-03, ...,
        7.89197220e-05, 7.95963686e-04, 2.56604853e-05],
       [4.80985591e-07, 8.76643637e-04, 8.28466099e-03, ...,
        9.55477444e-05, 5.38658118e-04, 1.15901785e-04]], dtype=float32)

The last thing we do is generate a file to be *submitted* on Kaggle

In [56]:
def predictions_to_csv(test_predictions, visit_numbers=None,
                       output_path="/content/submission.csv",
                       class_labels=None):
    """Turn per-class scores into a VisitNumber/TripType submission file.

    Args:
        test_predictions: 2-D array-like (or DataFrame) with one row per
            visit and one column per class index.
        visit_numbers: visit identifiers, one per prediction row.  Defaults
            to the index of the module-level ``res_test_df``.
        output_path: where the CSV is written.
        class_labels: sequence giving, for each class index, the label to
            write (position ``i`` labels column ``i``).  Defaults to the
            original TripType codes derived from ``custom_trip_types``.

    Returns:
        DataFrame with the columns VisitNumber and TripType, where TripType
        is the label of the highest-scoring column of each row.  The same
        two columns are written to ``output_path`` without the index.

    Raises:
        ValueError: if ``visit_numbers`` and ``test_predictions`` do not have
            the same number of rows.
    """
    if class_labels is None:
        # Original codes ordered by class index (inverse of custom_trip_types).
        class_labels = [
            code for code, _ in
            sorted(custom_trip_types.items(), key=lambda item: item[1])
        ]
    if visit_numbers is None:
        visit_numbers = res_test_df.index

    scores = pd.DataFrame(test_predictions)
    visit_numbers = np.asarray(visit_numbers)
    if len(visit_numbers) != len(scores):
        raise ValueError(
            "visit_numbers has %d entries but test_predictions has %d rows"
            % (len(visit_numbers), len(scores))
        )

    # Relabel column i with its class label; idxmax then yields the label
    # itself for the highest-scoring class of every row.
    scores = scores.rename(columns=dict(enumerate(class_labels)))
    submission = pd.DataFrame(
        {
            "VisitNumber": visit_numbers,
            "TripType": scores.idxmax(axis=1).to_numpy(),
        },
        index=scores.index,
    )

    submission.to_csv(output_path, index=False)

    return submission

In [57]:
# Write the submission CSV and keep the returned VisitNumber/TripType frame.
submission = predictions_to_csv(test_predictions)

In [58]:
# Display the submission table.
submission

Unnamed: 0,VisitNumber,TripType
0,7,30
1,8,26
2,15,21
3,19,42
4,23,24
...,...,...
28640,191331,9
28641,191335,32
28642,191342,8
28643,191345,39
