In [1]:
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
import pandas as pd

In [2]:
class ATCDataset_v2(Dataset):
    """Per-delivery item features paired with per-delivery carton counts.

    Two CSV files are read, both keyed by a ``GroupDelivery`` column:

    * the input file, with the columns ``X``, ``Y``, ``Z``, ``Weight`` and
      ``Qty`` (one row per item line);
    * the output file, with the carton name in ``UsedCarton`` (one row per
      carton used).

    Attributes set by ``__init__``:
        df: the input CSV as read by pandas.
        out_df: the output CSV as read by pandas.
        result_vector: dict mapping every ``GroupDelivery`` of the output
            file to a float array with one count per entry of ``LABELS``.
        vectors: Series of flat lists; each list is the row-major
            concatenation of the ``FEATURE_COLUMNS`` values of one group.
        groupDelivery: Series of ``GroupDelivery`` ids aligned with
            ``vectors``.
    """

    # Carton names; the position of a name is its index in the target vector.
    LABELS = (
        "OM0000", "OM0001", "OM0002", "OM0003", "OM0004", "OM0005",
        "OM0006", "OM0007", "OM0008", "OM0009", "OM0010",
        "PAL01", "PAL02", "PAL03", "PAL04", "PAL05", "PAL06", "PAL07",
    )

    # Input columns flattened, in this order, into each feature vector.
    FEATURE_COLUMNS = ('X', 'Y', 'Z', 'Weight', 'Qty')

    def __init__(self, in_data_path, out_data_path):
        """
        Args:
            in_data_path (str): path to the input (item lines) data in csv format
            out_data_path (str): path to the output (used cartons) data in csv format

        Raises:
            ValueError: if ``UsedCarton`` contains a name that is not in ``LABELS``.
        """
        self.df = pd.read_csv(in_data_path, delimiter=",", header=0)
        self.out_df = pd.read_csv(out_data_path, delimiter=",", header=0)

        # OUTPUT DATA for the loss function: carton counts per group.
        self.result_vector = self._count_cartons(self.out_df)

        # INPUT DATA: one flat feature vector per group.
        grouped = self._vectorize_groups(self.df)
        self.vectors = grouped.Vector
        self.groupDelivery = grouped.GroupDelivery

    @classmethod
    def _count_cartons(cls, out_df):
        """Count, per ``GroupDelivery``, how often each carton of ``LABELS`` was used."""
        # Dict lookup replaces a linear ``list.index`` search for every row.
        position_of = {name: pos for pos, name in enumerate(cls.LABELS)}
        counts = {
            group: np.zeros(len(cls.LABELS))
            for group in out_df['GroupDelivery'].unique()
        }
        # Iterating the two columns directly avoids building a Series per row
        # the way ``iterrows`` does.
        for group, carton in zip(out_df['GroupDelivery'], out_df['UsedCarton']):
            # Carton names are matched ignoring surrounding blanks and case.
            name = carton.strip().upper()
            position = position_of.get(name)
            if position is None:
                raise ValueError(
                    f"unknown carton name {name!r} in column 'UsedCarton' "
                    f"for GroupDelivery {group!r}"
                )
            counts[group][position] += 1
        return counts

    @classmethod
    def _vectorize_groups(cls, df):
        """Return a frame with one row per group: ``GroupDelivery`` and ``Vector``."""

        def flatten_rows(group):
            return group.values.flatten().tolist()

        # Selecting the feature columns on the groupby keeps the grouping key
        # out of each sub-frame, so the result does not depend on the
        # ``include_groups`` keyword that only recent pandas versions accept.
        grouped = (
            df.groupby('GroupDelivery')[list(cls.FEATURE_COLUMNS)]
            .apply(flatten_rows)
            .reset_index()
        )
        grouped.columns = ['GroupDelivery', 'Vector']
        return grouped

    def __len__(self):
        """Number of delivery groups found in the input file."""
        return len(self.vectors)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair of the ``idx``-th group.

        Args:
            idx (int): position of the group (order of ``groupDelivery``).

        Returns:
            tuple(torch.Tensor, torch.Tensor): two 1-D float32 tensors.
            ``features`` holds ``len(FEATURE_COLUMNS)`` values per item row,
            so its length differs between groups; batching several samples
            therefore needs a custom ``collate_fn``. ``target`` holds one
            count per entry of ``LABELS``.

        Raises:
            IndexError: if ``idx`` is out of range.
            KeyError: if the group has no row in the output file.
        """
        group = self.groupDelivery.iloc[idx]
        features = torch.tensor(self.vectors.iloc[idx], dtype=torch.float32)
        target = torch.tensor(self.result_vector[group], dtype=torch.float32)
        return features, target

In [3]:
def create_feature_vector():
    """Placeholder without an implementation: takes no arguments, does nothing, returns None."""
    return None

In [4]:
# CSV files handed to the dataset: input file first, output file second.
TEST_IN = "data/test_in.csv"
TEST_OUT = "data/test_out.csv"

# Reads both CSV files and builds the per-group vectors.
data_test = ATCDataset_v2(in_data_path=TEST_IN, out_data_path=TEST_OUT)

In [5]:
# Show the input frame exactly as read from TEST_IN (the notebook renders a truncated view).
data_test.df

Unnamed: 0,GroupDelivery,Product,X,Y,Z,Weight,Qty
0,12012132,734634,9.0,6.4,18.8,0.660,2
1,12012132,734372,9.3,6.7,19.4,0.664,1
2,12012133,734372,9.3,6.7,19.4,0.664,2
3,12012133,734634,9.0,6.4,18.8,0.660,1
4,12012134,734634,9.0,6.4,18.8,0.660,1
...,...,...,...,...,...,...,...
133930,12049505,691595,8.0,3.1,16.5,0.276,2
133931,12049506,709274,18.5,2.1,11.2,0.082,1
133932,12049508,519998,24.7,1.7,16.0,0.088,1
133933,12049509,305990,15.0,5.3,11.5,0.120,1


In [26]:
def lm(group):
    """Sum the X, Y, Z and Weight columns of one group.

    Args:
        group (pd.DataFrame): rows of a single group; must contain the
            columns 'X', 'Y', 'Z' and 'Weight'.

    Returns:
        pd.Series: the four column totals, indexed 'X_sum', 'Y_sum',
        'Z_sum' and 'Weight_sum'.
    """
    # NOTE(review): 'Qty' is not part of the sum, so every row counts once
    # whatever its quantity -- confirm this is intended.
    column_totals = group[['X', 'Y', 'Z', 'Weight']].sum(axis=0, numeric_only=True)
    x_total, y_total, z_total, weight_total = column_totals.tolist()
    return pd.Series({
        'X_sum': x_total,
        'Y_sum': y_total,
        'Z_sum': z_total,
        'Weight_sum': weight_total,
    })
    
# One row per GroupDelivery holding the column sums computed by `lm`.
d = (
    data_test.df
    .groupby('GroupDelivery')
    .apply(func=lm, include_groups=False)
    .reset_index()
)

In [27]:
# Show the per-group sums: GroupDelivery, X_sum, Y_sum, Z_sum, Weight_sum.
d

Unnamed: 0,GroupDelivery,X_sum,Y_sum,Z_sum,Weight_sum
0,12012132,18.3,13.1,38.2,1.324
1,12012133,18.3,13.1,38.2,1.324
2,12012134,18.3,13.1,38.2,1.324
3,12012135,47.9,16.9,62.4,1.580
4,12012136,18.3,13.1,38.2,1.324
...,...,...,...,...,...
30132,12049504,15.2,3.5,16.1,0.078
30133,12049505,8.0,3.1,16.5,0.276
30134,12049506,18.5,2.1,11.2,0.082
30135,12049508,24.7,1.7,16.0,0.088
