# Import packages and load the raw dataset

In [10]:
import pandas as pd
import numpy as np

In [11]:
# Load the DataCo supply-chain CSV; the file is decoded as ISO-8859-1 (Latin-1).
supplyChain = pd.read_csv(
    "data/DataCoSupplyChainDataset.csv",
    encoding="ISO-8859-1",
)

# Data Cleaning and Preprocessing

In [12]:
# Discard the columns listed below; `drop` returns a new frame, so the raw
# `supplyChain` frame is left untouched.
supplyChain_clean = supplyChain.drop(
    labels=[
        'Days for shipping (real)',
        'Delivery Status',
        'Late_delivery_risk',
        'shipping date (DateOrders)',
        'Benefit per order',
        'Sales per customer',
        'Category Id',
        'Order Profit Per Order',
        'Order Item Discount',
        'Order Item Total',
        'Order Status',
        'Customer Email',
        'Customer Password',
        'Latitude',
        'Longitude',
        'Product Description',
        'Product Image',
        'Customer Fname',
        'Customer Id',
        'Customer Lname',
        'Department Id',
        'Order Customer Id',
        'Order Item Cardprod Id',
        'Order Item Id',
        'Product Card Id',
        'Product Category Id',
        'Order Id',
        'Customer Street',
        'Customer Zipcode',
        'Order Zipcode',
        'Order Item Product Price',
        'Product Price',
        'Order Item Profit Ratio',
        'Product Status',
    ],
    axis="columns",
)

In [13]:
# Show which column labels remain in supplyChain_clean after the drop.
supplyChain_clean.columns

Index(['Type', 'Days for shipment (scheduled)', 'Category Name',
       'Customer City', 'Customer Country', 'Customer Segment',
       'Customer State', 'Department Name', 'Market', 'Order City',
       'Order Country', 'order date (DateOrders)', 'Order Item Discount Rate',
       'Order Item Quantity', 'Sales', 'Order Region', 'Order State',
       'Product Name', 'Shipping Mode'],
      dtype='object')

In [14]:
# Names of the columns labelled as categorical. 'Year' and 'Month' are listed
# here even though they are only added to the frame by a later cell.
categorical_cols = [
    'Type',
    'Category Name',
    'Customer City',
    'Customer Country',
    'Customer Segment',
    'Customer State',
    'Department Name',
    'Market',
    'Order City',
    'Order Country',
    'Order Region',
    'Order State',
    'Product Name',
    'Shipping Mode',
    'Year',
    'Month',
]

In [15]:
# Date and Time Features
# Parse the order timestamp once, derive Month and Year from it, order the rows
# chronologically, then discard the raw timestamp column. Written as a single
# non-mutating chain (no inplace=True) so the timestamp is not parsed twice
# and no frame is modified behind another cell's back.
# NOTE: this cell consumes 'order date (DateOrders)'; re-running it without
# first re-running the column-drop cell raises a KeyError.
supplyChain_clean = (
    supplyChain_clean
    .assign(**{
        'order date (DateOrders)':
            lambda df: pd.to_datetime(df['order date (DateOrders)']),
    })
    .assign(
        Month=lambda df: df['order date (DateOrders)'].dt.month,
        Year=lambda df: df['order date (DateOrders)'].dt.year,
    )
    .sort_values(by='order date (DateOrders)')
    .drop(columns=['order date (DateOrders)'])
)

In [16]:
import pandas as pd

# Group the rows by every descriptive column plus the order Year and Month.
grouped_data = supplyChain_clean.groupby(by=[
    'Type',
    'Category Name',
    'Customer City',
    'Customer Country',
    'Customer Segment',
    'Customer State',
    'Department Name',
    'Market',
    'Order City',
    'Order Country',
    'Order Region',
    'Order State',
    'Product Name',
    'Shipping Mode',
    'Year',
    'Month',
])

# Per group: total the sales and item quantities, average the scheduled
# shipping days and the discount rate. reset_index() turns the group keys back
# into ordinary columns so the result is a flat frame.
aggregated_data = grouped_data.agg({
    'Sales': 'sum',
    'Days for shipment (scheduled)': 'mean',
    'Order Item Discount Rate': 'mean',
    'Order Item Quantity': 'sum',
}).reset_index()

In [17]:
# Display the aggregated frame: one row per observed combination of the
# grouping keys, with the four aggregated measures as the trailing columns.
aggregated_data

Unnamed: 0,Type,Category Name,Customer City,Customer Country,Customer Segment,Customer State,Department Name,Market,Order City,Order Country,Order Region,Order State,Product Name,Shipping Mode,Year,Month,Sales,Days for shipment (scheduled),Order Item Discount Rate,Order Item Quantity
0,CASH,Accessories,Algonquin,EE. UU.,Consumer,IL,Outdoors,LATAM,Santa Catarina,México,Central America,Nuevo León,Team Golf San Francisco Giants Putter Grip,Standard Class,2017,4,99.959999,4.0,0.12,4
1,CASH,Accessories,Amarillo,EE. UU.,Corporate,TX,Outdoors,USCA,Bloomington,Estados Unidos,US Center,Illinois,Team Golf Tennessee Volunteers Putter Grip,Standard Class,2016,8,49.980000,4.0,0.13,2
2,CASH,Accessories,Arecibo,Puerto Rico,Corporate,PR,Outdoors,Pacific Asia,Singapur,Singapur,Southeast Asia,Singapur,Team Golf St. Louis Cardinals Putter Grip,Standard Class,2015,11,99.959999,4.0,0.25,4
3,CASH,Accessories,Arlington,EE. UU.,Consumer,VA,Outdoors,Africa,Maradi,Níger,West Africa,Maradi,Team Golf Tennessee Volunteers Putter Grip,Standard Class,2016,10,24.990000,4.0,0.13,1
4,CASH,Accessories,Aurora,EE. UU.,Consumer,CO,Outdoors,Europe,Offenburg,Alemania,Western Europe,Baden-Wurtemberg,Team Golf St. Louis Cardinals Putter Grip,Second Class,2015,8,74.970001,2.0,0.05,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157560,TRANSFER,Women's Golf Clubs,Salem,EE. UU.,Corporate,OR,Outdoors,Europe,Pertuis,Francia,Western Europe,Provenza-Alpes-Costa Azul,MDGolf Pittsburgh Penguins Putter,Standard Class,2017,8,319.959992,4.0,0.09,4
157561,TRANSFER,Women's Golf Clubs,San Jose,EE. UU.,Consumer,CA,Outdoors,Europe,Munster,Alemania,Western Europe,Baja Sajonia,TaylorMade White Smoke IN-12 Putter,Second Class,2017,7,299.970001,2.0,0.12,3
157562,TRANSFER,Women's Golf Clubs,San Marcos,EE. UU.,Corporate,TX,Outdoors,Europe,Trier,Alemania,Western Europe,Renania-Palatinado,MDGolf Pittsburgh Penguins Putter,First Class,2017,8,319.959992,1.0,0.17,4
157563,TRANSFER,Women's Golf Clubs,Warren,EE. UU.,Corporate,MI,Outdoors,Europe,Bochum,Alemania,Western Europe,Renania del Norte-Westfalia,Cleveland Golf Collegiate My Custom Wedge 588,Second Class,2017,7,209.990005,2.0,0.10,1


In [18]:
# Rebind supplyChain_clean to the aggregated frame. This is an alias, not a
# copy: both names now refer to the same DataFrame object.
supplyChain_clean = aggregated_data

In [19]:
# Persist the aggregated frame to disk, leaving the row index out of the file.
supplyChain_clean.to_csv(path_or_buf="DataCo_cleaned.csv", index=False)

In [14]:
# Take the first 2000 rows (by position) as a smaller working sample.
supplyChain_clean_2000 = supplyChain_clean.head(2000)

In [15]:
# Save the 2000-row sample under its own file name. Writing it to
# "DataCo_cleaned.csv" would silently overwrite the full cleaned export with a
# truncated subset.
supplyChain_clean_2000.to_csv("DataCo_cleaned_2000.csv", index=False)

In [16]:
# Display the 2000-row sample.
# NOTE(review): the saved output under this cell lists 'Day of Week' and
# 'Week of Year' columns and a non-sequential row index, which the active code
# in this notebook does not produce, and this cell's execution count is out of
# sequence with the cells before it. The output looks stale; restart the
# kernel and run all cells to refresh it.
supplyChain_clean_2000

Unnamed: 0,Type,Days for shipment (scheduled),Category Name,Customer City,Customer Country,Customer Segment,Customer State,Department Name,Market,Order City,...,Order Item Quantity,Sales,Order Region,Order State,Product Name,Shipping Mode,Day of Week,Month,Year,Week of Year
33833,CASH,4,Camping & Hiking,Hickory,EE. UU.,Consumer,NC,Fan Shop,LATAM,Mexico City,...,1,299.980011,Central America,Distrito Federal,Diamondback Women's Serene Classic Comfort Bi,Standard Class,3,1,2015,1
77011,PAYMENT,4,Water Sports,Chicago,EE. UU.,Consumer,IL,Fan Shop,LATAM,Dos Quebradas,...,1,199.990005,South America,Risaralda,Pelican Sunstream 100 Kayak,Standard Class,3,1,2015,1
109322,PAYMENT,4,Women's Apparel,Chicago,EE. UU.,Consumer,IL,Golf,LATAM,Dos Quebradas,...,5,250.000000,South America,Risaralda,Nike Men's Dri-FIT Victory Golf Polo,Standard Class,3,1,2015,1
87884,PAYMENT,4,Men's Footwear,Chicago,EE. UU.,Consumer,IL,Apparel,LATAM,Dos Quebradas,...,1,129.990005,South America,Risaralda,Nike Men's CJ Elite 2 TD Football Cleat,Standard Class,3,1,2015,1
114915,CASH,4,Indoor/Outdoor Games,San Antonio,EE. UU.,Home Office,TX,Fan Shop,LATAM,Dos Quebradas,...,4,199.919998,South America,Risaralda,O'Brien Men's Neoprene Life Vest,Standard Class,3,1,2015,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57298,DEBIT,4,Cleats,Caguas,Puerto Rico,Consumer,PR,Apparel,LATAM,Las Tunas,...,4,239.960007,Caribbean,Las Tunas,Perfect Fitness Perfect Rip Deck,Standard Class,0,1,2015,3
22197,DEBIT,4,Electronics,Caguas,Puerto Rico,Consumer,PR,Footwear,LATAM,Las Tunas,...,2,55.980000,Caribbean,Las Tunas,Under Armour Kids' Mercenary Slide,Standard Class,0,1,2015,3
20562,DEBIT,4,Camping & Hiking,Caguas,Puerto Rico,Consumer,PR,Fan Shop,LATAM,Las Tunas,...,1,299.980011,Caribbean,Las Tunas,Diamondback Women's Serene Classic Comfort Bi,Standard Class,0,1,2015,3
58105,DEBIT,4,Shop By Sport,Caguas,Puerto Rico,Consumer,PR,Golf,LATAM,Las Tunas,...,5,199.949997,Caribbean,Las Tunas,Under Armour Girls' Toddler Spine Surge Runni,Standard Class,0,1,2015,3


# Rewrite the preprocessing steps as a scikit-learn Pipeline

In [22]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer for reading data
class DataReader(BaseEstimator, TransformerMixin):
    """Pipeline entry step that loads a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    encoding : str, default 'ISO-8859-1'
        Text encoding passed to ``pd.read_csv``. The default matches the
        previously hard-coded value, so existing callers are unaffected.
    """

    def __init__(self, filename, encoding='ISO-8859-1'):
        # Constructor arguments are stored unmodified under the same names so
        # that BaseEstimator.get_params / set_params / clone can find them.
        self.filename = filename
        self.encoding = encoding

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Read the CSV at ``self.filename`` and return it as a DataFrame.

        ``X`` is ignored: this step is a data source, which is why the
        pipeline can be started with ``None`` as its input.
        """
        return pd.read_csv(self.filename, encoding=self.encoding)

# Custom transformer for cleaning data
class DataCleaner(BaseEstimator, TransformerMixin):
    """Drop a fixed set of columns and replace the order timestamp with Year/Month."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is not modified.

        The columns listed below are removed, 'order date (DateOrders)' is
        parsed to datetime, 'Year' and 'Month' are derived from it, the rows
        are sorted by that timestamp, and the timestamp column is then dropped.
        """
        date_col = 'order date (DateOrders)'
        unused_columns = [
            'Days for shipping (real)',
            'Delivery Status',
            'Late_delivery_risk',
            'shipping date (DateOrders)',
            'Benefit per order',
            'Sales per customer',
            'Category Id',
            'Order Profit Per Order',
            'Order Item Discount',
            'Order Item Total',
            'Order Status',
            'Customer Email',
            'Customer Password',
            'Latitude',
            'Longitude',
            'Product Description',
            'Product Image',
            'Customer Fname',
            'Customer Id',
            'Customer Lname',
            'Department Id',
            'Order Customer Id',
            'Order Item Cardprod Id',
            'Order Item Id',
            'Product Card Id',
            'Product Category Id',
            'Order Id',
            'Customer Street',
            'Customer Zipcode',
            'Order Zipcode',
            'Order Item Product Price',
            'Product Price',
            'Order Item Profit Ratio',
            'Product Status',
        ]

        trimmed = X.drop(labels=unused_columns, axis="columns")
        order_dates = pd.to_datetime(trimmed[date_col])

        # Year is added before Month, so the new columns appear in that order.
        dated = trimmed.assign(**{
            date_col: order_dates,
            'Year': order_dates.dt.year,
            'Month': order_dates.dt.month,
        })
        return dated.sort_values(by=date_col).drop(columns=[date_col])

# Custom transformer for aggregating data
class DataAggregator(BaseEstimator, TransformerMixin):
    """Collapse rows to one record per combination of the grouping columns."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Group ``X`` by the key columns and return a flat aggregated frame.

        Sales and item quantity are summed per group; scheduled shipping days
        and discount rate are averaged. The group keys come back as ordinary
        columns because the index is reset.
        """
        group_keys = [
            'Type',
            'Category Name',
            'Customer City',
            'Customer Country',
            'Customer Segment',
            'Customer State',
            'Department Name',
            'Market',
            'Order City',
            'Order Country',
            'Order Region',
            'Order State',
            'Product Name',
            'Shipping Mode',
            'Year',
            'Month',
        ]
        measures = {
            'Sales': 'sum',
            'Days for shipment (scheduled)': 'mean',
            'Order Item Discount Rate': 'mean',
            'Order Item Quantity': 'sum',
        }
        per_group = X.groupby(by=group_keys).agg(measures)
        return per_group.reset_index()

# Custom transformer for saving data
class DataSaver(BaseEstimator, TransformerMixin):
    """Pass-through step that writes the incoming frame to a CSV file.

    Parameters
    ----------
    filename : str or path-like
        Destination handed to ``DataFrame.to_csv``.
    """

    def __init__(self, filename):
        self.filename = filename

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Write ``X`` to ``self.filename`` without the index and return ``X`` unchanged."""
        X.to_csv(path_or_buf=self.filename, index=False)
        return X

# Chain the four stages: read the raw CSV, clean it, aggregate it, write it out.
pipeline = Pipeline(steps=[
    ("read_data", DataReader(filename="data/DataCoSupplyChainDataset.csv")),
    ("clean_data", DataCleaner()),
    ("aggregate_data", DataAggregator()),
    ("save_data", DataSaver(filename="DataCo_cleaned.csv")),
])

# Run every stage. The input is None because the first stage ignores its
# argument and loads the data from disk itself. The returned frame is the
# cell's displayed output.
pipeline.fit_transform(None)


Unnamed: 0,Type,Category Name,Customer City,Customer Country,Customer Segment,Customer State,Department Name,Market,Order City,Order Country,Order Region,Order State,Product Name,Shipping Mode,Year,Month,Sales,Days for shipment (scheduled),Order Item Discount Rate,Order Item Quantity
0,CASH,Accessories,Algonquin,EE. UU.,Consumer,IL,Outdoors,LATAM,Santa Catarina,México,Central America,Nuevo León,Team Golf San Francisco Giants Putter Grip,Standard Class,2017,4,99.959999,4.0,0.12,4
1,CASH,Accessories,Amarillo,EE. UU.,Corporate,TX,Outdoors,USCA,Bloomington,Estados Unidos,US Center,Illinois,Team Golf Tennessee Volunteers Putter Grip,Standard Class,2016,8,49.980000,4.0,0.13,2
2,CASH,Accessories,Arecibo,Puerto Rico,Corporate,PR,Outdoors,Pacific Asia,Singapur,Singapur,Southeast Asia,Singapur,Team Golf St. Louis Cardinals Putter Grip,Standard Class,2015,11,99.959999,4.0,0.25,4
3,CASH,Accessories,Arlington,EE. UU.,Consumer,VA,Outdoors,Africa,Maradi,Níger,West Africa,Maradi,Team Golf Tennessee Volunteers Putter Grip,Standard Class,2016,10,24.990000,4.0,0.13,1
4,CASH,Accessories,Aurora,EE. UU.,Consumer,CO,Outdoors,Europe,Offenburg,Alemania,Western Europe,Baden-Wurtemberg,Team Golf St. Louis Cardinals Putter Grip,Second Class,2015,8,74.970001,2.0,0.05,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157560,TRANSFER,Women's Golf Clubs,Salem,EE. UU.,Corporate,OR,Outdoors,Europe,Pertuis,Francia,Western Europe,Provenza-Alpes-Costa Azul,MDGolf Pittsburgh Penguins Putter,Standard Class,2017,8,319.959992,4.0,0.09,4
157561,TRANSFER,Women's Golf Clubs,San Jose,EE. UU.,Consumer,CA,Outdoors,Europe,Munster,Alemania,Western Europe,Baja Sajonia,TaylorMade White Smoke IN-12 Putter,Second Class,2017,7,299.970001,2.0,0.12,3
157562,TRANSFER,Women's Golf Clubs,San Marcos,EE. UU.,Corporate,TX,Outdoors,Europe,Trier,Alemania,Western Europe,Renania-Palatinado,MDGolf Pittsburgh Penguins Putter,First Class,2017,8,319.959992,1.0,0.17,4
157563,TRANSFER,Women's Golf Clubs,Warren,EE. UU.,Corporate,MI,Outdoors,Europe,Bochum,Alemania,Western Europe,Renania del Norte-Westfalia,Cleveland Golf Collegiate My Custom Wedge 588,Second Class,2017,7,209.990005,2.0,0.10,1
