In [3]:
# Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import os
/Users/konstantinostsoum/Downloads/supply-chain-datasets/DataCoSupplyChainDataset.csv

In [98]:
# Get the file path from an environment variable
file_path = os.environ.get("CSV_FILE_PATH")

if file_path:
    supply_chain_data = pd.read_csv(file_path)
else:
    file_path = input("Enter the CSV file path: ")
    supply_chain_data = pd.read_csv(file_path, encoding='iso-8859-1')

Enter the CSV file path: /Users/konstantinostsoum/Downloads/supply-chain-datasets/DataCoSupplyChainDataset.csv


### Gathering information

In [99]:
# Get a first glance of the dataset
supply_chain_data.shape

(180519, 53)

In [100]:
# Columns and data types
supply_chain_data.dtypes

Type                              object
Days for shipping (real)           int64
Days for shipment (scheduled)      int64
Benefit per order                float64
Sales per customer               float64
Delivery Status                   object
Late_delivery_risk                 int64
Category Id                        int64
Category Name                     object
Customer City                     object
Customer Country                  object
Customer Email                    object
Customer Fname                    object
Customer Id                        int64
Customer Lname                    object
Customer Password                 object
Customer Segment                  object
Customer State                    object
Customer Street                   object
Customer Zipcode                 float64
Department Id                      int64
Department Name                   object
Latitude                         float64
Longitude                        float64
Market          

In [101]:
# Concatenate customer's Full name
supply_chain_data['Customer Full Name'] = supply_chain_data['Customer Fname'].astype(str)+supply_chain_data['Customer Lname'].astype(str)
# Remove irrelevant features
supply_chain_data=supply_chain_data.drop(['Customer Email','Product Status','Customer Password','Customer Street','Customer Fname','Customer Lname',
           'Product Description','Product Image','Order Zipcode','shipping date (DateOrders)'],axis=1)
supply_chain_data.shape

(180519, 44)

In [102]:
# Missing values check
supply_chain_data.isna().sum()

Type                             0
Days for shipping (real)         0
Days for shipment (scheduled)    0
Benefit per order                0
Sales per customer               0
Delivery Status                  0
Late_delivery_risk               0
Category Id                      0
Category Name                    0
Customer City                    0
Customer Country                 0
Customer Id                      0
Customer Segment                 0
Customer State                   0
Customer Zipcode                 3
Department Id                    0
Department Name                  0
Latitude                         0
Longitude                        0
Market                           0
Order City                       0
Order Country                    0
Order Customer Id                0
order date (DateOrders)          0
Order Id                         0
Order Item Cardprod Id           0
Order Item Discount              0
Order Item Discount Rate         0
Order Item Id       

Surprisingly, there are 3 (I guess random) missing values in the "Customer Zipcode" column. 


Since this refers to a zipcode, I'll try to search in what country, city, state these customers belong to as well as trying to find the zipcode using coordinates.  

In [103]:
# Show rows that contain missing values ("Customer Zipcode" column)
supply_chain_data[supply_chain_data['Customer Zipcode'].isna()][["Customer City", "Customer Country", "Customer Segment", "Customer State", "Latitude", "Longitude"]]

Unnamed: 0,Customer City,Customer Country,Customer Segment,Customer State,Latitude,Longitude
35704,CA,EE. UU.,Consumer,95758,39.134159,-123.718552
46440,CA,EE. UU.,Corporate,95758,38.408573,-121.449112
82511,CA,EE. UU.,Consumer,91732,34.071892,-118.013886


In [104]:
# Define the rows and values for imputation
rows_to_impute = [35704, 46440, 82511]
imputed_zipcode_values = [5801, 95758, 12014] 

# Impute values
for row_idx, value in zip(rows_to_impute, imputed_zipcode_values):
    supply_chain_data.at[row_idx, "Customer Zipcode"] = value
    
# Remove Latitude and Longitude as we do not need them anymore
supply_chain_data=supply_chain_data.drop(["Latitude", "Longitude"], axis =1)

In [105]:
# Transposed the dataframe due to the amount of columns we got
supply_chain_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Days for shipping (real),180519.0,3.497654,1.623722,0.0,2.0,3.0,5.0,6.0
Days for shipment (scheduled),180519.0,2.931847,1.374449,0.0,2.0,4.0,4.0,4.0
Benefit per order,180519.0,21.974989,104.433526,-4274.97998,7.0,31.52,64.800003,911.799988
Sales per customer,180519.0,183.107609,120.04367,7.49,104.379997,163.990005,247.399994,1939.98999
Late_delivery_risk,180519.0,0.548291,0.497664,0.0,0.0,1.0,1.0,1.0
Category Id,180519.0,31.851451,15.640064,2.0,18.0,29.0,45.0,76.0
Customer Id,180519.0,6691.379495,4162.918106,1.0,3258.5,6457.0,9779.0,20757.0
Customer Zipcode,180519.0,35921.159097,37542.522425,603.0,725.0,19380.0,78207.0,99205.0
Department Id,180519.0,5.44346,1.629246,2.0,4.0,5.0,7.0,12.0
Order Customer Id,180519.0,6691.379495,4162.918106,1.0,3258.5,6457.0,9779.0,20757.0


### Exploratory Data Analysis

Since we have 42 columns, it would be amazing to visualize them all (both separately and not). 

However, we have to be strategic to avoid getting lost in the details. 

In the end, we can always create a visual later if something sparks our interest or we may tailor made the dataset for different purposes.