# Graph Analytics: Project

/

#### 2021/2022
#### Lorenzo Pigozzi

In [1]:
# importing the libraries
import pandas as pd
import numpy as np 
import networkx as nx
import itertools
import collections
import random
import matplotlib.pyplot as plt

## 1. Importing the data

In [2]:
# importing the datasets
brazil = pd.read_csv('ACTIVITY_TABLE_BR.csv')
# germany = pd.read_csv('ACTIVITY_TABLE_DE.csv')
# portugal = pd.read_csv('ACTIVITY_TABLE_PT.csv')
# thailand = pd.read_csv('ACTIVITY_TABLE_TH.csv')

In [3]:
# to remove in future
df = brazil.copy()

In [4]:
df.head()

Unnamed: 0,ACTIVITY,TIMESTAMP,SALES_DOC,SALES_DOC_ITEM,COUNTRY
0,Create Sales Order,2021-10-04 16:22:36,50043243,10,BR
1,Create Sales Order Item,2021-10-04 16:22:36,50043243,10,BR
2,Material Availability Date passed,2021-10-04 00:00:00,50043243,10,BR
3,Create Sales Order,2021-10-04 16:22:36,50043243,20,BR
4,Create Sales Order Item,2021-10-04 16:22:36,50043243,20,BR


In [5]:
df.shape

(1837252, 5)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1837252 entries, 0 to 1837251
Data columns (total 5 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   ACTIVITY        object
 1   TIMESTAMP       object
 2   SALES_DOC       int64 
 3   SALES_DOC_ITEM  int64 
 4   COUNTRY         object
dtypes: int64(2), object(3)
memory usage: 70.1+ MB


## 2. Data exploration

In [5]:
# changing data type for SO number and Item number
df['SALES_DOC'] = df['SALES_DOC'].astype(str)
df['SALES_DOC_ITEM'] = df['SALES_DOC_ITEM'].astype(str)

In [6]:
# creating the CASE KEY, combination of Sales Order and item
df['CASE_KEY'] = df['SALES_DOC'] + df['SALES_DOC_ITEM']

In [7]:
df.head()

Unnamed: 0,ACTIVITY,TIMESTAMP,SALES_DOC,SALES_DOC_ITEM,COUNTRY,CASE_KEY
0,Create Sales Order,2021-10-04 16:22:36,50043243,10,BR,5004324310
1,Create Sales Order Item,2021-10-04 16:22:36,50043243,10,BR,5004324310
2,Material Availability Date passed,2021-10-04 00:00:00,50043243,10,BR,5004324310
3,Create Sales Order,2021-10-04 16:22:36,50043243,20,BR,5004324320
4,Create Sales Order Item,2021-10-04 16:22:36,50043243,20,BR,5004324320
5,Material Availability Date passed,2021-10-04 00:00:00,50043243,20,BR,5004324320
6,Create Sales Order,2021-10-04 16:25:41,50043244,10,BR,5004324410
7,Create Sales Order Item,2021-10-04 16:25:41,50043244,10,BR,5004324410
8,Material Availability Date passed,2021-10-04 00:00:00,50043244,10,BR,5004324410
9,Create Sales Order,2021-10-04 16:25:41,50043244,20,BR,5004324420


In [8]:
# number of distinct SO in the log
print('Number of distinct SO in the log: ', len(df['SALES_DOC'].unique()))

Number of distinct SO in the log:  90205


In [9]:
# number of distinct cases in the log
print('Number of distinct cases in the log: ', len(df['CASE_KEY'].unique()))

Number of distinct cases in the log:  365032


In [10]:
df.columns

Index(['ACTIVITY', 'TIMESTAMP', 'SALES_DOC', 'SALES_DOC_ITEM', 'COUNTRY',
       'CASE_KEY'],
      dtype='object')

In [11]:
df['ACTIVITY'].unique()

array(['Create Sales Order', 'Create Sales Order Item',
       'Material Availability Date passed', 'Record Goods Issue',
       'Create Delivery', 'Create Invoice', 'Create Picking',
       'Change Material', 'Change Price', 'Change Payment Terms',
       'Create Purchase Order', 'Create Pro forma invoice',
       'Cancel Goods Issue', 'Record Return Goods Receipt',
       'Create Returns delivery for order', 'Create Credit memo',
       'Clear Invoice', 'Change Requested Quantity',
       'Set Reason for Rejection', 'Change Inco Terms (Part 2)',
       'Create Invoice cancellation', 'Change Inco Terms (Part 1)',
       'Change Requested Goods Issue Date',
       'Change Material Availability Date', 'Customer Acceptance (L650)',
       'Change Requested Delivery Date', 'Change Confirmed Delivery Date',
       'Change Confirmed Goods Issue Date', 'Change Confirmed Quantity',
       'Set Initial Delivery Block', 'Change Item Category',
       'Remove Delivery Block', 'Change Shipping Po

## 3. Data Engineering

### 3.1. Weight calculation

In [None]:
df.info()

In [12]:
## to obtain the weights of the graph, it is necessary to engineer and make calculation for the time

# changing the date type
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], format='%Y-%m-%d %H:%M:%S')

# calculating the time difference after sorting by timestamp and grouping by CASE_KEY
df['diff'] = df.sort_values('TIMESTAMP', ascending = True).groupby('CASE_KEY')['TIMESTAMP'].diff()


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020FDA1B4970>

In [32]:
# weight_hour: getting the difference in seconds and dividing for 3600 to get the number of hours
# 3600 seconds = 1 hour
df['weight_hour'] = df['diff'].dt.total_seconds() / 3600

In [38]:
# dropping unnecessary columns
df.drop(columns=['SALES_DOC', 'SALES_DOC_ITEM', 'diff'], inplace = True)

In [39]:
df.sort_values(['CASE_KEY', 'TIMESTAMP'], ascending = True).head(50)

Unnamed: 0,ACTIVITY,TIMESTAMP,COUNTRY,CASE_KEY,weight_hour
1044006,Create Sales Order,2020-07-24 00:30:07,BR,112632797410,
1044007,Create Sales Order Item,2020-07-24 00:30:07,BR,112632797410,0.0
1044008,Material Availability Date passed,2020-10-21 00:00:00,BR,112632797410,2135.498056
1044009,Create Sales Order,2020-07-24 00:30:07,BR,112632797420,
1044016,Create Sales Order Item,2020-07-24 00:30:07,BR,112632797420,0.0
1044015,Material Availability Date passed,2020-10-21 00:00:00,BR,112632797420,2135.498056
1044010,Create Sales Order,2020-07-24 00:30:07,BR,112632797430,0.0
1044011,Create Sales Order Item,2020-07-24 00:30:07,BR,112632797430,
1044012,Material Availability Date passed,2020-10-21 00:00:00,BR,112632797430,2135.498056
1044013,Create Sales Order,2020-07-24 00:30:07,BR,112632797440,


Note: explore https://pm4py.fit.fraunhofer.de/

In [41]:
# ## From the lab5 notebook

# documents_dict= dict()
# for document in df.SALES_DOC_ITEM.unique():
#     documents_dict[document]=len(df[df.SALES_DOC_ITEM == document].ACTIVITY.unique())

# documents_dict

In [42]:
# #number of different activities per document
# import matplotlib.pyplot as plt

# plt.scatter(*zip(*documents_dict.items()));