In [123]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import random
import time
from datetime import datetime
from numpy.random import choice
from faker import Faker

In [37]:
df = pd.DataFrame(columns=['OrderID','Startdate', 'Enddate',
                           'Priority', 'Status', 'ActLaborHrs',
                           'Group', 'SubGroup', 'Responsedays', 'Responsehours'])
df.head()

Unnamed: 0,OrderID,Startdate,Enddate,Priority,Status,ActLaborHrs,Group,SubGroup,Responsedays,Responsehours


### Generate Orders by Timestamp

#### All generated orders follow business days (Mon-Fri) and hours (8am-6pm)

In [38]:
def randomDate(start, end):
    '''
    input: setup start and end date 
    output: one random date between the range(start, end) and follows business days/ hours
    '''
    from datetime import datetime
    frmt = '%Y-%m-%d %H:%M:%S'
    busday = False 
    start_time = time.mktime(time.strptime(start, frmt))
    end_time = time.mktime(time.strptime(end, frmt))
    
    while busday == False:
        random_time = start_time + random.random() * (end_time - start_time)
        dt = datetime.fromtimestamp(time.mktime(time.localtime(random_time)))
        busday = businessday(dt)
    return dt

In [62]:
def businessday(dt):
    '''
    input: any 'time' data
    output: a boolean value shows if the input follow the business days/ hours
    '''
    if dt.weekday() == 5 or dt.weekday() == 6: #Mon - Fri
        return False
    else:
        if dt.hour <= 8 or dt.hour >= 18: #8am - 6pm
            return False
        else:
            return True

In [40]:
def get_start_date(start, end, amount):
    '''
    This function conclude above two
    input: setup start and end date with form(%Y-%m-%d %H:%M:%S) ; how many random date you need
    output: a list of random dates between the range(start, end) and follow business days/ hours
    '''
    start_date_list = []
    for i in range(0,amount):
        start_date = randomDate(start, end)
        start_date_list.append(str(start_date))
    return start_date_list

In [41]:
def get_end_date(start, duration):
    '''
    input: one or a list of order start date
    output: a list of order end date by random 1 ~ duration
    '''
    frmt = '%Y-%m-%d %H:%M:%S'
    end_date_list = []
    
    for i in range(0, len(start)):
        busday = False
        start_time = time.mktime(time.strptime(start[i], frmt))
        while busday == False:
            random_diff = start_time + random.randrange(1, duration, 6) * 3600 #by hours
            dt = datetime.fromtimestamp(time.mktime(time.localtime(random_diff)))
            busday = businessday(dt)
        end_date_list.append(str(dt))
    return end_date_list 

#### Oder Startdate:

In [42]:
random_start = get_start_date('2018-01-01 00:00:00', '2019-12-31 00:00:00', 20000)
df['Startdate'] = random_start

#### Order EndDate:

In [43]:
df['Enddate'] = get_end_date(random_start, 100)

In [44]:
df.head(5)

Unnamed: 0,OrderID,Startdate,Enddate,Priority,Status,ActLaborHrs,Group,SubGroup,Responsedays,Responsehours
0,,2019-04-25 12:49:25,2019-04-29 13:49:25,,,,,,,
1,,2019-10-04 14:36:17,2019-10-04 15:36:17,,,,,,,
2,,2019-12-09 16:58:34,2019-12-12 11:58:34,,,,,,,
3,,2019-12-27 16:24:07,2019-12-31 11:24:07,,,,,,,
4,,2019-01-08 13:59:11,2019-01-11 14:59:11,,,,,,,


In [72]:
df['Startdate'].tail(5)

19995    2018-02-23 13:32:02
19996    2019-06-19 10:27:38
19997    2018-08-02 13:27:56
19998    2018-12-18 09:51:22
19999    2019-03-14 11:02:01
Name: Startdate, dtype: object

### Priority

In [76]:
df['Priority'] = np.random.choice([1, 2, 11]) #df[0]

In [77]:
df['Priority'].value_counts()

2    20000
Name: Priority, dtype: int64

In [93]:
dfupdate = df[df['Priority'] == 2 ].sample(2973)

In [94]:
dfupdate['Priority'] = 11
df.update(dfupdate)

In [95]:
df['Priority'].value_counts()


11.0    17838
2.0      2162
Name: Priority, dtype: int64

### ActLaborHrs

In [137]:
df['ActLaborHrs'] = np.random.choice([ 0, 1, 2, 3])

In [138]:
dfupdate=df.sample(12973)
dfupdate['ActLaborHrs'] = np.random.uniform(0,10)
df.update(dfupdate)

In [139]:
dfupdate=df.sample(1000)
dfupdate['ActLaborHrs'] = np.random.uniform(10,600)
df.update(dfupdate)

In [140]:
dfupdate=df.sample(100)
dfupdate['ActLaborHrs'] = np.random.uniform(100,4000)
df.update(dfupdate)

In [141]:
dfupdate=df.sample(3000)
dfupdate['ActLaborHrs'] = 0.5
df.update(dfupdate)

In [142]:
df['ActLaborHrs'].describe()

count    20000.000000
mean        21.536267
std         89.462689
min          0.000000
25%          0.000000
50%          7.056641
75%          7.056641
max        968.829141
Name: ActLaborHrs, dtype: float64

In [143]:
df['ActLaborHrs'].head(5)

0    0.500000
1    7.056641
2    0.000000
3    0.000000
4    0.000000
Name: ActLaborHrs, dtype: float64

### OrderID

In [145]:
dfN = df

In [146]:
dfN['OrderID'] = random.sample(range(100000,700000), 20000)

In [147]:
import random
from random import randint

In [148]:
dfN['OrderID'].count()

20000

In [149]:
dfN['OrderID'].duplicated().sum()

0

In [150]:
dfN['OrderID'] = 'WO' + dfN['OrderID'].astype(str)

In [151]:
dfN['OrderID'] .head(20)

0     WO644462
1     WO135407
2     WO618754
3     WO593796
4     WO681395
5     WO383647
6     WO679171
7     WO324891
8     WO533851
9     WO132321
10    WO116671
11    WO537004
12    WO186654
13    WO204799
14    WO368510
15    WO442504
16    WO208634
17    WO213519
18    WO565281
19    WO199312
Name: OrderID, dtype: object

In [152]:
dfN 

Unnamed: 0,OrderID,Startdate,Enddate,Priority,Status,ActLaborHrs,Group,SubGroup,Responsedays,Responsehours,Unnamed: 11
0,WO644462,2019-04-25 12:49:25,2019-04-29 13:49:25,11.0,Complete,0.500000,Engineering,,,,EHS
1,WO135407,2019-10-04 14:36:17,2019-10-04 15:36:17,11.0,Complete,7.056641,HVAC,,,,EHS
2,WO618754,2019-12-09 16:58:34,2019-12-12 11:58:34,11.0,Onhold,0.000000,Engineering,,,,EHS
3,WO593796,2019-12-27 16:24:07,2019-12-31 11:24:07,11.0,Complete,0.000000,Locksmith,,,,EHS
4,WO681395,2019-01-08 13:59:11,2019-01-11 14:59:11,11.0,Complete,0.000000,Engineering,,,,EHS
...,...,...,...,...,...,...,...,...,...,...,...
19995,WO433559,2018-02-23 13:32:02,2018-02-23 14:32:02,11.0,Complete,7.056641,Engineering,,,,EHS
19996,WO437397,2019-06-19 10:27:38,2019-06-21 17:27:38,11.0,Complete,7.056641,Cleaning,,,,EHS
19997,WO488684,2018-08-02 13:27:56,2018-08-02 14:27:56,2.0,Complete,7.056641,Engineering,,,,EHS
19998,WO197340,2018-12-18 09:51:22,2018-12-21 10:51:22,11.0,Complete,327.869059,Building Maintenance,,,,EHS


### Group&SubGroup

In [153]:
def get_employee_names(people):
    l = []
    for i in range(0, people):
        n = fake.name()
        l.append(n)
    return l

In [154]:
group_list = ['EHS', 'Engineering', 'HVAC', 'Electrical',
              'Fire safety', 'Plumber', 'Locksmith', 'CallCenter', 'Landscape service',
              'Security', 'Cleaning', 'Energy', 'Building Maintenance',
              'IT', 'Managment']
group_weight = [0.005, 0.15, 0.13, 0.03,
                0.09, 0.08, 0.10, 0.08, 0.003,
                0.03, 0.09, 0.07, 0.133,
                0.005, 0.004]

In [155]:
sum(group_weight)

1.0

In [156]:
len(group_list)

15

In [157]:
dfN['Group'] =  choice(group_list, 20000, p = group_weight)

In [158]:
dfN['Group'].value_counts()

Engineering             3062
Building Maintenance    2678
HVAC                    2561
Locksmith               1965
Fire safety             1817
Cleaning                1793
CallCenter              1647
Plumber                 1611
Energy                  1335
Security                 595
Electrical               586
IT                       100
EHS                       96
Managment                 93
Landscape service         61
Name: Group, dtype: int64

In [159]:
#dfN.to_csv('fake_data1.csv')

In [160]:
dfN

Unnamed: 0,OrderID,Startdate,Enddate,Priority,Status,ActLaborHrs,Group,SubGroup,Responsedays,Responsehours,Unnamed: 11
0,WO644462,2019-04-25 12:49:25,2019-04-29 13:49:25,11.0,Complete,0.500000,Plumber,,,,EHS
1,WO135407,2019-10-04 14:36:17,2019-10-04 15:36:17,11.0,Complete,7.056641,Plumber,,,,EHS
2,WO618754,2019-12-09 16:58:34,2019-12-12 11:58:34,11.0,Onhold,0.000000,CallCenter,,,,EHS
3,WO593796,2019-12-27 16:24:07,2019-12-31 11:24:07,11.0,Complete,0.000000,Engineering,,,,EHS
4,WO681395,2019-01-08 13:59:11,2019-01-11 14:59:11,11.0,Complete,0.000000,Cleaning,,,,EHS
...,...,...,...,...,...,...,...,...,...,...,...
19995,WO433559,2018-02-23 13:32:02,2018-02-23 14:32:02,11.0,Complete,7.056641,Locksmith,,,,EHS
19996,WO437397,2019-06-19 10:27:38,2019-06-21 17:27:38,11.0,Complete,7.056641,Fire safety,,,,EHS
19997,WO488684,2018-08-02 13:27:56,2018-08-02 14:27:56,2.0,Complete,7.056641,Building Maintenance,,,,EHS
19998,WO197340,2018-12-18 09:51:22,2018-12-21 10:51:22,11.0,Complete,327.869059,Security,,,,EHS


### Status

In [161]:
status_list = ['Cancel', 'Inprogress', 'Complete', 'Onhold']
status_weight = [0.06, 0.15, 0.74, 0.05]

In [162]:
sum(status_weight)

1.0

In [163]:
dfN['Status'] = choice(status_list, 20000, p = status_weight)

In [164]:
dfN['Status'].value_counts()

Complete      14765
Inprogress     3044
Cancel         1173
Onhold         1018
Name: Status, dtype: int64

In [165]:
dfN.head(10)

Unnamed: 0,OrderID,Startdate,Enddate,Priority,Status,ActLaborHrs,Group,SubGroup,Responsedays,Responsehours,Unnamed: 11
0,WO644462,2019-04-25 12:49:25,2019-04-29 13:49:25,11.0,Complete,0.5,Plumber,,,,EHS
1,WO135407,2019-10-04 14:36:17,2019-10-04 15:36:17,11.0,Onhold,7.056641,Plumber,,,,EHS
2,WO618754,2019-12-09 16:58:34,2019-12-12 11:58:34,11.0,Complete,0.0,CallCenter,,,,EHS
3,WO593796,2019-12-27 16:24:07,2019-12-31 11:24:07,11.0,Complete,0.0,Engineering,,,,EHS
4,WO681395,2019-01-08 13:59:11,2019-01-11 14:59:11,11.0,Complete,0.0,Cleaning,,,,EHS
5,WO383647,2018-04-30 13:28:41,2018-05-01 14:28:41,11.0,Onhold,0.0,Building Maintenance,,,,EHS
6,WO679171,2018-08-13 11:30:59,2018-08-14 12:30:59,11.0,Complete,0.0,Locksmith,,,,EHS
7,WO324891,2019-05-30 16:46:02,2019-05-30 17:46:02,11.0,Complete,7.056641,CallCenter,,,,EHS
8,WO533851,2018-08-03 10:08:29,2018-08-07 11:08:29,11.0,Complete,7.056641,HVAC,,,,EHS
9,WO132321,2018-09-21 11:02:07,2018-09-25 12:02:07,11.0,Complete,0.5,Building Maintenance,,,,EHS
