# Create some Fake Data

This data can be used to validate the logic before real customer data is available.

In [2]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta

from faker import Faker

Create files in the local file system in CSV format, just for reference, as we will write them into Snowflake tables later.


In [1]:
def generate_app_logs(n_users, n_days_ago, random_state=0, faker_state=0):
    """Generate synthetic login events and the matching employee metadata.

    Args:
        n_users: Number of fake employees to generate.
        n_days_ago: Size, in days, of the window used to draw random day
            offsets (last working day and login days) counted back from today.
        random_state: Seed passed to ``np.random.seed`` (drives leavers,
            number of logins and login days).
        faker_state: Seed passed to ``Faker.seed`` (drives e-mails,
            departments, divisions and titles).

    Returns:
        A tuple ``(df_app, df_employee)``:
        ``df_app`` has one row per login with columns ``session_user`` and
        ``snapshot_datetime``; ``df_employee`` has one row per employee with
        columns ``session_user``, ``department``, ``division``, ``title`` and
        ``last_day_of_work``.
    """
    np.random.seed(random_state)
    fake = Faker()
    Faker.seed(faker_state)
    today = pd.Timestamp(datetime.now().date())

    employee_list = []
    app_list = []

    for _ in range(n_users):
        user_email = fake.ascii_company_email()
        department = fake.company()
        division = fake.job()
        title = fake.job()

        # About 5% of the employees have already left the company.
        if np.random.uniform(0, 100) < 5:
            # Days elapsed since the employee's last working day.
            last_day_work_days = np.random.uniform(0, n_days_ago)
            last_day_of_work = today - timedelta(days=last_day_work_days)
        else:
            last_day_work_days = 0
            # Still employed: set the last day to the future for now.
            last_day_of_work = today + timedelta(days=365)

        # Anywhere between never logging in and roughly one login per day
        # of the period before the employee left.
        n_logins = int(np.random.uniform(0, n_days_ago - last_day_work_days))

        for _ in range(n_logins):
            day = np.random.uniform(0, n_days_ago)  # random day offset

            # Push about 25% of the logins 50 days further back, so some
            # users end up with no login in the last 30 days.
            extra_days = 50 if np.random.uniform(0, 100) < 25 else 0

            # Offsetting by last_day_work_days keeps every login on or
            # before the employee's last working day.
            login_day = today - timedelta(
                days=day + last_day_work_days + extra_days
            )
            app_list.append([user_email, login_day])

        employee_list.append(
            [user_email, department, division, title, last_day_of_work]
        )

    df_employee = pd.DataFrame(
        employee_list,
        columns=['session_user', 'department', 'division', 'title',
                 'last_day_of_work'],
    )
    df_app = pd.DataFrame(app_list, columns=['session_user', 'snapshot_datetime'])

    return df_app, df_employee
        
        

In [4]:

# First synthetic login log, plus the employee metadata generated with it.
df_app_1, df_employee = generate_app_logs(
    n_users=5000,
    n_days_ago=365,
    random_state=6,
    faker_state=4,
)

In [5]:
# Save the first login log as the sample Okta log file (row index omitted).
df_app_1.to_csv(
    '../../data/sample_okta_logs.csv',
    index=False,
)

In [6]:
# Save the employee metadata next to the log files (row index omitted).
df_employee.to_csv(
    '../../data/sample_employee_metadata.csv',
    index=False,
)

In [7]:
# Second synthetic login log. faker_state matches the first call while
# random_state differs; the employee frame returned here (k) is not used.
df_app_2, k = generate_app_logs(
    n_users=5000,
    n_days_ago=365,
    random_state=19,
    faker_state=4,
)

In [8]:
# Save the second login log as the sample application log file.
df_app_2.to_csv(
    '../../data/sample_app_logs.csv',
    index=False,
)

In [9]:
def generate_working_days(n_days_ago=365, random_state=None):
    """Build a synthetic calendar flagging each day as working or not.

    Args:
        n_days_ago: Number of calendar days to generate, starting today and
            going backwards one day at a time.
        random_state: Optional seed passed to ``np.random.seed`` so the
            calendar is reproducible. When ``None`` (the default) the global
            NumPy random state is used as-is and is not reseeded.

    Returns:
        A DataFrame with one row per day and columns ``snapshot_datetime``
        (the calendar day) and ``work_day`` (1 for a working day, 0 for a
        holiday), ordered from today backwards.
    """
    if random_state is not None:
        np.random.seed(random_state)

    today = pd.Timestamp(datetime.now().date())

    calendar_rows = []
    for offset in range(n_days_ago):
        calendar_day = today - timedelta(days=offset)
        # About 20% of the days are randomly marked as holidays.
        is_working_day = 0 if np.random.uniform(0, 100) < 20 else 1
        calendar_rows.append([calendar_day, is_working_day])

    return pd.DataFrame(calendar_rows, columns=['snapshot_datetime', 'work_day'])

In [10]:
# Build one year of synthetic working-day calendar data.
df_cal = generate_working_days(n_days_ago=365)

In [11]:
# Show the generated calendar DataFrame as the cell output.
df_cal

Unnamed: 0,snapshot_datetime,work_day
0,2023-08-24,1
1,2023-08-23,1
2,2023-08-22,1
3,2023-08-21,1
4,2023-08-20,1
...,...,...
360,2022-08-29,1
361,2022-08-28,1
362,2022-08-27,0
363,2022-08-26,1


In [12]:
# Save the synthetic working-day calendar (row index omitted).
df_cal.to_csv(
    '../../data/sample_work_days.csv',
    index=False,
)

Generate tables

In [13]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T

In [14]:
# Read the connection parameters from the local credentials file.
with open('../../config/creds.json', 'r') as creds_file:
    conn_param = json.load(creds_file)

# Open a Snowpark session with those parameters.
session = (
    Session.builder
    .configs(conn_param)
    .create()
)

In [15]:
# (Re)create the target database and schema; "create or replace" replaces
# any existing object with the same name.
for statement in (
    "create or replace database dev_snowpatrol",
    "create or replace schema main",
):
    session.sql(statement).collect()

# Point the session at the database and schema just created.
session.use_database("dev_snowpatrol")
session.use_schema("dev_snowpatrol.main")

In [16]:
table_name = 'SAMPLE_OKTA_LOGS'

# Reload the sample Okta logs and upper-case the column names.
df = pd.read_csv('../../data/sample_okta_logs.csv')
df.columns = [column.upper() for column in df.columns]

# Upload the frame, creating the table if it does not exist yet.
session.write_pandas(
    df,
    table_name,
    auto_create_table=True,
)

<snowflake.snowpark.table.Table at 0x1d8e09d33d0>

In [17]:
table_name = 'SAMPLE_WORK_DAYS'

# Reload the working-day calendar and upper-case the column names.
df = pd.read_csv('../../data/sample_work_days.csv')
df.columns = [column.upper() for column in df.columns]

# Upload the frame, creating the table if it does not exist yet.
session.write_pandas(
    df,
    table_name,
    auto_create_table=True,
)

<snowflake.snowpark.table.Table at 0x1d8e8fc5420>

In [18]:
table_name = 'SAMPLE_APP_LOGS'

# Reload the sample application logs and upper-case the column names.
df = pd.read_csv('../../data/sample_app_logs.csv')
df.columns = [column.upper() for column in df.columns]

# Upload the frame, creating the table if it does not exist yet.
session.write_pandas(
    df,
    table_name,
    auto_create_table=True,
)

<snowflake.snowpark.table.Table at 0x1d8e09d3cd0>

In [19]:
table_name = 'SAMPLE_EMPLOYEE_METADATA'

# Reload the employee metadata and upper-case the column names.
df = pd.read_csv('../../data/sample_employee_metadata.csv')
df.columns = [column.upper() for column in df.columns]

# Upload the frame, creating the table if it does not exist yet.
session.write_pandas(
    df,
    table_name,
    auto_create_table=True,
)

<snowflake.snowpark.table.Table at 0x1d8e1f43e20>