In [1]:
import pandas as pd
import glob
from datetime import date, timedelta
import numpy as np
from datetime import datetime
import pathlib
from pathlib import Path
from collections import OrderedDict
import polars as pl
import fastexcel
import os
import time

In [2]:
def convert_to_datetime(struct_time):
    """Build a ``datetime`` from the leading fields of a time tuple.

    Only the first six positions of ``struct_time`` are passed, in order, to
    the ``datetime`` constructor; anything after them is ignored.
    """
    leading_fields = struct_time[:6]
    return datetime(*leading_fields)

def input_data(folder_path, sheet_name=None):
    """Load every ``.xlsx`` / ``.csv`` file of a folder into one polars frame.

    Each file is read, all of its columns are cast to ``pl.String``, and two
    metadata columns are appended:

    * ``File Name``   - the file's base name.
    * ``Export Time`` - the file's last-modified time (local time, second
      precision).

    Parameters
    ----------
    folder_path : str
        Folder to scan (non-recursive).
    sheet_name : str, optional
        Forwarded to ``pl.read_excel`` for ``.xlsx`` files.

    Returns
    -------
    pl.DataFrame
        Vertical concatenation of all files, or an empty frame when the
        folder holds no matching file. ``how='vertical'`` raises if the files
        do not share the same columns.
    """
    file_paths = glob.glob(f"{folder_path}/*.xlsx") + glob.glob(f"{folder_path}/*.csv")
    df_list = []

    for file in file_paths:
        base_name = os.path.basename(file)

        # Excel keeps a "~$<name>.xlsx" lock file next to an open workbook;
        # it matches the glob but is not a readable workbook.
        if base_name.startswith("~$"):
            continue

        # Last-modified time of the file, as a naive local datetime.
        export_time = os.path.getmtime(file)
        export_time_datetime = convert_to_datetime(time.localtime(export_time))

        # glob matches case-insensitively on Windows, so compare the lowered
        # extension; otherwise "X.XLSX" would fall through both branches and
        # the previous iteration's frame would be reused for this file.
        extension = os.path.splitext(file)[1].lower()
        if extension == '.xlsx':
            df = pl.read_excel(file, sheet_name=sheet_name)
        elif extension == '.csv':
            try:
                df = pl.read_csv(file, encoding="utf-8")
            except Exception:
                # Best-effort fallback for files that are not valid UTF-8.
                # `Exception` (not a bare except) lets Ctrl-C still interrupt.
                df = pl.read_csv(file, encoding="ISO-8859-1", ignore_errors=True)
        else:
            continue

        # Normalise every data column to String so files can be stacked, then
        # tag the rows with where and when they came from.
        df = df.with_columns(pl.all().cast(pl.String))
        df = df.with_columns([
            pl.lit(base_name).alias('File Name'),
            pl.lit(export_time_datetime).alias('Export Time')
        ])

        df_list.append(df)

    if not df_list:
        return pl.DataFrame()

    return pl.concat(df_list, how='vertical')


# Current local calendar date, plus its label in '%b_%d_%Y' form
# (e.g. 'Jan_05_2025' under an English locale).
today_temp = date.today()
today = format(today_temp, '%b_%d_%Y')

In [3]:
# Candidate user-profile roots; the first one that exists on this machine wins.
first_glob_1 = "C:/Users/huuchinh.nguyen"
first_glob_2 = "C:/Users/ADMIN"

for _candidate_root in (first_glob_1, first_glob_2):
    if os.path.exists(_candidate_root):
        first_glob = _candidate_root
        break
else:
    # Neither profile folder is present: stop before any path is built.
    raise FileNotFoundError(f"Neither {first_glob_1} nor {first_glob_2} exists.")

# Input and output folders used by the rest of the notebook, keyed by role.
folder_paths = {
    "input_performance":
        f'{first_glob}/Concentrix Corporation/WFM-Expedia-HCM - Branding files/Rawdata/RETAIL_PERFORMANCE',
    "output_performance_combine":
        f'{first_glob}/Concentrix Corporation/WFM-Expedia-HCM - Branding files/Rawdata/OUTPUT_PERFORMANCE/OUTPUT_PERFORMANCE_COMBINE',
    "output_performance_compare":
        f'{first_glob}/Concentrix Corporation/WFM-Expedia-HCM - Branding files/Rawdata/OUTPUT_PERFORMANCE/OUTPUT_PERFORMANCE_COMPARE',
    "hc_extend_by_month":
        f'{first_glob}/Concentrix Corporation/WFM-Expedia-HCM - Branding files/Headcount/HC Extend by Month',
}

In [4]:
# Raw performance exports: input_data() delivers every data column as String.
PERFORMANCE_INPUT = input_data(folder_paths["input_performance"])
existing_cols = set(PERFORMANCE_INPUT.columns)

# Target dtype per column. Columns missing from the input are skipped.
columns_to_cast = {
    "NPS Raw Score": pl.Int64,
    "Submitted Date": pl.Date,
    "Started Time": pl.Datetime,
    "Joined Time": pl.Datetime,
    "Submitted Time": pl.Datetime,
    "Left Time": pl.Datetime,
    "Handle Time (Sum)": pl.Float64,
    "Hold Time (Sum)": pl.Float64,
    "Talk Time (Sum)": pl.Float64,
    "Wrap Up Time (Sum)": pl.Float64,
    "CCR15": pl.Int64,
    "CCR24": pl.Int64,
    "CCR48": pl.Int64,
    "CCR72": pl.Int64,
    "Survey Submitted (Count)": pl.Int64,
    "Response Time (Sum)": pl.Float64,
    "Response Time (Avg)": pl.Float64,
    "Concurrency": pl.Float64,
}

datetime_cols = ["Started Time", "Joined Time", "Submitted Time", "Left Time"]


def _typed_expr(col, dtype):
    """Return the expression converting String column ``col`` to ``dtype``."""
    text = pl.col(col)

    if col == "Submitted Date":
        # Unparseable values become null (strict=False).
        return text.str.strptime(pl.Date, "%m/%d/%Y", strict=False)

    if col in datetime_cols:
        # Two layouts are recognised by their prefix; anything else is null.
        looks_iso = text.str.contains(r"^\d{4}-\d{2}-\d{2}")  # ISO: YYYY-MM-DD
        looks_us = text.str.contains(r"^\d{1,2}/\d{1,2}/\d{4}")  # US: M/D/YYYY
        return (
            pl.when(looks_iso)
            .then(text.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False))
            .when(looks_us)
            .then(text.str.strptime(pl.Datetime, "%m/%d/%Y %H:%M", strict=False))
            .otherwise(None)
            .alias(col)
        )

    if dtype == pl.Float64:
        # Strip thousands separators before the numeric cast.
        return text.str.replace_all(",", "").cast(pl.Float64)

    return text.cast(dtype)


casts = [
    _typed_expr(col, dtype)
    for col, dtype in columns_to_cast.items()
    if col in existing_cols
]

PERFORMANCE_CHANGED_TYPE = PERFORMANCE_INPUT.with_columns(casts)

# Date of the join timestamp, plus the same timestamp shifted by a fixed 14 h.
# NOTE(review): a constant +14 h presumably maps US Pacific *daylight* time to
# Vietnam time; it would be off by one hour for standard-time rows - confirm
# the source time zone.
_joined_time = pl.col("Joined Time")
_joined_time_vnt = _joined_time + pl.duration(hours=14)
PERFORMANCE_NEXT_STEP = PERFORMANCE_CHANGED_TYPE.with_columns([
    _joined_time.dt.date().alias("Joined Date"),
    _joined_time_vnt.alias("Join Time (VNT)"),
    _joined_time_vnt.dt.date().alias("Join Date (VNT)"),
])

# Headcount master: one load, key column renamed, 'Date' parsed to pl.Date.
HC_MASTER_DATABASE = (
    input_data(folder_paths["hc_extend_by_month"])
    .rename({'Date Start Week': 'Week_Monday'})
    .with_columns([
        pl.col('Date').str.strptime(pl.Date, "%Y-%m-%d", strict=False)
    ])
)
print(HC_MASTER_DATABASE.columns)

# Agent attributes attached to each conversation.
_hc_lookup_columns = [
    "Date", "Email Id", "OracleID", "People ID", "IEX ID", "Employee Name", "Alias", "Designation",
    "Supervisor Name", "Wave", "LOB", 'LG Tenure', 'NL Tenure', "AON", 'Mini TL - Email', 'Mini TL - Short Name', 'Mini TL Start Date'
]
hc_master_selected = (
    HC_MASTER_DATABASE
    .select(_hc_lookup_columns)
    .unique()
    .rename({'Mini TL - Short Name': 'Mini TL'})
)

# NOTE(review): .unique() de-duplicates whole rows, not the (Date, Email Id)
# key; if an agent has two differing rows for one date, the left join repeats
# the conversation row - confirm the key is unique in the HC files.
performance_merged = PERFORMANCE_NEXT_STEP.join(
    hc_master_selected,
    left_on=["Joined Date", "Agent Email ID"],
    right_on=["Date", "Email Id"],
    how="left",
)

Could not determine dtype for column 63, falling back to string
Could not determine dtype for column 64, falling back to string
Could not determine dtype for column 65, falling back to string
Could not determine dtype for column 63, falling back to string
Could not determine dtype for column 64, falling back to string
Could not determine dtype for column 65, falling back to string
Could not determine dtype for column 46, falling back to string
Could not determine dtype for column 48, falling back to string
Could not determine dtype for column 63, falling back to string
Could not determine dtype for column 64, falling back to string
Could not determine dtype for column 65, falling back to string
Could not determine dtype for column 63, falling back to string
Could not determine dtype for column 64, falling back to string
Could not determine dtype for column 65, falling back to string
Could not determine dtype for column 48, falling back to string
Could not determine dtype for column 63,

['Year', 'Month', 'Site', 'Queue Group', 'Week Begin', 'Date Start Month', 'Date End Month', 'Week_Monday', 'Date End Week', 'Week', 'Day', 'Date', 'OracleID', 'People ID', 'IEX ID', 'Employee Name', 'Alias', 'Gender', 'Designation', 'Grade', 'LOB', 'Role', 'Multiple Chat Effective Date', 'Primary role', 'Secondary role', 'TL ID', 'Supervisor Name', 'Manager/OM Name', 'Email Id', 'Wave', 'CCT Training', 'Lodging Training', 'Lodging Nesting', 'Lodging Extended Nesting', 'CSG Joining Date', 'Lodging Production', 'NonLodging Training', 'NonLodging Nesting', 'NonLodging Extended Nesting', 'NonLodging Production', 'Production', 'LWD/Movement', 'Reason for Attrition', 'Attrition Type', 'ATT_Reason_LV3', 'Detail Status', 'AON', 'LG Tenure', 'NL Tenure', 'Concurrency', 'Status', 'LOB_2', 'LOB_3', 'Performance_Calculation', 'Stage', 'HC Open By Week', 'HC Closed By Week', 'HC Open By Month', 'HC Closed By Month', 'ATT/Movement', 'ATT', 'Movement', 'CSG', 'Mini TL - Email', 'Mini TL - Short Name

In [5]:
# Column handles reused by the derived flags below. All expressions inside one
# with_columns() call are evaluated against the incoming frame, so "LOB" here
# is still the value joined in from the HC master, even though the last
# expression of the same call overwrites the "LOB" column.
_handle_time = pl.col("Handle Time (Sum)")
_hc_lob = pl.col("LOB")
_nps_score = pl.col("NPS Raw Score")
_joined_time = pl.col("Joined Time")
_queue_group = pl.col("Agent Queue Group Name")

performance_processed = performance_merged.with_columns([
    # 1 when handle time reaches the per-LOB threshold (1500 for Lodging,
    # 2700 for Non_Lodging). Each AND pair is parenthesised explicitly rather
    # than relying on `&` binding tighter than `|`.
    pl.when(
        ((_handle_time >= 1500) & (_hc_lob == "Lodging"))
        | ((_handle_time >= 2700) & (_hc_lob == "Non_Lodging"))
    )
    .then(1)
    .otherwise(0)
    .alias("_lc"),

    pl.when(pl.col("Initiated Outbound (Yes / No)") == "Yes")
      .then(1)
      .otherwise(0)
      .alias("_aob"),

    # NPS buckets: 9-10 promoter, 0-6 detractor, 7-8 neutral.
    pl.when(_nps_score.is_in([9, 10]))
      .then(1)
      .otherwise(0)
      .alias("_promoter"),

    # The former extra "<= 6" clause was implied by the is_in() list.
    pl.when(_nps_score.is_in([0, 1, 2, 3, 4, 5, 6]))
      .then(1)
      .otherwise(0)
      .alias("_detractor"),

    pl.when(_nps_score.is_in([7, 8]))
      .then(1)
      .otherwise(0)
      .alias("_neutral"),

    pl.col("Survey Submitted (Count)").alias("_survey"),
    pl.col("CCR72").alias("_fup_72"),
    (1 - pl.col("CCR72")).alias("_rr"),

    # Calendar parts of the join timestamp.
    # NOTE(review): dt.week() is the ISO week while dt.year() is the calendar
    # year; the two can disagree around New Year - confirm this pairing.
    _joined_time.dt.date().alias("_PST.Date"),
    _joined_time.dt.strftime("%y_%m").alias("_PST.Month"),
    _joined_time.dt.week().alias("_PST.Week"),
    _joined_time.dt.year().alias("_PST.Year"),

    # Row identity: agent + conversation + join timestamp (null if any is null).
    pl.concat_str([
        pl.col("Agent Email ID").cast(pl.Utf8),
        pl.col("Conversation Id").cast(pl.Utf8),
        _joined_time.dt.strftime("%Y-%m-%d %H:%M:%S")
    ]).alias("_conver_unique"),

    # String column: "agent" for known LOBs, otherwise the string "0".
    # The fallback is an explicit string literal so both branches share one
    # dtype instead of relying on implicit int -> string coercion.
    pl.when(_hc_lob.is_in(["Non_Lodging", "Lodging"]))
      .then(pl.lit("agent"))
      .otherwise(pl.lit("0"))
      .alias("Agent"),

    pl.col("Joined Date").alias("_Date"),
    pl.col("Participant Joined Half Hour Interval").alias("Interval Joined"),

    # Tenure column matching the agent's LOB.
    pl.when(_hc_lob == "Lodging")
      .then(pl.col("LG Tenure"))
      .when(_hc_lob == "Non_Lodging")
      .then(pl.col("NL Tenure"))
      .otherwise(None)
      .alias("Tenure"),

    # Replaces the HC "LOB" with a label derived from the queue group.
    pl.when(_queue_group.is_in(["Chat_OD_EN_Lodging", "Chat_OD_EN_Car_Activity"]))
      .then(pl.lit("LG Chat"))
      .when(_queue_group == "Chat_OD_EN_Dual_GDS")
      .then(pl.lit("NL Chat"))
      .otherwise(None)
      .alias("LOB")
])

# Collapse the three NPS flags into one label; rows with no flag set are
# labelled "no_survey".
performance_processed = performance_processed.with_columns([
    pl.when(pl.col("_promoter") == 1)
      .then(pl.lit("promoter"))
    .when(pl.col("_detractor") == 1)
      .then(pl.lit("detractor"))
    .when(pl.col("_neutral") == 1)
      .then(pl.lit("neutral"))
    .otherwise(pl.lit("no_survey"))
    .alias("_nps_type")
])

print(performance_processed.columns)

# Columns kept for the exported files, in output order.
selected_columns = [
    "Export Time","File Name","Latest VA Product", "Language", "Latest VA Intent", "Conversation Id",
    "NPS Raw Score", "Interval Joined", "Answer", "Customer Engagement",
    "Agent Queue Group Name", "Started Time", "Joined Time", "Agent Email ID",
    "Requeued (Yes / No)", "Question Category", "Submitted Date", "Ended Time",
    "Left Time", "Handle Time (Sum)", "Hold Time (Sum)", "Talk Time (Sum)",
    "Wrap Up Time (Sum)", "Response Time (Sum)", "Response Time (Avg)",
    "CCR72", "_PST.Date", "_PST.Month", "_PST.Year", "_aob", "LOB",
    "_conver_unique", "_nps_type", "_promoter", "_detractor", "_neutral",
    "_survey", "_PST.Week", "_lc", "AON", "Tenure", "OracleID", "People ID",
    "IEX ID", "Employee Name", "Alias", "Designation", "Supervisor Name",
    "Wave", "_Date", "Agent","Mini TL - Email", "Mini TL", "Mini TL Start Date"
]

performance_filtered = performance_processed.select(selected_columns)
performance_filtered = performance_filtered.unique()

['Agent Vendor Location', 'Latest VA Product', 'Language', 'Queue Name', 'Channel Type', 'Latest VA Intent', 'Conversation Id', 'Initiated Outbound (Yes / No)', 'NPS Raw Score', 'Channel Address', 'Submitted Date', 'Has Followup Agent Assisted (Yes / No)', 'Answer', 'Customer Engagement', 'Agent Queue Group Name', 'Has Followup Within 72 Hours (Yes / No)', 'Started Time', 'Joined Time', 'Agent Email ID', 'Requeued (Yes / No)', 'Question Category', 'Submitted Time', 'Ended Time', 'Left Time', 'Completed (Yes/No)', 'Close Reason', 'Chat Agent First Response Time', 'Participant Joined Half Hour Interval', 'Handle Time (Sum)', 'Hold Time (Sum)', 'Talk Time (Sum)', 'Wrap Up Time (Sum)', 'Conversation Agent Disconnect', 'Conversation System Disconnect', 'Short Conversation Count', 'CCR15', 'CCR24', 'Survey Submitted (Count)', 'Response Time (Sum)', 'Response Time (Avg)', 'CCR48', 'CCR72', 'Concurrency', 'File Name', 'Export Time', 'Joined Date', 'Join Time (VNT)', 'Join Date (VNT)', 'OracleI

In [6]:
# Keep only rows from the monthly exports: file names of the form "25_MM",
# with an optional ".csv" suffix stripped before matching.
performance_unique = performance_filtered.filter(
    pl.col("File Name").str.replace(r"\.csv$", "").str.contains(r"^25_\d{2}$")
)

# Quick sanity report: which files were picked up and the time span they cover.
unique_file_names_list_1 = (performance_unique.select("File Name").unique().to_series().to_list())
joined_time_stats = performance_unique.select([
    pl.col("Joined Time").min().alias("Joined_Time_Min"),
    pl.col("Joined Time").max().alias("Joined_Time_Max"),
])
print(unique_file_names_list_1)
print(joined_time_stats)

# Drop the per-file metadata so identical conversations exported in several
# files collapse to a single row.
performance_unique = performance_unique.drop(["Export Time", "File Name"]).unique()

# One CSV per month. The key is passed as a list so iteration yields tuple keys
# on every polars version; with a bare string key, older releases yield the
# scalar itself and indexing it would take the first character of the month.
# NOTE(review): rows with a null "_PST.Month" form their own group and are
# written to "None.csv" - confirm that is intended.
for month, group in performance_unique.group_by(["_PST.Month"]):
    (month_value,) = month
    file_name = f"{month_value}.csv"
    file_path = os.path.join(folder_paths["output_performance_combine"], file_name)

    group.write_csv(file_path)

['25_04.csv', '25_06.csv', '25_05.csv', '25_02.csv', '25_03.csv']
shape: (1, 2)
┌─────────────────────┬─────────────────────┐
│ Joined_Time_Min     ┆ Joined_Time_Max     │
│ ---                 ┆ ---                 │
│ datetime[μs]        ┆ datetime[μs]        │
╞═════════════════════╪═════════════════════╡
│ 2025-02-01 00:00:22 ┆ 2025-06-16 17:59:48 │
└─────────────────────┴─────────────────────┘


In [7]:
# Snapshot exports are the files whose name contains exactly two underscores
# (e.g. "25_06_1.csv").
performance_duplicate = performance_filtered.filter(
    pl.col("File Name").str.count_matches("_").eq(2)
)

unique_file_names_list_2 = (performance_duplicate.select("File Name").unique().to_series().to_list())
print(unique_file_names_list_2)

# Build on the filtered frame. The previous version restarted from
# `performance_filtered` here, which discarded the filter above and exported
# every input file to the compare folder.
performance_duplicate = performance_duplicate.with_columns([
    pl.col("Export Time").dt.date().alias("Export Date")
])

grouped = performance_duplicate.group_by(["Export Date", "_PST.Month"])

# One CSV per (export date, month) pair, named "<YYYY-MM-DD>_<yy_mm>.csv".
for (export_date, month_value), group in grouped:
    export_date_str = export_date.strftime("%Y-%m-%d")
    file_name = f"{export_date_str}_{month_value}.csv"
    file_path = os.path.join(folder_paths["output_performance_compare"], file_name)
    group.write_csv(file_path)

['25_06_1.csv', '25_06_2.csv']
