In [8]:
import pandas as pd
import os
from datetime import datetime
import re
from collections import defaultdict, Counter

In [9]:
def parseTransactions(line):
    """Split one comma-separated transaction record into typed fields.

    The record is expected to hold four comma-separated fields, in order:
    staff id, ISO-style timestamp, product list, sale amount.

    Returns a 4-tuple ``(staff_id, transaction_time, products, sale_amount)``:
    ``staff_id`` is an int, ``transaction_time`` a ``datetime``, ``products``
    a list of ``(product_id, quantity)`` string pairs extracted from the third
    field, and ``sale_amount`` a float.

    Raises ``ValueError`` when the id, timestamp or amount cannot be converted
    and ``IndexError`` when the record has too few fields.
    """
    fields = line.strip().split(',')
    seller = int(fields[0])
    raw_stamp = fields[1]
    # Timestamps normally carry seconds; fall back to the minute-only form.
    try:
        moment = datetime.strptime(raw_stamp, '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        moment = datetime.strptime(raw_stamp, '%Y-%m-%dT%H:%M')

    # Every "<digits>:<digits>" pair in the third field is one product entry.
    pair_pattern = re.compile(r'(\d+):(\d+)')
    items = pair_pattern.findall(fields[2])
    amount = float(fields[3])
    return seller, moment, items, amount
#function to help with processing files and calculating metrics
def process_transactions(folder_path):
    """Aggregate every ``.txt`` transaction file in ``folder_path`` into summary metrics.

    Each non-blank line of each file is parsed with ``parseTransactions`` and
    folded into per-day, per-product, per-month/staff and per-hour totals.
    Files are visited in sorted name order so that ties between equal maxima
    (and the key order of the per-month result) do not depend on the arbitrary
    order returned by ``os.listdir``.

    Parameters:
        folder_path: directory containing the ``.txt`` transaction files.

    Returns:
        dict with the keys ``top_sales_volume_day``, ``top_sales_value_day``,
        ``most_sold_product``, ``top_sales_staff_per_month`` and
        ``highest_hour_by_avg_volume``.

    Raises:
        ValueError: if no transaction line was found in any ``.txt`` file, or
            if a line cannot be parsed by ``parseTransactions``.
        IndexError: if transactions exist but none contains a
            ``product:quantity`` pair.
    """
    daily_sales_volume = defaultdict(int)
    daily_sales_value = defaultdict(float)
    product_sales = Counter()
    monthly_sales_staff = defaultdict(lambda: defaultdict(float))
    # hour -> [summed quantity, number of transactions]; a running total and
    # count give the same average as keeping every value in a list.
    hourly_totals = defaultdict(lambda: [0, 0])

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            for line in file:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no transaction and would otherwise crash the parser.
                if not line.strip():
                    continue
                staff_id, transaction_time, products, sale_amount = parseTransactions(line)

                # Total units in this transaction, shared by the daily and hourly metrics.
                transaction_volume = sum(int(qty) for _, qty in products)

                date = transaction_time.date()                    #aggregation for daily metrics
                daily_sales_volume[date] += transaction_volume
                daily_sales_value[date] += sale_amount

                for product_id, qty in products:                  #aggregation for products
                    product_sales[product_id] += int(qty)

                month = transaction_time.strftime('%Y-%m')        #aggregation for monthly metrics
                monthly_sales_staff[month][staff_id] += sale_amount

                hour_bucket = hourly_totals[transaction_time.hour]  #aggregation for hourly metrics
                hour_bucket[0] += transaction_volume
                hour_bucket[1] += 1

    if not daily_sales_volume:
        raise ValueError(f"no transactions found in .txt files under {folder_path!r}")

    top_sales_volume_day = max(daily_sales_volume.items(), key=lambda x: x[1])    #metrics calculation
    top_sales_value_day = max(daily_sales_value.items(), key=lambda x: x[1])
    most_sold_product = product_sales.most_common(1)[0]
    top_sales_staff_per_month = {
        month: max(staff_sales.items(), key=lambda x: x[1])
        for month, staff_sales in monthly_sales_staff.items()
    }
    # Every bucket has a count of at least 1, so the division is safe.
    hourly_average = {hour: total / count for hour, (total, count) in hourly_totals.items()}
    top_hour, top_hour_average = max(hourly_average.items(), key=lambda x: x[1])

    return {
        'top_sales_volume_day': {
            'date': top_sales_volume_day[0].isoformat(),
            'total_volume': top_sales_volume_day[1]
        },
        'top_sales_value_day': {
            'date': top_sales_value_day[0].isoformat(),
            'total_value': top_sales_value_day[1]
        },
        'most_sold_product': {
            'product_id': most_sold_product[0],
            'total_quantity': most_sold_product[1]
        },
        'top_sales_staff_per_month': {
            month: {
                'staff_id': staff[0],
                'total_sales': staff[1]
            } for month, staff in top_sales_staff_per_month.items()
        },
        'highest_hour_by_avg_volume': {
            'hour': top_hour,
            'average_volume': top_hour_average
        }
    }

In [10]:
# Example usage
# The data folder can be overridden with the MONIE_DATA_DIR environment
# variable so the notebook runs on machines other than the author's; when the
# variable is unset the original location is used.
folder_path = os.environ.get('MONIE_DATA_DIR', r'C:\Users\HP\Documents\monie-hackathon\data')
results = process_transactions(folder_path)
print(results)

{'top_sales_volume_day': {'date': '2025-11-01', 'total_volume': 50972}, 'top_sales_value_day': {'date': '2025-11-01', 'total_value': 26105658.268}, 'most_sold_product': {'product_id': '469117', 'total_quantity': 9924}, 'top_sales_staff_per_month': {'2025-01': {'staff_id': 9, 'total_sales': 49261442.96800001}, '2025-02': {'staff_id': 9, 'total_sales': 44725439.46299994}, '2025-03': {'staff_id': 8, 'total_sales': 53751438.521999955}, '2025-04': {'staff_id': 8, 'total_sales': 47915445.77599999}, '2025-06': {'staff_id': 9, 'total_sales': 49567064.772000104}, '2025-07': {'staff_id': 3, 'total_sales': 56446945.54600001}, '2025-08': {'staff_id': 7, 'total_sales': 48037902.4629999}, '2025-09': {'staff_id': 8, 'total_sales': 45703479.36900009}, '2025-10': {'staff_id': 9, 'total_sales': 51121489.026}, '2025-11': {'staff_id': 5, 'total_sales': 49244695.534000084}, '2025-12': {'staff_id': 3, 'total_sales': 52325101.67700001}}, 'highest_hour_by_avg_volume': {'hour': 11, 'average_volume': 50.0018597