In [1]:
# IMPORTS
import pandas as pd
import numpy as np
import random
from faker import Faker
import os

In [2]:
if not os.path.exists("./Data/transactions_data.csv"):
    # Creating fake data to work with Faker library
    fake = Faker()

    # Create a DF of fake e-commerce transactions
    num_transactions = 10000

    clients = [fake.name() for _ in range(num_transactions // 2)]
    products_names = ['Produto A', 'Produto B', 'Produto C', 'Produto D', 'Produto E', 'Produto F', None]
    categories = ['Calçados', 'Acessórios', 'Roupas', None]

    # products = [{'Product_ID': i,
    #         'Name': random.choice(products_names),
    #         'Category': random.choice(categories),
    #         'UnitPrice': round(random.uniform(10, 100), 2)} for i in range(len(products_names))]

    # Generate transactions
    data = [{'Transaction_ID': i,
            'Date': fake.date_time_this_decade(),
            'Client': random.choice(clients),
            'Product': random.choice(products_names),
            'Category': random.choice(categories),
            'Quantity': random.randint(1, 5),
            'UnitPrice': round(random.uniform(10, 100), 2)} for i in range(num_transactions)]

    transactions_data = pd.DataFrame(data)

    # Export transactions to csv 
    transactions_data.to_csv("./Data/transactions_data.csv", index=False)
    
    del(fake)
    del(num_transactions)
    del(products_names)
    del(categories)
    del(data)
    del(transactions_data)

In [3]:
transactions = pd.read_csv("./Data/transactions_data.csv")

transactions.describe(include='all')

Unnamed: 0,Transaction_ID,Date,Client,Product,Category,Quantity,UnitPrice
count,10000.0,10000,10000,8646,7494,10000.0,10000.0
unique,,10000,4222,6,3,,
top,,2023-07-19 04:46:06,Robert Jones,Produto C,Acessórios,,
freq,,1,9,1480,2521,,
mean,4999.5,,,,,2.9865,54.947379
std,2886.89568,,,,,1.401683,26.061132
min,0.0,,,,,1.0,10.01
25%,2499.75,,,,,2.0,32.37
50%,4999.5,,,,,3.0,54.805
75%,7499.25,,,,,4.0,77.6425


In [4]:
# Remove NaN values
transactions.dropna()

# Transform date column to datetime
transactions['Date'] = pd.to_datetime(transactions['Date'])

# Create Income series
transactions['Income'] = transactions['Quantity'] * transactions['UnitPrice']



In [5]:
categories = transactions.groupby('Category')

print(categories['Income'].sum())

print(categories.head())

Category
Acessórios    401790.90
Calçados      416018.06
Roupas        411626.19
Name: Income, dtype: float64
    Transaction_ID                Date                Client    Product  \
0                0 2023-07-19 04:46:06        Duane Hamilton  Produto E   
1                1 2020-05-18 02:42:17           John Butler  Produto E   
2                2 2024-01-02 07:14:55          Kari Wilkins  Produto C   
5                5 2022-12-29 07:29:48     Michael Dominguez        NaN   
6                6 2021-02-04 10:31:07  Christopher Shepherd  Produto B   
9                9 2022-12-17 05:02:36      Brenda Fernandez  Produto F   
10              10 2022-05-01 15:07:32          Wendy Parker  Produto E   
11              11 2023-05-12 13:02:56         Mathew Bryant  Produto E   
12              12 2021-07-03 18:28:43         Wendy Frazier        NaN   
13              13 2020-02-25 23:41:37        Samantha Adams  Produto E   
14              14 2022-05-11 19:43:05          Johnny Greer  Pro