In [117]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from datetime import datetime
cur_dir = Path(os.getcwd())
print(cur_dir)



# Target directory
target_dir = cur_dir.parent / 'data_science_folder/cleaned'

print(target_dir)

/Users/lucazosso/Desktop/IE_Course/weclomeback/welcomeback_dev/evaluation
/Users/lucazosso/Desktop/IE_Course/weclomeback/welcomeback_dev/data_science_folder/cleaned


# Evaluation of RAG System

In [118]:
hourly_orders_vanilla = pd.read_csv(os.path.join(target_dir, 'hourly_orders_detailed_vanilla.csv'))
rfm = pd.read_csv(os.path.join(target_dir, 'rfm_data.csv'))

results = {}

In [119]:
hourly_orders_vanilla.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25019 entries, 0 to 25018
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   date                           25019 non-null  object 
 1   order_id                       25019 non-null  int64  
 2   postcode                       24997 non-null  object 
 3   day_of_week                    25019 non-null  object 
 4   tot_orders                     25019 non-null  int64  
 5   tot_gross_revenue              25019 non-null  float64
 6   tot_cogs                       24905 non-null  float64
 7   tot_orders_marketplace         25019 non-null  int64  
 8   tot_orders_int                 25019 non-null  int64  
 9   tot_orders_b2b                 25019 non-null  int64  
 10  tot_gross_revenue_mrktpl       25019 non-null  float64
 11  tot_gross_revenue_int          25019 non-null  float64
 12  tot_gross_revenue_b2b          25019 non-null 

In [120]:
# Parse date to a datetime object
hourly_orders_vanilla['date'] = pd.to_datetime(hourly_orders_vanilla['date'])

In [121]:
# extract date and month from datetime
hourly_orders_vanilla['date'] = pd.to_datetime(hourly_orders_vanilla['date']).dt.date
hourly_orders_vanilla['month'] = pd.to_datetime(hourly_orders_vanilla['date']).dt.month
hourly_orders_vanilla['year'] = pd.to_datetime(hourly_orders_vanilla['date']).dt.year

In [122]:
# RFM info
rfm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9781 entries, 0 to 9780
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      9781 non-null   int64  
 1   recency          9781 non-null   int64  
 2   frequency        9781 non-null   float64
 3   monetary         9781 non-null   float64
 4   recency_score    9781 non-null   int64  
 5   frequency_score  9781 non-null   int64  
 6   monetary_score   9781 non-null   int64  
 7   RFM_Score        9781 non-null   int64  
 8   RFM_Segment      9781 non-null   object 
dtypes: float64(2), int64(6), object(1)
memory usage: 687.9+ KB


In [123]:
questions = [
    "Retrieve the total gross revenue for each day of the week",

    "What are my number of unique customers placing orders each month?",

    "What are my average order value for each type of order channel?",

    "List the top 5 zip codes by total number of orders",

    "List my average order value per month",

    "What is the average gross revenue by customer segments?",

    "What is the correaltion between frequency and monetary scores?",

    "What are the total number of orders placed during weekends versus weekdays?",

    "What is the percentage of total revenue made by B2B orders?",

    "Give me a summary of the last complete month's data across all order channels."
]

In [124]:
# put the questions as keys in the dictionary
for question in questions:
    results[question] = None
    
print(results.keys())

dict_keys(['Retrieve the total gross revenue for each day of the week', 'What are my number of unique customers placing orders each month?', 'What are my average order value for each type of order channel?', 'List the top 5 zip codes by total number of orders', 'List my average order value per month', 'What is the average gross revenue by customer segments?', 'What is the correaltion between frequency and monetary scores?', 'What are the total number of orders placed during weekends versus weekdays?', 'What is the percentage of total revenue made by B2B orders?', "Give me a summary of the last complete month's data across all order channels."])


__Q1: Total gross revenue for each day of the week:__

In [125]:
results[questions[0]] = hourly_orders_vanilla.groupby('day_of_week')['tot_gross_revenue'].sum()




__Q2: Number of unique customers placing orders each month:__

In [126]:
results[questions[1]] = hourly_orders_vanilla.groupby('month')['customer_id'].nunique()

__Q3: Average order value for each type of order channel:__

In [127]:
results[questions[2]]=hourly_orders_vanilla[['tot_gross_revenue_mrktpl', 'tot_gross_revenue_int', 'tot_gross_revenue_b2b']].mean()

__Q4: Top 5 zip codes by total number of orders:__

In [128]:
results[questions[3]]=hourly_orders_vanilla.groupby('postcode')['tot_orders'].sum().nlargest(5)

__Q5: Monthly Trends in Average Order Value (AOV)__

In [129]:
results[questions[4]] = hourly_orders_vanilla.groupby('month')['AOV'].mean()


__Q6: Average gross revenue by customer segment:__

In [130]:
results[questions[5]]=rfm.groupby('RFM_Segment')['monetary'].mean()

__Q7: Correlation between frequency and monetary scores:__

In [131]:
results[questions[6]]=rfm[['frequency_score', 'monetary_score']].corr().iloc[0, 1]

__Q8: Orders placed during weekends versus weekdays:__

In [132]:
hourly_orders_vanilla['type'] = hourly_orders_vanilla['day_of_week'].apply(lambda x: 'Weekend' if x in ['Saturday', 'Sunday'] else 'Weekday')
hourly_orders_vanilla.groupby('type').size()
results[questions[7]] = hourly_orders_vanilla.groupby('type').size()

__Q9: Percentage of total revenue from B2B orders:__

In [133]:
total_b2b_revenue = hourly_orders_vanilla['tot_gross_revenue_b2b'].sum()
total_revenue = hourly_orders_vanilla['tot_gross_revenue'].sum()
percentage_b2b = (total_b2b_revenue / total_revenue) * 100
print(percentage_b2b)
results[questions[8]] = percentage_b2b

0.16100577098289884


__Q10: Summary of the Latest Complete Month’s Data Across All Order Channels__

In [134]:
hourly_orders_2023 = hourly_orders_vanilla[hourly_orders_vanilla['year'] == 2023]
last_complete_month = hourly_orders_2023['month'].max()

latest_month_data = hourly_orders_2023[hourly_orders_2023['month'] == last_complete_month]

summary_last_month = latest_month_data.agg({
    'tot_orders_marketplace': 'sum',
    'tot_orders_int': 'sum',
    'tot_orders_b2b': 'sum',
    'tot_gross_revenue_mrktpl': 'sum',
    'tot_gross_revenue_int': 'sum',
    'tot_gross_revenue_b2b': 'sum'
})

summary_last_month

results[questions[9]] = summary_last_month


# Evaluation

In [139]:
# store results in a .txt file

with open('answers.txt', 'w') as f:
    for question, answer in results.items():
        f.write(f'{question}\n')
        f.write(f'{answer}\n\n')

print('Results have been saved to results.txt')

Results have been saved to results.txt


# Q&A 2

In [153]:
results_2 = {
}

questions_2 = [
    "Total Revenue for the Year",
    "Number of Orders per Customer Segment",
    "Monthly Change in Average Order Value (AOV)",
    "Top 10 Highest Grossing Customers and Their Last Order Date",
    "Year-over-Year Growth Rate by Order Type",
    "Complex Customer Lifetime Value Analysis"
]

for question in questions_2:
    results_2[f'Question:{question}'] = None
    
results_2


{'Question:Total Revenue for the Year': None,
 'Question:Number of Orders per Customer Segment': None,
 'Question:Monthly Change in Average Order Value (AOV)': None,
 'Question:Top 10 Highest Grossing Customers and Their Last Order Date': None,
 'Question:Year-over-Year Growth Rate by Order Type': None,
 'Question:Complex Customer Lifetime Value Analysis': None}

In [154]:
total_revenue = hourly_orders_vanilla['tot_gross_revenue'].sum()
results_2[questions_2[0]] = f'Results:{total_revenue}'

In [155]:
orders_per_segment = rfm['RFM_Segment'].value_counts()
results_2[questions_2[1]] = f'Results:{orders_per_segment}'

In [156]:
monthly_aov = hourly_orders_vanilla.groupby(hourly_orders_vanilla['month'])['AOV'].mean()
results_2[questions_2[2]] = f'Results:{monthly_aov}'

In [157]:
top_customers = hourly_orders_vanilla.groupby('customer_id').agg({
       'tot_gross_revenue': 'sum',
       'date': 'max'
   }).nlargest(10, 'tot_gross_revenue')

results_2[questions_2[3]] = f'Results:{top_customers}'

In [158]:
yearly_data = hourly_orders_vanilla.groupby(hourly_orders_vanilla['year']).agg({
       'tot_orders_marketplace': 'sum',
       'tot_orders_int': 'sum'
   })
yearly_data['marketplace_growth'] = yearly_data['tot_orders_marketplace'].pct_change() * 100
yearly_data['direct_growth'] = yearly_data['tot_orders_int'].pct_change() * 100

results_2[questions_2[4]] = f'Results:{yearly_data}'

In [160]:
# Calculate total revenue per customer
revenue_per_customer = hourly_orders_vanilla.groupby('customer_id')['tot_gross_revenue'].sum().reset_index()

# Merge with RFM DataFrame
revenue_per_customer = revenue_per_customer.merge(rfm[['customer_id', 'RFM_Segment']], on='customer_id')

# Calculate average CLV per RFM Segment
average_clv_per_segment = revenue_per_customer.groupby('RFM_Segment')['tot_gross_revenue'].mean()

results_2[questions_2[5]] = f'Results:{average_clv_per_segment}'

# Evaluation 2

In [161]:

with open('answers_2.txt', 'w') as f:
    for question, answer in results.items():
        f.write(f'{question}\n')
        f.write(f'{answer}\n\n')

print('Results have been saved to results.txt')

Results have been saved to results.txt
