In [4]:
import pandas as pd 
import os 
import seaborn as sns 
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker

## Loading Data & quick cleaning: 

In [2]:
file_path = r'D:\This PC\Documents\Data_Analysis\HUJI_course\projects_and_datasets\final_project\coffee_shop_sales_data\modified_coffee_shop_sales.csv' 

stores_df = pd.read_csv(file_path) 

stores_df['transaction_date'] = pd.to_datetime(stores_df['transaction_date'])
stores_df['transaction_time'] = pd.to_datetime(stores_df['transaction_time'], format='%H:%M:%S').dt.time

stores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149116 entries, 0 to 149115
Data columns (total 18 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   transaction_id     149116 non-null  int64         
 1   transaction_date   149116 non-null  datetime64[ns]
 2   transaction_time   149116 non-null  object        
 3   transaction_qty    149116 non-null  int64         
 4   store_id           149116 non-null  int64         
 5   store_location     149116 non-null  object        
 6   product_id         149116 non-null  int64         
 7   unit_price         149116 non-null  float64       
 8   product_category   149116 non-null  object        
 9   product_type       149116 non-null  object        
 10  product_detail     149116 non-null  object        
 11  revenue            149116 non-null  float64       
 12  time_range         149116 non-null  object        
 13  day_name           149116 non-null  object  

# 

In [None]:
def plot_data(dframe, group_col, metric, value_col=None):
    
    """
    metric: 'revenue', 'frequency', or 'volume'
    group_col: the column on X-axis (e.g., 'transaction_month', 'store_location')
    value_col: the column to sum (not needed for 'transactions')
    """
    
    # creating dict for visual styles and labels configuration
    configs = {
        'revenue': {'palette': 'flare', 'label': 'Total Revenue ($USD)', 'title_part': 'Total Revenue'. 'prefix': '$'},
        'volume' : {'palette': 'mako', 'label': 'Units Sold', 'title_part': 'Total Units Sold', 'prefix': ''},
        'frequency': {'palette': 'crest', 'label': 'Purchase Frequency', 'title_part': 'Transaction Count', 'prefix': ''}
    }
    
    # accessing the configurations based on selected metric
    config = configs[metric]
    
    # date aggregation
    
    if metric == 'frequency':
        plot_data = dframe[group_col].values_count().reset_index()
        plot_data.columns = [group_col, 'metric_value']
        
    else: 
        plot_data = dframe.groupby(group_col)[value_col].sum().reset_index()
        plot_data.columns = [group_col, 'metric_value']
        
        
    # handling a special case for plotting top 10 items
    if plot_data[group_col].nunique() >= 20: 
        plot_data = plot_data.sort_values(by='metric_value', ascending=False).head(10)
        
        
            
    # handling the chronological order
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    
    # checking if the values in the days/months list is present in the dframe
    # returns true if at least one match is found. false if no match at all
    is_month = any(m in plot_data[group_col].values for m in months)
    is_day = any(d in plot_data[group_col].values for d in days)
    
    # deciding the sorting (chronological vs value based)
    if is_month: 
        plot_data[group_col] = pd.Categorical(values=plot_data[group_col], categories=months, ordered=True)
        plot_data = plot_data.sort_values(group_col)
        
    elif is_day: 
        plot_data[group_col] = pd.Categorical(values=plot_data[group_col], categories=days, ordered=True)
        plot_data = plot_data.sort_values(group_col)
    
    else:
        plot_data = plot_data.sort_values(by='metric_value', ascending=False)    
             
    # deciding the layout vertical vs horizontal
    # vertical for days, months and categories with few items
    
    if is_month or is_day or plot_data[group_col].nunique() <= 4: 
        x_val, y_val = group_col, 'metric_value'
        orient = 'v'
        
    else:
        x_val, y_val = 'metric_value', group_col
        orient = 'h'
        
    # plotting
    plt.figure(figsize=(10, 6))
    sns.set_theme('dark')
    
    ax = sns.barplot(
        data= plot_data,
        x= x_val,
        y= y_val,
        hue= 'metric_value',
        palette= config['palette'],
        legend=False,
        width= 0.3
    )
    
    sns.despine()
    
    # the colours of the graph
    plt.gca().set_facecolor("#000000")
    plt.gcf().set_facecolor('#000000')
    plt.tick_params(axis='both', colors='white')
    
    # setting the dynamic tick labels: 
    def tick_formatting(val, pos):
        if val == 0:
            return '0'
        return f"{config['prefix']}{int(val/1000)}K"
    
    formatter = ticker.FuncFormatter(tick_formatting)
    
    if orient == 'v': 
        ax.yaxis.set_major_formatter(formatter)
    else:
        ax.xaxis.set_major_formatter(formatter)
    
        
    # titles and lables:
    clean_name = group_col.replace('_', ' ').title()
    
    plt.title(f"{config['title_part']} by {clean_name}", fontsize = 16, color='white')
    plt.xlabel(' ' if orient=='v' else config['label'], fontsize=12, color='white', labelpad=20)
    plt.ylabel(' ' if orient == 'h' else config['label'], fontsize=12, color='white', labelpad=20)
        
    plt.tight_layout()
    plt.show()
        
    
    
    
    
        
        
    
    
    
            


SyntaxError: invalid syntax (4126430898.py, line 11)