# Additional Feature Ideas for the Dataset
This notebook demonstrates how to create additional features to analyse the dataset.

In [None]:
# Importing necessary libraries
import pandas as pd

# Read the sales CSV into a DataFrame and preview its first rows
data = pd.read_csv(filepath_or_buffer='../data/sales_data.csv')
data.head()

### 1. Sales Performance by Region
**Feature:** Calculate total or average sales for each region.

**Why it’s useful:** Highlights which regions perform better in terms of sales, enabling focused analysis or decision-making.

In [None]:
# Total sales per region, kept as a flat table with 'Region' as a regular column
sales_by_region = data.groupby('Region', as_index=False)['Sales'].sum()
print(sales_by_region)

### 2. Sales Growth Rate
**Feature:** Calculate the percentage change in sales day-over-day.

**Why it’s useful:** Helps identify trends or periods of growth/decline.

In [None]:
# sales growth rate (day-over-day)
# Parse the dates locally (dropping any time-of-day part) so days can be
# ordered chronologically; the 'Date' column itself is left untouched here.
sale_dates = pd.to_datetime(data['Date']).dt.normalize()

# Reduce to one total per calendar date before measuring growth:
# pct_change() compares consecutive rows, so running it on the raw rows would
# compare one transaction with the next instead of one day with the day before.
daily_sales = data['Sales'].groupby(sale_dates).sum().sort_index()
daily_growth = daily_sales.pct_change() * 100

# Attach each date's growth to every row recorded on that date. The earliest
# date has no predecessor, so its rows stay NaN.
data['Sales_Growth'] = sale_dates.map(daily_growth)
print(data[['Date', 'Sales', 'Sales_Growth']])

### 3. Day of the Week
**Feature:** Extract the day of the week from the Date column.

**Why it’s useful:** Identifies patterns in sales performance by the day of the week (e.g., higher sales on weekends).

In [None]:
# Convert 'Date' to real datetimes, then derive the weekday name from it
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates
data['Day_of_Week'] = parsed_dates.dt.day_name()
print(data[['Date', 'Day_of_Week']])

#### Summarise sales by day of the week

In [None]:
# Summarise sales by day of the week
# Calendar order for the weekday names. Without an ordered categorical the
# names would sort alphabetically (Friday, Monday, Saturday, ...).
# pd.CategoricalDtype is reachable through the pandas import at the top of the
# notebook, so no extra import is needed in this cell.
day_order = pd.CategoricalDtype(
    categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ordered=True,
)

# Total sales per weekday, with the weekday column cast to the ordered
# categorical so that sort_values() follows Monday -> Sunday.
sales_by_weekday = (
    data.groupby('Day_of_Week')['Sales']
    .sum()
    .reset_index()
    .astype({'Day_of_Week': day_order})
    .sort_values('Day_of_Week')
)

# Display the result
print(sales_by_weekday)

### 4. Cost-to-Sales Ratio

**Feature:** Calculate the ratio of cost to sales for each transaction.

**Why it’s useful:** Measures operational efficiency and profitability.

In [None]:
# cost-to-sales ratio
# Mask zero-sales rows before dividing: Cost / 0 evaluates to inf, which would
# distort any later aggregate of the ratio. Masked rows produce NaN instead.
nonzero_sales = data['Sales'].where(data['Sales'] != 0)
data['Cost_to_Sales_Ratio'] = data['Cost'] / nonzero_sales
print(data[['Sales', 'Cost', 'Cost_to_Sales_Ratio']])

#### Cost-to-Sales Ratio by Region

In [None]:
# Calculate the cost-to-sales ratio for each transaction.
# Zero-sales rows are masked to NaN first: Cost / 0 would give inf, and a
# single inf row would turn that region's mean into inf. mean() skips NaN.
nonzero_sales = data['Sales'].where(data['Sales'] != 0)
data['Cost_to_Sales_Ratio'] = data['Cost'] / nonzero_sales

# Calculate the average (mean of per-transaction ratios) for each region
cost_ratio_by_region = data.groupby('Region')['Cost_to_Sales_Ratio'].mean().reset_index()

# Display the result
print(cost_ratio_by_region)

### 5. Cumulative Sales

**Feature:** Calculate the running total of sales over time.

**Why it’s useful:** Tracks overall performance and growth trends.

In [None]:
# running total of sales over time
# cumsum() accumulates in row order, so order the rows by date first; a file
# that is not already chronological would otherwise give a running total that
# does not follow time. mergesort is stable: rows sharing a date keep their
# original relative order.
chronological = data.sort_values('Date', kind='mergesort')

# Assignment aligns on the index, so each row receives its own running total
# without reordering `data` itself.
data['Cumulative_Sales'] = chronological['Sales'].cumsum()

# Show the rows in chronological order so the running total reads top to bottom
print(data.loc[chronological.index, ['Date', 'Sales', 'Cumulative_Sales']])

### 6. Discounts (Hypothetical)

**Feature:** Add a hypothetical discount column and calculate net sales.

**Why it’s useful:** Helps simulate real-world scenarios for understanding the impact of discounts.

In [None]:
# add hypothetical discount column and calculate net sales
# Fraction of each sale given away as a discount; change this single value to
# simulate a different scenario.
DISCOUNT_RATE = 0.1  # Assume a 10% discount

data['Discount'] = data['Sales'] * DISCOUNT_RATE
data['Net_Sales'] = data['Sales'] - data['Discount']
print(data[['Sales', 'Discount', 'Net_Sales']])

### 7. Sales per Product

**Feature:** Calculate total sales for each product.

**Why it’s useful:** Identifies the most and least popular products.

In [None]:
# Total sales per product, kept as a flat table with 'Product' as a regular column
sales_by_product = data.groupby('Product', as_index=False)['Sales'].sum()
print(sales_by_product)

#### Identifying the Best-Selling Products by Region

In [None]:
# Best-selling product in each region
# Step 1: total sales for every (region, product) pair
sales_by_product_region = data.groupby(['Region', 'Product'], as_index=False)['Sales'].sum()

# Step 2: within each region, put the highest-selling product first
ranked_within_region = sales_by_product_region.sort_values(
    by=['Region', 'Sales'],
    ascending=[True, False],
)

# Step 3: keep only the top row of each region
best_selling_products_by_region = ranked_within_region.groupby('Region', as_index=False).first()

# Show the winners
print(best_selling_products_by_region)
