# Feature Engineering for Sales Forecasting
## Creating Time-Based Features for Predictive Modeling

### 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the raw sales records, relative to the notebook's working directory
SALES_CSV_PATH = "data/sales.csv"

# Read the CSV into the DataFrame that every later cell works on
df = pd.read_csv(SALES_CSV_PATH)

# Report the (rows, columns) shape, then preview the first rows
print(f"Initial dataset shape: {df.shape}")
df.head()

### 2. Data Preprocessing

In [None]:
# Parse the raw 'date' values into pandas datetimes so they sort chronologically
df['date'] = pd.to_datetime(df['date'])

# Order rows by date and renumber the index 0..n-1 in a single call, so the
# positional shift/rolling operations in later cells follow time order
df = df.sort_values(by='date', ignore_index=True)

print("Data sorted by date")
df.head()

### 3. Lag Features
Create lag features to capture historical sales patterns

In [None]:
# Build one lagged copy of 'sales' per look-back distance.
# shift(k) moves values down k rows, so each row sees the sales from k rows
# earlier; the first k rows have no history and become NaN.
for periods_back in (1, 3):
    df[f'lag_{periods_back}'] = df['sales'].shift(periods_back)

print("Lag features created")
df[['date', 'sales', 'lag_1', 'lag_3']].head(5)

### 4. Rolling Window Statistics
Calculate moving averages and standard deviations to capture trends

In [None]:
# Rolling statistics over the 3 periods *before* each row.
# shift(1) removes the current row's own 'sales' from the window. Assuming
# 'sales' is the value being forecast, a window that included it would leak
# the target into its own features; the lag features are shifted for the
# same reason. The first 3 rows have no complete history and become NaN.
prior_window = df['sales'].shift(1).rolling(window=3)

# Rolling mean - captures short-term trends
df['rolling_mean_3'] = prior_window.mean()

# Rolling standard deviation - captures volatility
df['rolling_std_3'] = prior_window.std()

print("Rolling statistics calculated")
df[['date', 'sales', 'rolling_mean_3', 'rolling_std_3']].head(5)

### 5. Calendar-Based Features
Extract temporal patterns from date

In [None]:
# Datetime accessor reused for every calendar field below
date_parts = df['date'].dt

# Coarse seasonality signals
df['month'] = date_parts.month
df['quarter'] = date_parts.quarter

# Optional finer-grained calendar position
df['day_of_week'] = date_parts.dayofweek  # 0 = Monday ... 6 = Sunday
df['day_of_month'] = date_parts.day
df['week_of_year'] = date_parts.isocalendar().week  # ISO calendar week number

print("Calendar features extracted")
df[['date', 'sales', 'month', 'quarter', 'day_of_week']].head()

### 6. Handle Missing Values
Remove rows with NaN values, such as those created by the lag and rolling features (note that `dropna()` drops any row with a missing value in any column)

In [None]:
# Check missing values before cleaning
print("Missing values before cleaning:")
print(df.isnull().sum())
print(f"\nDataset shape before: {df.shape}")

# Remember the row count now: once df is rebound to the cleaned frame the
# original length is gone, and it is needed to report how many rows were dropped
rows_before = len(df)

# Remove every row that contains at least one NaN in any column
df = df.dropna()

print(f"\nDataset shape after: {df.shape}")
# Rows removed = rows before cleaning minus rows remaining
print(f"Rows removed: {rows_before - len(df)}")

### 7. Final Feature Set

In [None]:
# Summarise the cleaned DataFrame: column names first, then its dimensions
print("Final feature set:")
print(list(df.columns))

# Both counts come straight from df.shape, so "features" counts every column
print(f"\nTotal features: {df.shape[1]}")
print(f"Total samples: {df.shape[0]}")

# Preview the first ten rows
df.head(10)

### 8. Feature Summary Statistics

In [None]:
# Statistical summary of engineered features.
# Left as the cell's final expression so the summary table is the cell's output.
df.describe()

### 9. Save Processed Data (Optional)

In [None]:
# Save the feature-engineered dataset.
# Disabled by default: uncomment the two lines below to write the CSV
# (index=False keeps the DataFrame index out of the file).
# df.to_csv('data/sales_features.csv', index=False)
# print("Processed data saved to 'data/sales_features.csv'")

---
## Summary of Engineered Features

**Lag Features:**
- `lag_1`: Sales from previous period (captures immediate past)
- `lag_3`: Sales from 3 periods ago (captures short-term patterns; on daily data, a weekly pattern would need a 7-period lag)

**Rolling Statistics:**
- `rolling_mean_3`: 3-period moving average (smooths out noise)
- `rolling_std_3`: 3-period standard deviation (captures volatility)

**Calendar Features:**
- `month`: Month of year (1-12) for seasonal patterns
- `quarter`: Quarter of year (1-4) for quarterly trends
- `day_of_week`: Day of week (0=Monday, 6=Sunday)
- `day_of_month`: Day within month (1-31)
- `week_of_year`: ISO week number (1-53)

**Next Steps:**
1. Train-test split (chronological, so the test set contains only dates after the training set)
2. Feature scaling/normalization
3. Model training (Linear Regression, Random Forest, XGBoost)
4. Model evaluation and comparison