In [32]:
# 0. Imports and helper functions

In [33]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import statsmodels.api as sm

try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
except Exception:
    PROPHET_AVAILABLE = False

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [34]:
# For LSTM
try:
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout
    from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
    KERAS_AVAILABLE = True
except Exception:
    KERAS_AVAILABLE = False

import warnings
warnings.filterwarnings('ignore')

In [35]:
# Folder setup: report files go to "../reports" (relative to the current working
# directory) and figures to its "images" subfolder.
REPORTS_DIR = os.path.join("..", "reports")
IMAGES_DIR = os.path.join(REPORTS_DIR, "images")
# makedirs creates intermediate directories, so REPORTS_DIR is created as well;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(IMAGES_DIR, exist_ok=True)

def save_dataframe(df, filename):
    """Write ``df`` to ``REPORTS_DIR/filename`` as CSV, omitting the index column."""
    target_path = os.path.join(REPORTS_DIR, filename)
    df.to_csv(target_path, index=False)

def save_text(text, filename):
    """Write ``text`` to ``REPORTS_DIR/filename``, replacing any existing file.

    Parameters
    ----------
    text : str
        Content to write.
    filename : str
        File name (relative to ``REPORTS_DIR``).
    """
    target_path = os.path.join(REPORTS_DIR, filename)
    # Pin the encoding: without it open() falls back to the platform locale
    # (e.g. cp1252 on Windows), so the same report could be written differently
    # on different machines or fail on non-ASCII characters.
    with open(target_path, "w", encoding="utf-8") as f:
        f.write(text)

def save_plot(fig, filename):
    """Save ``fig`` to ``IMAGES_DIR/filename`` (300 dpi, tight bounding box) and close it."""
    image_path = os.path.join(IMAGES_DIR, filename)
    fig.savefig(image_path, dpi=300, bbox_inches='tight')
    # Close the figure once it is on disk so it is released from pyplot.
    plt.close(fig)

In [36]:
# 1. Load data

In [37]:
def load_data(path='superstore.csv'):
    """Read the order data from CSV.

    Parameters
    ----------
    path : str or file-like, default 'superstore.csv'
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        ``order_date`` and ``ship_date`` are parsed as datetimes and
        ``postal_code`` is read as text.

    Raises
    ------
    KeyError
        If the file has no ``postal_code`` column.
    """
    # Postal codes are identifiers, not numbers. Letting read_csv infer a numeric
    # dtype and casting to str afterwards drops leading zeros ("05408" -> "5408")
    # and, when any code is missing, yields float strings ("5408.0").
    df = pd.read_csv(
        path,
        parse_dates=['order_date', 'ship_date'],
        dtype={'postal_code': str},
    )
    # Retained from the original: forces the column to str and fails early with
    # KeyError when the column is absent.
    df['postal_code'] = df['postal_code'].astype(str)
    return df

In [38]:
# 2. Basic cleaning & EDA helpers

In [39]:
def basic_cleaning(df):
    """Coerce the numeric columns and drop rows without an order id or a usable sales value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``order_id``, ``sales``, ``profit``, ``discount`` and
        ``quantity``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the four numeric columns have been passed through
        ``pd.to_numeric(errors='coerce')`` and rows whose ``order_id`` or
        ``sales`` is missing have been removed. Original index labels are kept.
    """
    numeric_cols = ['sales', 'profit', 'discount', 'quantity']

    # Work on an explicit copy so the caller's frame is untouched and the column
    # assignments below never hit a view of another frame.
    cleaned = df.copy()
    for col in numeric_cols:
        cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')

    # Drop AFTER coercion: unparseable sales values only become NaN once they
    # have been coerced, so dropping first would let them through.
    return cleaned.dropna(subset=['order_id', 'sales'])

In [40]:
# Quick EDA summary
def eda_summary(df):
    """Save a short text overview of ``df`` to ``eda_summary.txt`` via ``save_text``.

    The overview holds the row count, the earliest and latest ``order_date``,
    and ``describe()`` statistics for sales, profit, discount and quantity.
    """
    order_dates = df['order_date']
    header = f"Rows: {len(df)}\nDate range: {order_dates.min()} -> {order_dates.max()}\n"
    stats = df[['sales','profit','discount','quantity']].describe()
    save_text(header + str(stats), "eda_summary.txt")

In [41]:
# 3. Forecasting sales by category / sub_category / region

In [42]:
# Aggregate sales to monthly series per group
def aggregate_monthly(df, group_cols=['category']):
    """Sum ``sales`` per group and calendar month.

    Returns a frame with the ``group_cols`` columns, a ``year_month`` column
    holding the first-day-of-month timestamp of each order's month, and the
    summed ``sales``. The input frame is left unchanged.
    """
    month_start = df['order_date'].dt.to_period('M').dt.to_timestamp()
    with_month = df.assign(year_month=month_start)
    keys = group_cols + ['year_month']
    return with_month.groupby(keys).agg({'sales':'sum'}).reset_index()

In [43]:
# Prepare a dataframe for Prophet: columns ds, y
def prepare_prophet_df(grouped, group_values, group_cols=['category']):
    """Extract one group's monthly series in Prophet's ``ds`` / ``y`` layout.

    ``group_values`` maps column name -> required value; only rows matching
    every pair are kept. The result is sorted by month and has exactly two
    columns: ``ds`` (datetime) and ``y`` (sales). ``group_cols`` is accepted
    but not used by this function.
    """
    selected = grouped
    for column, wanted in group_values.items():
        selected = selected[selected[column] == wanted]
    ordered = selected.sort_values('year_month')
    renamed = ordered[['year_month','sales']].rename(columns={'year_month':'ds','sales':'y'})
    return renamed.assign(ds=pd.to_datetime(renamed['ds']))

In [44]:
# Forecast with Prophet
def forecast_with_prophet(prophet_df, periods=12, freq='M'):
    """Fit a default ``Prophet`` model on ``prophet_df`` and return its predictions.

    Predictions are made on the frame built by ``make_future_dataframe`` with
    the given ``periods`` and ``freq``.
    """
    model = Prophet()
    model.fit(prophet_df)
    horizon = model.make_future_dataframe(periods=periods, freq=freq)
    return model.predict(horizon)

In [45]:
def run_category_forecasts(df, group_cols=['category'], periods=12):
    """Forecast monthly sales for every group and save the combined result.

    Parameters
    ----------
    df : pandas.DataFrame
        Order-level data with ``order_date``, ``sales`` and the ``group_cols``.
    group_cols : list of str, default ['category']
        Columns defining the groups to forecast separately.
    periods : int, default 12
        Number of future months to forecast per group.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds``, ``yhat`` and ``group`` (the string form of the group's
        column -> value dict), also written to ``category_forecasts.csv``.
        An empty frame is returned, and nothing is written, when Prophet is
        not installed or no group has enough history.
    """
    min_history_months = 12  # groups with fewer monthly points are skipped

    if not PROPHET_AVAILABLE:
        # No forecast is possible without Prophet, so skip the aggregation too.
        return pd.DataFrame()

    grouped = aggregate_monthly(df, group_cols=group_cols)
    forecasts = []
    for _, row in grouped[group_cols].drop_duplicates().iterrows():
        group_values = {c: row[c] for c in group_cols}
        prophet_df = prepare_prophet_df(grouped, group_values, group_cols=group_cols)
        if len(prophet_df) < min_history_months:
            continue
        # aggregate_monthly stamps each month at its FIRST day, so the future
        # dates must be month-start ('MS') as well. With the default month-end
        # frequency the first "future" point lands inside the last observed
        # month and the horizon is shifted by one month.
        forecast = forecast_with_prophet(prophet_df, periods=periods, freq='MS')
        # assign() returns a new frame, so nothing is written into a slice.
        fc = forecast[['ds','yhat']].tail(periods).assign(group=str(group_values))
        forecasts.append(fc)

    if not forecasts:
        return pd.DataFrame()

    result = pd.concat(forecasts, ignore_index=True)
    save_dataframe(result, "category_forecasts.csv")
    return result

In [46]:
# 4. Customer churn modelling (RFM + classification)

In [47]:
def build_rfm(df, snapshot_date=None):
    """Build one row of recency / frequency / monetary features per customer.

    ``snapshot_date`` defaults to one day after the latest ``order_date``.
    Output columns: ``customer_id``, ``recency`` (whole days from the
    customer's last order to the snapshot), ``frequency`` (distinct order
    ids), ``monetary`` (summed sales), ``total_qty`` (summed quantity) and
    ``avg_discount`` (mean discount).
    """
    if snapshot_date is None:
        snapshot_date = df['order_date'].max() + pd.Timedelta(days=1)

    def days_before_snapshot(order_dates):
        return (snapshot_date - order_dates.max()).days

    per_customer = df.groupby('customer_id').agg(
        recency=('order_date', days_before_snapshot),
        frequency=('order_id', 'nunique'),
        monetary=('sales', 'sum'),
        total_qty=('quantity', 'sum'),
        avg_discount=('discount', 'mean'),
    )
    return per_customer.reset_index()

In [48]:
# Label churn: customers with no orders in last N days
def label_churn(df, churn_days=180):
    """Flag each customer as churned (1) or active (0).

    The reference date is one day after the latest ``order_date`` in ``df``.
    A customer is churned when strictly more than ``churn_days`` whole days
    separate their last order from that reference date. Returns a frame with
    columns ``customer_id`` and ``churn_label``.
    """
    reference_date = df['order_date'].max() + pd.Timedelta(days=1)
    latest_order = df.groupby('customer_id')['order_date'].max().reset_index()
    idle_days = (reference_date - latest_order['order_date']).dt.days
    labelled = latest_order.assign(
        days_since_last=idle_days,
        churn_label=(idle_days > churn_days).astype(int),
    )
    return labelled[['customer_id','churn_label']]

In [49]:
def train_churn_model(df, churn_days=180):
    """Train an XGBoost churn classifier on RFM features and save its test metrics.

    Labels come from ``label_churn``: a customer is churned when they placed no
    order during the last ``churn_days`` days of the data. Features come from
    ``build_rfm`` and use ONLY orders placed on or before the start of that
    window.

    The previous version built the features from the full history with the
    same snapshot date as the label, so ``recency > churn_days`` was the label
    itself and the saved metrics only measured that leak.

    Parameters
    ----------
    df : pandas.DataFrame
        Order-level data with the columns ``build_rfm`` and ``label_churn`` need.
    churn_days : int, default 180
        Length in days of the observation window used for the label.

    Returns
    -------
    XGBClassifier
        Model fitted on the standard-scaled training features, in the order
        recency, frequency, monetary, avg_discount, total_qty.

    Raises
    ------
    ValueError
        If no orders precede the observation window, or if only one churn
        class is present (the model and ROC AUC would be undefined).

    Side effects
    ------------
    Writes the metric dict as text to ``churn_model_metrics.txt`` via ``save_text``.
    """
    feature_cols = ['recency','frequency','monetary','avg_discount','total_qty']

    # Split the timeline: orders up to `cutoff` feed the features, the
    # `churn_days` after it decide the label. For a customer with history, this
    # cutoff matches label_churn's rule exactly (last order <= cutoff <=> churned).
    cutoff = df['order_date'].max() - pd.Timedelta(days=churn_days)
    history = df[df['order_date'] <= cutoff]
    if history.empty:
        raise ValueError(
            f"No orders on or before {cutoff}; cannot build features for churn_days={churn_days}."
        )

    rfm = build_rfm(history, snapshot_date=cutoff + pd.Timedelta(days=1))
    churn = label_churn(df, churn_days=churn_days)
    # Inner merge: customers whose first order falls inside the observation
    # window have no pre-cutoff features and are left out.
    data = rfm.merge(churn, on='customer_id')

    X = data[feature_cols].fillna(0)
    y = data['churn_label']
    if y.nunique() < 2:
        raise ValueError("Churn labels contain a single class; cannot train or evaluate a classifier.")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    # Fit the scaler on the training split only so test statistics do not leak in.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    xgb.fit(X_train_s, y_train)

    y_pred = xgb.predict(X_test_s)
    y_score = xgb.predict_proba(X_test_s)[:,1]
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_score)
    }
    save_text(str(metrics), "churn_model_metrics.txt")
    return xgb

In [61]:
def discount_profit_regression(df):
    """Regress profit on discount, quantity, log(1 + sales), category and region.

    Rows missing any input column are dropped, category and region are
    dummy-encoded with the first level dropped, and a constant is added.
    The OLS model is fitted with HC3 robust covariance; its text summary is
    saved to ``discount_profit_regression.txt`` and the fitted result returned.
    """
    required_cols = ['profit','discount','sales','quantity','category','region']
    rows = df.dropna(subset=required_cols)
    rows = rows.assign(log_sales=np.log1p(rows['sales']))

    # Design matrix: numeric predictors followed by the two dummy-encoded factors.
    numeric_part = rows[['discount','quantity','log_sales']]
    category_part = pd.get_dummies(rows['category'], drop_first=True, dtype=float)
    region_part = pd.get_dummies(rows['region'], drop_first=True, dtype=float)
    design = sm.add_constant(pd.concat([numeric_part, category_part, region_part], axis=1))

    # Force numeric dtypes; anything unparseable becomes NaN and is removed below.
    design = design.apply(pd.to_numeric, errors='coerce')
    target = pd.to_numeric(rows['profit'], errors='coerce')

    has_gap = design.isnull().any(axis=1) | target.isnull()
    design = design.loc[~has_gap]
    target = target.loc[~has_gap]

    fitted = sm.OLS(target, design).fit(cov_type='HC3')
    save_text(fitted.summary().as_text(), "discount_profit_regression.txt")
    return fitted

In [51]:
# 6. Putting it all together - run pipeline

In [62]:
def run_full_pipeline(path='../data/raw/superstore.csv'):
    """Load and clean the data at ``path``, then run every analysis stage in order.

    Stages: EDA summary, category forecasts, churn model, discount/profit
    regression. A completion message is printed at the end.
    """
    raw = load_data(path)
    df = basic_cleaning(raw)

    stages = (
        eda_summary,
        run_category_forecasts,
        train_churn_model,
        discount_profit_regression,
    )
    for stage in stages:
        stage(df)

    print("\n✅ Project completed successfully! All results and images are saved in ../reports and ../reports/images.")

In [63]:
# Entry point: run the whole pipeline on the raw Superstore export. The guard
# keeps the pipeline from running when this code is imported as a module.
if __name__ == '__main__':
    run_full_pipeline('../data/raw/superstore.csv')

21:59:19 - cmdstanpy - INFO - Chain [1] start processing
21:59:19 - cmdstanpy - INFO - Chain [1] done processing
21:59:20 - cmdstanpy - INFO - Chain [1] start processing
21:59:20 - cmdstanpy - INFO - Chain [1] done processing
21:59:21 - cmdstanpy - INFO - Chain [1] start processing
21:59:22 - cmdstanpy - INFO - Chain [1] done processing



✅ Project completed successfully! All results and images are saved in ../reports and ../reports/images.
