In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [24]:
# Load and preprocess the dataset
data = pd.read_csv("c:/Users/nazil/Downloads/data_set_hackathon.csv")

In [25]:
# Step 1: Prepare data
data['order_date'] = pd.to_datetime(data['order_date'])
data['Year-Month'] = data['order_date'].dt.to_period('M')

# Aggregate by month and count distinct order codes
monthly_orders = data.groupby('Year-Month')['Customer Order Code'].nunique()

# Step 2: Create seasonal dummy variables (including Winter as the reference category)
data['Season'] = data['order_date'].dt.month.map(lambda x: 'Winter' if x in [12, 1, 2] else
                                                 'Spring' if x in [3, 4, 5] else
                                                 'Summer' if x in [6, 7, 8] else 'Autumn')

# Create dummy variables for seasons (drop Winter as reference)
season_dummies = pd.get_dummies(data['Season'], drop_first=True)

# Merge seasonal data with the monthly orders
monthly_orders_df = pd.DataFrame(monthly_orders)
monthly_orders_df['Season'] = data.groupby('Year-Month')['Season'].first().values
monthly_orders_df = pd.concat([monthly_orders_df, season_dummies], axis=1)

# Step 3: Handle missing values using SimpleImputer (separate strategy for numerical and categorical features)
# Numeric columns (for imputation with mean)
numeric_features = monthly_orders_df.select_dtypes(include=[np.number]).columns

# Categorical columns (for imputation with most_frequent)
categorical_features = monthly_orders_df.select_dtypes(include=[object]).columns

# Create transformers: Use 'mean' for numeric columns and 'most_frequent' for categorical columns
numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = SimpleImputer(strategy='most_frequent')

# Use ColumnTransformer to apply the transformers to appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply the transformers to the data
monthly_orders_df_imputed = preprocessor.fit_transform(monthly_orders_df)

# Convert the imputed data back into a DataFrame
monthly_orders_df_imputed = pd.DataFrame(monthly_orders_df_imputed, columns=monthly_orders_df.columns)

# Step 4: Check for missing values after imputation
print("Missing values after imputation:\n", monthly_orders_df_imputed.isnull().sum())

# Step 5: Split the data for training and testing
X = monthly_orders_df_imputed[['Spring', 'Summer', 'Autumn']]  # Features: Season dummies
y = monthly_orders_df_imputed['Customer Order Code']  # Target: Number of distinct orders

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 7: Forecast
y_pred = model.predict(X_test)

# Step 8: Evaluate Model using MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE for Monthly Orders Forecasting: {mape:.2f}%")

# Step 9: Plot Actual vs Predicted
plt.figure(figsize=(10, 6))
plt.plot(y_test.index, y_test, label='Actual Orders', color='blue')
plt.plot(y_test.index, y_pred, label='Predicted Orders', color='red')
plt.title('Forecasting Monthly Orders using Linear Regression')
plt.legend()
plt.show()


Missing values after imputation:
 Customer Order Code    0
Season                 0
Spring                 0
Summer                 0
Winter                 0
dtype: int64


  data['order_date'] = pd.to_datetime(data['order_date'])


KeyError: "['Autumn'] not in index"

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Prepare data for classification
data['Season'] = data['order_date'].dt.month.map(lambda x: 'Winter' if x in [12, 1, 2] else
                                                 'Spring' if x in [3, 4, 5] else
                                                 'Summer' if x in [6, 7, 8] else 'Autumn')

# One-hot encoding for season and other categorical features
season_dummies = pd.get_dummies(data['Season'], drop_first=True)
data = pd.concat([data, season_dummies], axis=1)

# Define features and target variable
X = data[['value', 'Spring', 'Summer', 'Autumn', 'Customer Order Code']]  # Include relevant features
y = data['Product Code']  # Target: Product Code

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_clf.fit(X_train, y_train)

# Predictions and Evaluation
y_pred = rf_clf.predict(X_test)

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
from sklearn.preprocessing import QuantileTransformer

# Step 1: Quantile Transformation for Demand Quantities
quantile_transformer = QuantileTransformer(n_quantiles=4, output_distribution='uniform')
quantile_transformed = quantile_transformer.fit_transform(data[['items']])

# Add quantile categories to the dataset
data['Quantity_Quantile'] = pd.cut(quantile_transformed, bins=4, labels=['Low', 'Medium', 'High', 'Very High'])

# Display the quantile classification for quantities
print(data[['items', 'Quantity_Quantile']].head())


In [None]:
# Step 1: Quantile Transformation for Lead Times
lead_time_data = (data['requested_delivery_date'] - data['order_date']).dt.days
lead_time_quantile_transformer = QuantileTransformer(n_quantiles=4, output_distribution='uniform')
lead_time_quantiles = lead_time_quantile_transformer.fit_transform(lead_time_data.values.reshape(-1, 1))

# Add quantile categories to the dataset
data['LeadTime_Quantile'] = pd.cut(lead_time_quantiles, bins=4, labels=['Short', 'Medium', 'Long', 'Very Long'])

# Display the quantile classification for lead times
print(data[['requested_delivery_date', 'order_date', 'LeadTime_Quantile']].head())


In [None]:
import numpy as np

# Step 1: Define a function to simulate demand based on uncertainties
def monte_carlo_simulation(data, n_simulations=1000, lead_time_horizon=5):
    simulated_demand = []

    for _ in range(n_simulations):
        # Randomly sample product demand categories, quantity quantiles, and lead times
        product_sample = data['Product Code'].sample()
        quantity_sample = data['Quantity_Quantile'].sample()
        lead_time_sample = data['LeadTime_Quantile'].sample()

        # Simulate demand based on these categories (we can use random values or predefined distributions)
        simulated_order_quantity = np.random.choice([1, 2, 3], p=[0.3, 0.4, 0.3])  # example distribution
        simulated_lead_time = np.random.choice(['Short', 'Medium', 'Long', 'Very Long'], p=[0.4, 0.3, 0.2, 0.1])

        # Simulate demand over the next 5 months for the selected product
        demand_for_product = simulated_order_quantity * lead_time_horizon
        simulated_demand.append(demand_for_product)

    # Step 2: Return the average demand across simulations
    return np.mean(simulated_demand)

# Step 3: Simulate for each product
product_demand_simulation = {}
for product in data['Product Code'].unique():
    product_data = data[data['Product Code'] == product]
    product_demand_simulation[product] = monte_carlo_simulation(product_data)

# Step 4: Display simulated demand for each product
for product, demand in product_demand_simulation.items():
    print(f"Product {product} - Simulated Demand: {demand:.2f}")


In [None]:
# Assuming y_test and predicted values are available
mape_5_month = mean_absolute_percentage_error(y_test_5_month, predicted_5_month)
mape_2_month = mean_absolute_percentage_error(y_test_2_month, predicted_2_month)

print(f"MAPE for 5-Month Lead Time: {mape_5_month:.2f}%")
print(f"MAPE for 2-Month Lead Time: {mape_2_month:.2f}%")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_percentage_error
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import acorr_ljungbox

In [None]:
# Load and preprocess the dataset
data = pd.read_csv("c:/Users/nazil/Downloads/data_set_hackathon.csv")
data['order_date'] = pd.to_datetime(data['order_date'])
data['requested_delivery_date'] = pd.to_datetime(data['requested_delivery_date'])
data['Year-Month'] = data['order_date'].dt.to_period('M')

In [None]:
# Group Data by Monthly Orders
monthly_orders = data.groupby('Year-Month')['Customer Order Code'].nunique()

# Split Data into Train/Test
train = monthly_orders[:int(len(monthly_orders)*0.8)]
test = monthly_orders[int(len(monthly_orders)*0.8):]

In [None]:
print(data.info())

Time Series Analysis and SARIMA Model

In [None]:
# Determine the maximum allowed lags
max_lags = len(monthly_orders) // 2

# ACF Plot
plot_acf(monthly_orders, lags=max_lags)
plt.title("ACF of Monthly Orders")
plt.show()

# PACF Plot
plot_pacf(monthly_orders, lags=max_lags)
plt.title("PACF of Monthly Orders")
plt.show()

In [None]:
# SARIMA Model Fitting
model = SARIMAX(train, order=(1, 1, 1), seasonal_order=(0, 1, 1, 12))
model_fit = model.fit(disp=False)

In [None]:
# Residual Diagnostics
residuals = model_fit.resid

# Convert PeriodIndex to DatetimeIndex for plotting
residuals.index = residuals.index.to_timestamp()

# Plot residuals
plt.figure(figsize=(10, 6))
plt.plot(residuals, label='Residuals', color='blue')
plt.axhline(0, linestyle='--', color='red', label='Zero Line')
plt.title("SARIMA Model Residuals")
plt.xlabel("Time")
plt.ylabel("Residuals")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Statistical Test for White Noise
ljung_test = acorr_ljungbox(residuals, lags=[10], return_df=True)
print("Ljung-Box Test Results:\n", ljung_test)

In [None]:
# Forecast and Evaluation
forecast = model_fit.get_forecast(steps=len(test))
forecast_values = forecast.predicted_mean
forecast_ci = forecast.conf_int()

In [None]:
# MAPE Calculation for SARIMA
mape = mean_absolute_percentage_error(test.values, forecast_values)
print(f"SARIMA MAPE: {mape:.2%}")

In [None]:
# Convert PeriodIndex to DatetimeIndex for plotting
monthly_orders.index = monthly_orders.index.to_timestamp()
forecast_ci.index = forecast_ci.index.to_timestamp()

# Plot Forecast
plt.figure(figsize=(10, 6))
plt.plot(monthly_orders.index, monthly_orders, label='Historical Orders', color='blue')
plt.plot(forecast_values.index, forecast_values, label='Forecasted Orders', color='red')
plt.fill_between(forecast_ci.index, forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1], color='red', alpha=0.2)
plt.title("SARIMA Forecast with 95% Confidence Intervals")
plt.xlabel("Date")
plt.ylabel("Number of Distinct Orders")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Print forecast values and confidence intervals
print("Forecasted Values:")
print(forecast.predicted_mean)
print("\nConfidence Intervals:")
print(forecast_ci)

Classification Model for Product Choice

Empirical Quantiles for Demand and Lead Time

In [None]:
from scipy.stats.mstats import mquantiles

# Empirical Quantiles for Quantity
quantile_25, quantile_50, quantile_75 = mquantiles(data['items'].astype(float), prob=[0.25, 0.5, 0.75])
print(f"Demand Quantiles - 25%: {quantile_25}, 50%: {quantile_50}, 75%: {quantile_75}")

# Empirical Quantiles for Lead Time
lead_time = (data['requested_delivery_date'] - data['order_date']).dt.days
lead_time_quantiles = mquantiles(lead_time, prob=[0.25, 0.5, 0.75])
print(f"Lead Time Quantiles - 25%: {lead_time_quantiles[0]}, 50%: {lead_time_quantiles[1]}, 75%: {lead_time_quantiles[2]}")


Monte Carlo Simulation for Total Demand

In [None]:
n_simulations = 1000
simulated_total_demand_5_month = []

for _ in range(n_simulations):
    simulated_monthly_orders = np.random.poisson(train.mean())
    simulated_quantity = np.random.choice([quantile_25, quantile_50, quantile_75])
    simulated_lead_time = np.random.choice(lead_time_quantiles)
    
    total_demand = simulated_monthly_orders * simulated_quantity * simulated_lead_time
    simulated_total_demand_5_month.append(total_demand)

# Analyze Simulation Results
simulated_total_demand_5_month = np.array(simulated_total_demand_5_month)
mean_demand = simulated_total_demand_5_month.mean()
ci_5_month = np.percentile(simulated_total_demand_5_month, [2.5, 97.5])

print(f"Simulated 5-Month Demand: Mean={mean_demand}, 95% CI={ci_5_month}")
