In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
# Load the dataset.
# The CSV location can be overridden with the HACKATHON_DATA_PATH environment
# variable so the notebook runs on machines other than the author's; when the
# variable is unset the original local path is used unchanged.
file_path = os.environ.get(
    "HACKATHON_DATA_PATH",
    "c:/Users/nazil/Downloads/data_set_hackathon.csv",
)
df = pd.read_csv(file_path)

## 1. Monthly distinct-order forecast (SARIMAX)

In [None]:
# ---------------------------------------------------------------------------
# Monthly distinct-order counts
# ---------------------------------------------------------------------------
# Parse 'order_date' (day.month.year strings) into datetimes.
df['order_date'] = pd.to_datetime(df['order_date'], format='%d.%m.%Y')

# Calendar month (year + month) of every row, as a pandas Period.
df['order_month'] = df['order_date'].dt.to_period('M')

# Number of distinct 'Customer Order Code' values per month.
monthly_orders = df.groupby('order_month').agg(
    Distinct_Orders=('Customer Order Code', 'nunique')
).reset_index()

print("Distinct Orders Received in Each Month:")
print(monthly_orders)

# ---------------------------------------------------------------------------
# Seasonal indicator columns
# ---------------------------------------------------------------------------
# Autumn (months 9-11) has no column of its own: it is the reference level
# and is absorbed by the model intercept added below.
SEASON_MONTHS = {
    'Season_Winter': [12, 1, 2],
    'Season_Spring': [3, 4, 5],
    'Season_Summer': [6, 7, 8],
}
month_of_year = monthly_orders['order_month'].dt.month
for season_column, months in SEASON_MONTHS.items():
    monthly_orders[season_column] = month_of_year.isin(months)
season_columns = list(SEASON_MONTHS)

# Log transformation (log(1 + x)) to stabilize variance.
monthly_orders['Log Distinct Orders'] = np.log1p(monthly_orders['Distinct_Orders'])

# ---------------------------------------------------------------------------
# Chronological train / test split (first 80% of months train, rest test)
# ---------------------------------------------------------------------------
TRAIN_FRACTION = 0.8
train_size = int(len(monthly_orders) * TRAIN_FRACTION)
train_data_log = monthly_orders['Log Distinct Orders'].iloc[:train_size]
test_data_log = monthly_orders['Log Distinct Orders'].iloc[train_size:]

# Exogenous regressors. The indicator columns are boolean; cast them to float
# so the model receives a plain numeric design matrix.
exog_all = monthly_orders[season_columns].astype(float)
exog_train = exog_all.iloc[:train_size]
exog_test = exog_all.iloc[train_size:]

# The forecast is scored against the hold-out months, so it cannot be longer
# than the hold-out itself. A fixed horizon of 5 fails with an exog shape
# mismatch whenever the test split has fewer than 5 rows.
MAX_FORECAST_STEPS = 5
forecast_steps = min(MAX_FORECAST_STEPS, len(test_data_log))
if forecast_steps == 0:
    raise ValueError(
        "Not enough monthly data to hold out a test period for the forecast."
    )

# ---------------------------------------------------------------------------
# Fit SARIMAX: AR(1) errors + seasonal regressors + constant
# ---------------------------------------------------------------------------
# trend='c' adds the intercept that the omitted Autumn dummy needs; without
# it, Autumn months are forced to a regression mean of 0 on the log scale.
sarimax_model_log = SARIMAX(
    train_data_log,
    exog=exog_train,
    order=(1, 0, 0),  # (p, d, q)
    trend='c',
    enforce_stationarity=False,
    enforce_invertibility=False,
)
sarimax_results_log = sarimax_model_log.fit(disp=False)

# Forecast the hold-out months (at most MAX_FORECAST_STEPS of them).
forecast_log = sarimax_results_log.get_forecast(
    steps=forecast_steps,
    exog=exog_test.iloc[:forecast_steps],
)
forecast_mean_log = forecast_log.predicted_mean
forecast_ci_log = forecast_log.conf_int()

# Undo the log1p transform to get back to order counts.
forecast_mean = np.expm1(forecast_mean_log)
forecast_ci = np.expm1(forecast_ci_log)

# MAPE between the actual hold-out counts and the back-transformed forecast.
actual_orders = np.expm1(test_data_log.iloc[:forecast_steps])
mape_sarimax_log = mean_absolute_percentage_error(actual_orders, forecast_mean)
print(f"SARIMAX Model with Log Transformation - Mean Absolute Percentage Error (MAPE): {mape_sarimax_log * 100:.2f}%")

# Forecast values as a plain list.
forecast_list = list(forecast_mean)
print(f"Forecast for the next {forecast_steps} months: {forecast_list}")

In [None]:
# Preview the first rows. A bare last expression lets the notebook render the
# frame with its rich table display instead of plain printed text.
df.head()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

## 2. Demand classification (Random Forest)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd

# ---------------------------------------------------------------------------
# Target construction
# ---------------------------------------------------------------------------
# Coerce 'items' to a numeric dtype; unparseable entries become NaN.
df['items'] = pd.to_numeric(df['items'], errors='coerce')

# Binary target: 1 when a row has more than `threshold` items, else 0.
# NOTE(review): NaN > threshold is False, so rows whose 'items' could not be
# parsed are labelled 0 - confirm that is intended.
threshold = 5  # Modify this threshold based on your needs
df['demanded'] = (df['items'] > threshold).astype(int)

# ---------------------------------------------------------------------------
# Feature engineering
# ---------------------------------------------------------------------------
def _season_of(month):
    """Map a calendar month number to its season name (anything else -> 'Autumn')."""
    if month in [12, 1, 2]:
        return 'Winter'
    if month in [3, 4, 5]:
        return 'Spring'
    if month in [6, 7, 8]:
        return 'Summer'
    return 'Autumn'

# Requires 'order_date' to already be a datetime column.
df['Season'] = df['order_date'].dt.month.map(_season_of)

# One-hot encode 'Season'. The frame is indexed like `df` so the encoded
# columns line up row-for-row when they are combined with the other features.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_season = encoder.fit_transform(df[['Season']])
season_encoded_df = pd.DataFrame(
    encoded_season,
    columns=encoder.get_feature_names_out(['Season']),
    index=df.index,
)

# Guarantee a column for every season even if one never occurs in the data.
for season in ['Season_Winter', 'Season_Spring', 'Season_Summer', 'Season_Autumn']:
    if season not in season_encoded_df.columns:
        season_encoded_df[season] = 0

# Integer-encode 'Product Code'.
label_encoder = LabelEncoder()
df['Product Code_encoded'] = label_encoder.fit_transform(df['Product Code'])

# Store the remaining categorical columns with the 'category' dtype.
df['Customer Country Code'] = df['Customer Country Code'].astype('category')
df['Route'] = df['Route'].astype('category')

# ---------------------------------------------------------------------------
# Feature matrix and target
# ---------------------------------------------------------------------------
categorical_features = ['Customer Country Code', 'Route']
# 'items' is deliberately NOT a feature: the target is defined directly from
# it (items > threshold), so including it leaks the label and makes every
# reported score meaningless.
numerical_features = ['value'] + list(season_encoded_df.columns) + ['Product Code_encoded']

# Binary demand target. (Use df['Product Code'] instead for a multi-class
# product prediction task.)
y = df['demanded']

# The season dummies live in their own frame, so they must be joined to the
# base columns before selecting features; selecting them from `df` directly
# raises KeyError.
base_columns = categorical_features + ['value', 'Product Code_encoded']
X = pd.concat([df[base_columns], season_encoded_df], axis=1)[
    categorical_features + numerical_features
]

# Random 80% / 20% train / test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------------------------------------------------------------
# Pipeline: preprocessing + random forest
# ---------------------------------------------------------------------------
# One-hot encode the categorical columns and mean-impute the numeric ones.
# NOTE(review): mean imputation assumes 'value' is numeric - confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', SimpleImputer(strategy='mean'), numerical_features)
    ],
    remainder='passthrough'
)

# class_weight='balanced' compensates for any imbalance between the classes.
classifier = RandomForestClassifier(random_state=42, class_weight='balanced')

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])

# Step 1: Fit the baseline model (default hyperparameters).
pipeline.fit(X_train, y_train)

# Step 2: Predict on the test set.
y_pred = pipeline.predict(X_test)

# Step 3: Classification report.
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))

# Step 4: Accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy Score: {accuracy * 100:.2f}%")

# Step 5: Confusion matrix.
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Step 6: Hyperparameter tuning (optional, slow: every combination below is
# fitted once per cross-validation fold).
param_grid = {
    'classifier__n_estimators': [100, 200, 300],   # Number of trees in the forest
    'classifier__max_depth': [10, 20, None],       # Maximum depth of the tree
    'classifier__min_samples_split': [2, 5, 10],   # Minimum samples required to split an internal node
    'classifier__min_samples_leaf': [1, 2, 4],     # Minimum samples required at each leaf node
    # 'auto' was removed from RandomForestClassifier in scikit-learn 1.3 and
    # was only an alias of 'sqrt' before that, so it is dropped from the grid.
    'classifier__max_features': ['sqrt', 'log2']   # Number of features considered at each split
}

# Exhaustive grid search with 5-fold cross-validation on the training set.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Parameters from Grid Search:")
print(best_params)

# Pipeline refitted on the full training set with the best parameters.
best_pipeline = grid_search.best_estimator_

# Step 7: Evaluate the tuned model on the test set.
y_pred_best = best_pipeline.predict(X_test)

print("Best Model Classification Report:")
print(classification_report(y_test, y_pred_best))


In [None]:
import matplotlib.pyplot as plt

# Pull the fitted forest out of the tuned pipeline; `best_rf_model` is not
# defined anywhere else, so the cell fails on a fresh kernel without this.
best_rf_model = best_pipeline.named_steps['classifier']
feature_importances = best_rf_model.feature_importances_

# Use the feature names produced by the preprocessor: the categorical columns
# are one-hot expanded inside the pipeline, so X_train.columns does not match
# the length of feature_importances_.
features = best_pipeline.named_steps['preprocessor'].get_feature_names_out()

# Show only the most important features, largest at the top, so the chart
# stays readable when the one-hot encoding produces many columns.
TOP_N_FEATURES = 20
top_idx = np.argsort(feature_importances)[-TOP_N_FEATURES:]

fig, ax = plt.subplots(figsize=(8, max(3, 0.35 * len(top_idx))))
ax.barh(features[top_idx], feature_importances[top_idx])
ax.set_xlabel("Feature Importance")
ax.set_title("Random forest feature importances (tuned model)")
fig.tight_layout()
plt.show()

3.