In [None]:
import pandas as pd

# Read the purchase-order CSV into a DataFrame. `file_path` stays a
# module-level name because later cells reload the same file from it.
file_path = 'PO_Data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (five is the default for head()).
data.head(n=5)

In [None]:
# Basic analysis of the dataset

# Number of missing values in each column
missing_values = data.isnull().sum()

# Data types of the columns
data_types = data.dtypes

# Summary statistics for every column (numeric and non-numeric alike).
# The `datetime_is_numeric` keyword was removed from describe() in pandas 2.0,
# where passing it raises a TypeError. The frame comes straight from
# read_csv() without parse_dates at this point, so it holds no datetime
# columns and the keyword had nothing to act on anyway.
basic_stats = data.describe(include='all')

missing_values, data_types, basic_stats

In [None]:
# Data Preparation for Demand Forecasting

# Convert date fields to datetime (in place on `data`; later cells rely on
# these columns being datetime64).
data['PO_CREATED_AT'] = pd.to_datetime(data['PO_CREATED_AT'])
data['PO_ARRIVED_AT'] = pd.to_datetime(data['PO_ARRIVED_AT'])

# Select relevant features for demand forecasting.
# .copy() makes this an independent frame: without it the column assignment
# below writes to a slice of `data`, which triggers SettingWithCopyWarning and
# leaves it ambiguous whether `data` itself is modified.
forecasting_data = data[['PO_CREATED_AT', 'SKU', 'ORDERED_QTY']].copy()

# Aggregate data to monthly level: one row per (calendar month, SKU) with the
# summed ordered quantity.
forecasting_data['Month'] = forecasting_data['PO_CREATED_AT'].dt.to_period('M')
monthly_demand = (
    forecasting_data
    .groupby(['Month', 'SKU'])
    .agg(Total_Ordered_Qty=('ORDERED_QTY', 'sum'))
    .reset_index()
)

# Checking the aggregated data
monthly_demand.head()

In [None]:
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt

# Selecting the SKU that appears in the most monthly records
top_sku = monthly_demand['SKU'].value_counts().idxmax()

# Extracting the time series for the selected SKU, ordered by month
sku_demand_data = monthly_demand[monthly_demand['SKU'] == top_sku].set_index('Month')
sku_demand_data = sku_demand_data.sort_index()

# `monthly_demand` only has rows for months in which the SKU was ordered.
# Reindex onto a contiguous monthly range so that (a) "last 12 rows" really
# means "last 12 months" and (b) ARIMA sees evenly spaced observations.
# Months without any order are treated as zero demand.
full_months = pd.period_range(
    sku_demand_data.index.min(), sku_demand_data.index.max(), freq='M'
)
sku_demand_data = sku_demand_data.reindex(full_months)
sku_demand_data.index.name = 'Month'
sku_demand_data['SKU'] = top_sku
sku_demand_data['Total_Ordered_Qty'] = sku_demand_data['Total_Ordered_Qty'].fillna(0)

# Plotting the time series for the selected SKU
plt.figure(figsize=(12, 6))
plt.plot(sku_demand_data.index.to_timestamp(), sku_demand_data['Total_Ordered_Qty'], marker='o')
plt.title(f'Monthly Demand for SKU: {top_sku}')
plt.xlabel('Month')
plt.ylabel('Total Ordered Quantity')
plt.grid(True)
plt.show()

# Preparing for ARIMA model
# Hold out the last 12 months for testing and train on everything before.
forecast_horizon = 12
if len(sku_demand_data) <= forecast_horizon:
    raise ValueError(
        f'SKU {top_sku} has only {len(sku_demand_data)} months of history; '
        f'more than {forecast_horizon} are needed for a train/test split.'
    )
train_data = sku_demand_data['Total_Ordered_Qty'][:-forecast_horizon]
test_data = sku_demand_data['Total_Ordered_Qty'][-forecast_horizon:]

# Fit an ARIMA model
# Note: The parameters (p,d,q) are set to (1,1,1) as a starting point. These can be optimized.
model = ARIMA(train_data, order=(1, 1, 1))
fitted_model = model.fit()

# Forecast one value per held-out month
forecast = fitted_model.forecast(steps=forecast_horizon)

# Plotting forecast against actual data
plt.figure(figsize=(12, 6))
plt.plot(train_data.index.to_timestamp(), train_data, label='Train Data', marker='o')
plt.plot(test_data.index.to_timestamp(), test_data, label='Test Data', marker='o')
plt.plot(test_data.index.to_timestamp(), forecast, label='Forecast', marker='o')
plt.title(f'ARIMA Forecast vs Actuals for SKU: {top_sku}')
plt.xlabel('Month')
plt.ylabel('Total Ordered Quantity')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
from statsmodels.tsa.arima.model import ARIMAResults
import itertools
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define the p, d, and q parameters to take any value between 0 and 2
p = d = q = range(0, 3)

# Generate all different combinations of p, d, and q triplets (27 orders)
pdq = list(itertools.product(p, d, q))

# Grid search for the ARIMA order with the lowest MSE on `test_data`.
# NOTE(review): the order is selected on the same hold-out that is scored
# below, so the reported MAE/RMSE are optimistic; a separate validation
# window would be needed for an unbiased estimate.
best_score, best_cfg = float("inf"), None
failed_orders = {}  # order -> error text, so failures are inspectable rather than silent

for param in pdq:
    try:
        model = ARIMA(train_data, order=param)
        results = model.fit()
        forecast = results.forecast(steps=len(test_data))
        mse = mean_squared_error(test_data, forecast)
    except Exception as exc:
        # Some orders legitimately fail to fit; skip them but keep the reason.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        failed_orders[param] = repr(exc)
        continue
    if mse < best_score:
        best_score, best_cfg = mse, param

if best_cfg is None:
    raise RuntimeError(
        f'No ARIMA order could be fitted and scored; failures: {failed_orders}'
    )

# Best parameters (printed, because a bare expression in the middle of a cell
# is not displayed)
print(f'Best ARIMA order: {best_cfg} (test MSE: {best_score})')

# Refitting the model with the best parameters and evaluating
best_model = ARIMA(train_data, order=best_cfg)
fitted_best_model = best_model.fit()

# Forecast with the optimized model
optimized_forecast = fitted_best_model.forecast(steps=len(test_data))

# Evaluation metrics
mae = mean_absolute_error(test_data, optimized_forecast)
rmse = np.sqrt(mean_squared_error(test_data, optimized_forecast))

optimized_forecast, mae, rmse

In [None]:
# Data Preparation for Vendor Performance Analysis

# Convert the shipped date to datetime; unparseable values become NaT
data['PO_SHIPPED_AT'] = pd.to_datetime(data['PO_SHIPPED_AT'], errors='coerce')

# Calculating delivery delay (whole days between shipment and arrival).
# Rows with a missing date on either side yield NaN.
data['Delivery_Delay'] = (data['PO_ARRIVED_AT'] - data['PO_SHIPPED_AT']).dt.days

# Calculating fulfillment accuracy (ratio of received qty to ordered qty).
# A zero ordered quantity would produce +/-inf, and a single inf turns the
# vendor's mean into inf; treat those rows as undefined (NaN) so that the
# mean below skips them.
ordered_qty_nonzero = data['ORDERED_QTY'].replace(0, float('nan'))
data['Fulfillment_Accuracy'] = data['TOTAL_RECEIVED_QTY'] / ordered_qty_nonzero

# Aggregate data by vendor: mean delay, mean fulfillment ratio, and number of
# distinct purchase orders.
vendor_performance = data.groupby('VENDOR_NAME').agg(
    Average_Delay=('Delivery_Delay', 'mean'),
    Average_Fulfillment_Accuracy=('Fulfillment_Accuracy', 'mean'),
    Order_Frequency=('PO_NUMBER', 'nunique')
).reset_index()

# Checking the aggregated vendor performance data
vendor_performance.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Predict a vendor's average delivery delay from its other aggregate metrics.

# Removing records where the target 'Average_Delay' is NaN
vendor_performance_cleaned = vendor_performance.dropna(subset=['Average_Delay'])

# Features are every remaining column except the vendor label and the target
X_clean = vendor_performance_cleaned.drop(['VENDOR_NAME', 'Average_Delay'], axis=1)
y_clean = vendor_performance_cleaned['Average_Delay']

# Split BEFORE imputing so that the medians are learned from the training rows
# only; fitting the imputer on all rows leaks test-set statistics into
# training.
X_train_raw, X_test_raw, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)

# Impute missing feature values with the training-set median
imputer = SimpleImputer(strategy='median')
X_train_clean = imputer.fit_transform(X_train_raw)
X_test_clean = imputer.transform(X_test_raw)

# Full feature matrix imputed with the same training medians (name kept for
# any code that still refers to it)
X_clean_imputed = imputer.transform(X_clean)

# Linear Regression Model
model_clean = LinearRegression()
model_clean.fit(X_train_clean, y_train_clean)

# Predictions and Model Evaluation on the held-out 20%
y_pred_clean = model_clean.predict(X_test_clean)
mse_clean = mean_squared_error(y_test_clean, y_pred_clean)
r2_clean = r2_score(y_test_clean, y_pred_clean)

mse_clean, r2_clean

In [None]:
# PO cycle time = whole days from PO creation to PO close.
# Unparseable close dates are coerced to NaT, which makes the cycle time NaN.
data['PO_CLOSED_AT'] = pd.to_datetime(data['PO_CLOSED_AT'], errors='coerce')
data['PO_Cycle_Time'] = (data['PO_CLOSED_AT'] - data['PO_CREATED_AT']).dt.days

# Modelling frame: three predictors plus the target
cycle_time_data = data[['WAREHOUSE_ID', 'VENDOR_NAME', 'ORDERED_QTY', 'PO_Cycle_Time']]

# Keep only rows with a known target, then one-hot encode the vendor name
# (the single categorical column being encoded here)
cycle_time_data_cleaned = pd.get_dummies(
    cycle_time_data.dropna(subset=['PO_Cycle_Time']),
    columns=['VENDOR_NAME'],
)

# Target vector and feature matrix
y_cycle_time = cycle_time_data_cleaned['PO_Cycle_Time']
X_cycle_time = cycle_time_data_cleaned.drop(columns='PO_Cycle_Time')

# 80/20 train/test split with a fixed seed
X_train_cycle, X_test_cycle, y_train_cycle, y_test_cycle = train_test_split(
    X_cycle_time,
    y_cycle_time,
    test_size=0.2,
    random_state=42,
)

# Peek at the training features
X_train_cycle.head()

In [None]:
# Baseline: ordinary least squares on the cycle-time features.
# fit() returns the estimator itself, so construction and fitting are chained.
model_cycle_time = LinearRegression().fit(X_train_cycle, y_train_cycle)

# Score the model on the held-out split
y_pred_cycle_time = model_cycle_time.predict(X_test_cycle)
mse_cycle_time = mean_squared_error(y_true=y_test_cycle, y_pred=y_pred_cycle_time)
r2_cycle_time = r2_score(y_true=y_test_cycle, y_pred=y_pred_cycle_time)

mse_cycle_time, r2_cycle_time

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters and a fixed seed, trained on the
# same cycle-time split as the linear baseline.
rf_model_cycle_time = RandomForestRegressor(random_state=42).fit(X_train_cycle, y_train_cycle)

# Score the forest on the held-out split
y_pred_rf_cycle_time = rf_model_cycle_time.predict(X_test_cycle)
mse_rf_cycle_time = mean_squared_error(y_true=y_test_cycle, y_pred=y_pred_rf_cycle_time)
r2_rf_cycle_time = r2_score(y_true=y_test_cycle, y_pred=y_pred_rf_cycle_time)

mse_rf_cycle_time, r2_rf_cycle_time

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 3 values for each of 4 parameters = 81 combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 3-fold cross-validation on the training split only.
# n_jobs=-1 uses every available core; verbose=2 logs each fit.
rf_grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
rf_grid_search.fit(X_train_cycle, y_train_cycle)

# Winning combination
rf_best_params = rf_grid_search.best_params_
rf_best_params

In [None]:
# Data Preparation for On-Time Delivery Prediction

# Whole days between PO creation and arrival; NaN when either date is missing
transit_days = (data['PO_ARRIVED_AT'] - data['PO_CREATED_AT']).dt.days

# Creating the target variable (1 if arrived within 5 days, 0 if later).
# Comparing NaN with `<= 5` yields False, which would silently label every PO
# with a missing date as "late" and let it survive the dropna() below. Keep
# the label NaN where the transit time is unknown so those rows are dropped
# instead of being trained on as late deliveries.
data['On_Time_Delivery'] = (transit_days <= 5).astype(float).where(transit_days.notna())

# Selecting relevant features
delivery_data = data[['WAREHOUSE_ID', 'VENDOR_NAME', 'ORDERED_QTY', 'On_Time_Delivery']]

# Handling missing values (including unlabeled rows) and one-hot encoding the
# vendor name
delivery_data_cleaned = delivery_data.dropna()
delivery_data_cleaned = pd.get_dummies(delivery_data_cleaned, columns=['VENDOR_NAME'])

# Defining the feature matrix (X) and the target variable (y).
# After dropna() the label is fully known, so it can be cast back to 0/1 ints.
X_delivery = delivery_data_cleaned.drop('On_Time_Delivery', axis=1)
y_delivery = delivery_data_cleaned['On_Time_Delivery'].astype(int)

# Splitting the dataset into training and testing sets
X_train_delivery, X_test_delivery, y_train_delivery, y_test_delivery = train_test_split(X_delivery, y_delivery, test_size=0.2, random_state=42)

# Checking the balance of the target variable
y_delivery.value_counts()

In [None]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Training the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_delivery, y_train_delivery)

# Hard class predictions on the test set (used for accuracy and the report)
y_pred_delivery = rf_classifier.predict(X_test_delivery)

# Predicted probability of the positive class (label 1). ROC AUC is a ranking
# metric and must be fed continuous scores; computing it from hard 0/1
# predictions collapses the curve to a single operating point.
positive_class_column = list(rf_classifier.classes_).index(1)
y_score_delivery = rf_classifier.predict_proba(X_test_delivery)[:, positive_class_column]

# Evaluation metrics
accuracy = accuracy_score(y_test_delivery, y_pred_delivery)
roc_auc = roc_auc_score(y_test_delivery, y_score_delivery)
classification_rep = classification_report(y_test_delivery, y_pred_delivery)

accuracy, roc_auc, classification_rep

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Reload the CSV with the four date columns parsed at read time. This rebinds
# `data`, so columns that earlier cells added to the previous frame are gone.
data = pd.read_csv(
    file_path,
    parse_dates=['PO_CREATED_AT', 'PO_ARRIVED_AT', 'PO_SHIPPED_AT', 'PO_CLOSED_AT'],
)

# PO cycle time: whole days from creation to close
data['PO_Cycle_Time'] = (data['PO_CLOSED_AT'] - data['PO_CREATED_AT']).dt.days

# Modelling frame: three predictors plus the target
cycle_time_data = data[['WAREHOUSE_ID', 'VENDOR_NAME', 'ORDERED_QTY', 'PO_Cycle_Time']]

# Drop rows without a target, then one-hot encode the vendor name
cycle_time_data_cleaned = pd.get_dummies(
    cycle_time_data.dropna(subset=['PO_Cycle_Time']),
    columns=['VENDOR_NAME'],
)

# Target vector and feature matrix
y_cycle_time = cycle_time_data_cleaned['PO_Cycle_Time']
X_cycle_time = cycle_time_data_cleaned.drop(columns='PO_Cycle_Time')

# 80/20 train/test split with a fixed seed
X_train_cycle, X_test_cycle, y_train_cycle, y_test_cycle = train_test_split(
    X_cycle_time,
    y_cycle_time,
    test_size=0.2,
    random_state=42,
)

# Random forest with default hyperparameters and a fixed seed
rf_model_cycle_time = RandomForestRegressor(random_state=42).fit(X_train_cycle, y_train_cycle)

# Score on the held-out split
y_pred_cycle_time = rf_model_cycle_time.predict(X_test_cycle)
mse_cycle_time = mean_squared_error(y_true=y_test_cycle, y_pred=y_pred_cycle_time)
r2_cycle_time = r2_score(y_true=y_test_cycle, y_pred=y_pred_cycle_time)

mse_cycle_time, r2_cycle_time

In [None]:
# Re-importing the dataset for on-time delivery prediction (rebinds `data`)
data = pd.read_csv(file_path, parse_dates=['PO_CREATED_AT', 'PO_ARRIVED_AT', 'PO_SHIPPED_AT', 'PO_CLOSED_AT'])

# Whole days between PO creation and arrival; NaN when either date is missing
transit_days = (data['PO_ARRIVED_AT'] - data['PO_CREATED_AT']).dt.days

# Creating the target variable for on-time delivery (1 if within 5 days, 0
# otherwise). `NaN <= 5` evaluates to False, so without the mask every PO with
# a missing date would be labelled "late" and kept by dropna(). Leave the
# label NaN for those rows so they are dropped instead.
data['On_Time_Delivery'] = (transit_days <= 5).astype(float).where(transit_days.notna())

# Selecting relevant features and handling missing values
delivery_data = data[['WAREHOUSE_ID', 'VENDOR_NAME', 'ORDERED_QTY', 'On_Time_Delivery']]
delivery_data_cleaned = delivery_data.dropna()
delivery_data_cleaned = pd.get_dummies(delivery_data_cleaned, columns=['VENDOR_NAME'])

# Defining the feature matrix (X) and the target variable (y); the label is
# fully known after dropna(), so cast it back to 0/1 ints.
X_delivery = delivery_data_cleaned.drop('On_Time_Delivery', axis=1)
y_delivery = delivery_data_cleaned['On_Time_Delivery'].astype(int)

# Splitting the dataset into training and testing sets
X_train_delivery, X_test_delivery, y_train_delivery, y_test_delivery = train_test_split(X_delivery, y_delivery, test_size=0.2, random_state=42)

# Training the Random Forest Classifier for on-time delivery prediction
rf_classifier_delivery = RandomForestClassifier(random_state=42)
rf_classifier_delivery.fit(X_train_delivery, y_train_delivery)

# Hard predictions for accuracy / classification report
y_pred_delivery = rf_classifier_delivery.predict(X_test_delivery)

# Positive-class probabilities for ROC AUC, which needs continuous scores
# rather than 0/1 labels to rank the test rows.
positive_class_column = list(rf_classifier_delivery.classes_).index(1)
y_score_delivery = rf_classifier_delivery.predict_proba(X_test_delivery)[:, positive_class_column]

# Model Evaluation
accuracy_delivery = accuracy_score(y_test_delivery, y_pred_delivery)
roc_auc_delivery = roc_auc_score(y_test_delivery, y_score_delivery)
classification_rep_delivery = classification_report(y_test_delivery, y_pred_delivery)

accuracy_delivery, roc_auc_delivery, classification_rep_delivery

In [None]:
# Make sure all four PO date columns are datetime64; values that cannot be
# parsed become NaT instead of raising.
date_columns = ['PO_CREATED_AT', 'PO_ARRIVED_AT', 'PO_SHIPPED_AT', 'PO_CLOSED_AT']
for date_column in date_columns:
    data[date_column] = pd.to_datetime(data[date_column], errors='coerce')

# Confirm the resulting dtypes
data[date_columns].dtypes

In [None]:
# Missing-value treatment

# Columns whose gaps are replaced with the placeholder string 'Unknown'
fill_columns = ['NOTE', 'DESCRIPTION', 'MASTER_ID', 'MPN']
data[fill_columns] = data[fill_columns].fillna(value='Unknown')

# Received quantity: fill gaps with the column median
received_qty_median = data['TOTAL_RECEIVED_QTY'].median()
data['TOTAL_RECEIVED_QTY'] = data['TOTAL_RECEIVED_QTY'].fillna(received_qty_median)

# Share of rows missing a shipped / closed date
missing_shipped_closed = data[['PO_SHIPPED_AT', 'PO_CLOSED_AT']].isna().mean()

# Show that share, plus the remaining null counts in the columns just filled
missing_shipped_closed, data[fill_columns].isna().sum(), data['TOTAL_RECEIVED_QTY'].isna().sum()

In [None]:
import numpy as np

# Creating a feature for on-time arrival: 1 when the PO arrived within 5 days
# of creation, else 0.
# NOTE(review): a missing arrival or creation date makes the comparison False,
# so such rows are labelled 0 (late) - confirm that this is intended.
data['ON_TIME_DELIVERY'] = np.where((data['PO_ARRIVED_AT'] - data['PO_CREATED_AT']).dt.days <= 5, 1, 0)

# Dropping 'PO_SHIPPED_AT' and 'PO_CLOSED_AT' due to high missing values.
# errors='ignore' keeps the cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
data = data.drop(columns=['PO_SHIPPED_AT', 'PO_CLOSED_AT'], errors='ignore')

# Checking the updated dataset. info() prints its report and returns None, so
# it is called on its own instead of being placed in the displayed tuple.
data.info()

# Checking the new feature
data[['PO_CREATED_AT', 'PO_ARRIVED_AT', 'ON_TIME_DELIVERY']].head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Exploratory Data Analysis

# Summary statistics for numerical features
numerical_summary = data.describe()

# Distribution of the target variable (ON_TIME_DELIVERY).
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a positional Series is no longer read as `x`.
plt.figure(figsize=(8, 5))
sns.countplot(x='ON_TIME_DELIVERY', data=data)
plt.title('Distribution of On-Time Delivery')
plt.xlabel('On-Time Delivery')
plt.ylabel('Count')
plt.show()

# Correlation heatmap for numerical features.
# Restrict to numeric columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns inside corr() and raises on string columns instead.
numeric_columns = data.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

numerical_summary

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature groups: two quantity columns are scaled, five identifier-like
# columns are one-hot encoded.
numerical_features = ['ORDERED_QTY', 'TOTAL_RECEIVED_QTY']
categorical_features = ['VENDOR_NAME', 'WAREHOUSE_ID', 'SKU', 'MASTER_ID', 'MPN']

# Model inputs and target
X = data[[*numerical_features, *categorical_features]]
y = data['ON_TIME_DELIVERY']

# Per-group preprocessing. handle_unknown='ignore' lets the encoder transform
# categories at test time that it never saw during fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MaxAbsScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the preprocessing on the training rows only, then apply it to both
# splits (the names X_train / X_test now hold the transformed matrices).
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

X_train.shape, X_test.shape

In [None]:
from sklearn.decomposition import PCA

# Applying PCA for dimensionality reduction
# We choose a number of components that explains a substantial amount of variance
# (n_components=0.95 is a variance fraction, not a component count; the number
# actually kept is read back from pca.n_components_ below).
# NOTE(review): X_train here is the output of the ColumnTransformer with a
# OneHotEncoder from the preprocessing cell. If that output is a scipy sparse
# matrix, PCA may reject it or require densifying - confirm this cell runs on
# a fresh kernel. The next cell switches to TruncatedSVD.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Checking the shape after PCA
X_train_pca.shape, X_test_pca.shape, pca.n_components_

In [None]:
from sklearn.decomposition import TruncatedSVD

# Reducing the number of components in Truncated SVD to address memory issue.
# random_state is fixed (42, like every other estimator in this notebook)
# because the default solver is randomized; without a seed the components, and
# every model trained on them, change from run to run.
svd = TruncatedSVD(n_components=500, random_state=42)

# The reduction is attempted on a best-effort basis: on failure the error text
# is kept in `svd_error` and reported instead of aborting the cell.
# `svd_error` is initialised up front so the name exists on success too.
svd_error = None
try:
    X_train_svd = svd.fit_transform(X_train)
    X_test_svd = svd.transform(X_test)
    svd_success = True
except Exception as e:
    svd_success = False
    svd_error = str(e)

# On success: shapes of the reduced matrices and total explained variance;
# on failure: the captured error message.
svd_success, svd_error if not svd_success else (X_train_svd.shape, X_test_svd.shape, svd.explained_variance_ratio_.sum())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier

# Logistic regression on the SVD-reduced features
log_reg = LogisticRegression().fit(X_train_svd, y_train)
log_reg_pred = log_reg.predict(X_test_svd)
print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, log_reg_pred),
    "Confusion Matrix:",
    confusion_matrix(y_test, log_reg_pred),
    sep="\n",
)

# Gradient-boosted trees on the same SVD-reduced features
xgb = XGBClassifier().fit(X_train_svd, y_train)
xgb_pred = xgb.predict(X_test_svd)
print(
    "XGBoost Classification Report:",
    classification_report(y_test, xgb_pred),
    "Confusion Matrix:",
    confusion_matrix(y_test, xgb_pred),
    sep="\n",
)

# Per-feature importances from XGBoost. The model was trained on
# X_train_svd, so each entry refers to an SVD component, not to an original
# column.
xgb_feature_importance = xgb.feature_importances_

In [None]:
# End-to-end on-time delivery pipeline: label -> preprocess -> SVD -> models

# Convert date columns to datetime format
data['PO_CREATED_AT'] = pd.to_datetime(data['PO_CREATED_AT'])
data['PO_ARRIVED_AT'] = pd.to_datetime(data['PO_ARRIVED_AT'])

# Create a new feature for on-time delivery (1 if arrived within 5 days of
# creation, 0 otherwise).
# NOTE(review): rows with a missing date compare as False and are therefore
# labelled 0 (late) - confirm that this is the intended treatment.
data['ON_TIME_DELIVERY'] = (data['PO_ARRIVED_AT'] - data['PO_CREATED_AT']).dt.days <= 5
data['ON_TIME_DELIVERY'] = data['ON_TIME_DELIVERY'].astype(int)

# Selecting features and target
features = ['VENDOR_NAME', 'WAREHOUSE_ID', 'SKU', 'MASTER_ID', 'MPN', 'ORDERED_QTY', 'TOTAL_RECEIVED_QTY']
X = data[features]
y = data['ON_TIME_DELIVERY']

# Preprocessing: Encoding categorical variables and scaling numerical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['ORDERED_QTY', 'TOTAL_RECEIVED_QTY']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['VENDOR_NAME', 'WAREHOUSE_ID', 'SKU', 'MASTER_ID', 'MPN'])
    ])

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing (fitted on the training rows only)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Dimensionality reduction using Truncated SVD.
# random_state is fixed because the default solver is randomized; an unseeded
# SVD makes every downstream metric vary between runs, unlike the seeded
# split above.
svd = TruncatedSVD(n_components=50, random_state=42)  # Adjust the number of components based on your system
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

# Logistic Regression Model
log_reg = LogisticRegression()
log_reg.fit(X_train_svd, y_train)
log_reg_pred = log_reg.predict(X_test_svd)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, log_reg_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, log_reg_pred))

# XGBoost Model
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train_svd, y_train)
xgb_pred = xgb.predict(X_test_svd)
print("XGBoost Classification Report:")
print(classification_report(y_test, xgb_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, xgb_pred))

# Feature importance from XGBoost (post-SVD feature space: one value per SVD
# component, not per original column)
xgb_feature_importance = xgb.feature_importances_

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, ARDRegression
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
import numpy as np

# Load the dataset


# Data preprocessing steps
# ...

# Apply preprocessing, Truncated SVD, and split the dataset
# ...

# This cell does not build its own inputs: X_train_svd, X_test_svd, y_train
# and y_test must already exist in the kernel from an earlier cell.

# Logistic regression
log_reg = LogisticRegression().fit(X_train_svd, y_train)
log_reg_pred = log_reg.predict(X_test_svd)

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss').fit(X_train_svd, y_train)
xgb_pred = xgb.predict(X_test_svd)

# ARD regression used as a classifier: fit on the 0/1 target, then threshold
# the continuous prediction at 0.5 to get binary labels (a plain list of ints).
ard = ARDRegression().fit(X_train_svd, y_train)
ard_scores = ard.predict(X_test_svd)
ard_pred = (ard_scores > 0.5).astype(int).tolist()

# Evaluation: the same two reports for each model, in the same order
for model_label, predictions in (
    ("Logistic Regression", log_reg_pred),
    ("XGBoost", xgb_pred),
    ("ARD Regression", ard_pred),
):
    print(f"{model_label} Classification Report:")
    print(classification_report(y_test, predictions))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))

# Feature relevance, expressed over the columns of X_train_svd

# XGBoost importances
xgb_feature_importance = xgb.feature_importances_
print("XGBoost Feature Importance:", xgb_feature_importance, sep="\n")

# ARD regression coefficients
ard_coefficients = ard.coef_
print(
    "ARDRegression Coefficients (indicative of feature relevance):",
    ard_coefficients,
    sep="\n",
)

# Note: these values describe the features the models were trained on; mapping
# them back to the original columns is a separate step if needed.