Reading and preprocessing the data

In [91]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as skl_pre
import sklearn.model_selection as skl_ms
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Import data
# Local fallback location: the CSV sitting next to the notebook.
path = "training_data_vt2025.csv"
# Preferred location when running on Google Colab with Drive mounted.
colab_path = "/content/drive/MyDrive/Colab Notebooks/training_data_vt2025.csv"

try:
    raw_data = pd.read_csv(colab_path, dtype={"ID": str})
except FileNotFoundError:
    # The Colab path is not available here, so read the local copy instead.
    raw_data = pd.read_csv(path, dtype={"ID": str})

# Drop rows with missing values and renumber the index from 0.
data = raw_data.dropna().reset_index(drop=True)
X = data.drop(columns=["increase_stock"])  # Features
Y = data["increase_stock"]  # Output

# Fix the random state so results are repeatable across runs
np.random.seed(0)  # global NumPy RNG
n_fold = 10
cv = skl_ms.KFold(
    n_splits=n_fold, random_state=2, shuffle=True
)  # Shuffled cross-validation with n_fold folds, use by calling cv.split(X)


Preprocessing the data

In [92]:
def random_split(percent_train=0.8):
    """Randomly partition the notebook-level ``X`` and ``Y`` into train and test parts.

    Parameters
    ----------
    percent_train : float, default 0.8
        Fraction of the rows of ``data`` placed in the training part. The row
        count is truncated with ``int``; all remaining rows form the test part.

    Returns
    -------
    tuple
        ``(trainX, trainY, testX, testY)``.
    """
    n_train = int(percent_train * len(data))
    # Sample index labels without replacement (draws from NumPy's global RNG).
    chosen = np.random.choice(data.index, size=n_train, replace=False)
    # Boolean mask over the rows of `data`: True marks a training row.
    in_train = data.index.isin(chosen)
    in_test = ~in_train
    return X.iloc[in_train], Y.iloc[in_train], X.iloc[in_test], Y.iloc[in_test]

Using Linear Discriminant Analysis (LDA) and tuning various parameters (solver and shrinkage): no significant changes were observed across the settings, and the average accuracy remained the same.

In [89]:
# LDA configurations to compare: the default SVD solver and the two
# covariance-based solvers with automatic shrinkage.
lda_params = [
    dict(solver="svd"),
    dict(solver="lsqr", shrinkage="auto"),
    dict(solver="eigen", shrinkage="auto"),
]

# Evaluate every configuration on the same K-fold splits from `cv`.
for params in lda_params:
    # Per-fold accuracies, plus predictions/labels pooled over all folds.
    accuracy_scores, all_predictions, all_true_labels = [], [], []

    print(f"\n Evaluating LDA with {params}")

    for train_idx, test_idx in cv.split(X):
        # Positional row selection for this fold
        X_train, y_train = X.iloc[train_idx], Y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], Y.iloc[test_idx]

        # Fit a fresh model on the training rows, then predict the held-out rows
        lda = LinearDiscriminantAnalysis(**params).fit(X_train, y_train)
        y_pred = lda.predict(X_test)

        # Record the fold accuracy and pool the fold's predictions and labels
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        all_predictions += list(y_pred)
        all_true_labels += list(y_test)

    # Mean of the per-fold accuracies
    avg_accuracy = np.mean(accuracy_scores)
    print(f"LDA Average Accuracy: {avg_accuracy:.4f}")

    # The report and confusion matrix are computed on the pooled predictions
    lda_report = classification_report(all_true_labels, all_predictions)
    print("LDA Classification Report:\n", lda_report)
    lda_confusion = confusion_matrix(all_true_labels, all_predictions)
    print("LDA Confusion Matrix:\n", lda_confusion)



 Evaluating LDA with {'solver': 'svd'}
LDA Average Accuracy: 0.8456
LDA Classification Report:
                   precision    recall  f1-score   support

high_bike_demand       0.61      0.38      0.47       288
 low_bike_demand       0.87      0.95      0.91      1312

        accuracy                           0.85      1600
       macro avg       0.74      0.66      0.69      1600
    weighted avg       0.83      0.85      0.83      1600

LDA Confusion Matrix:
 [[ 110  178]
 [  69 1243]]

 Evaluating LDA with {'solver': 'lsqr', 'shrinkage': 'auto'}
LDA Average Accuracy: 0.8456
LDA Classification Report:
                   precision    recall  f1-score   support

high_bike_demand       0.61      0.40      0.48       288
 low_bike_demand       0.88      0.94      0.91      1312

        accuracy                           0.85      1600
       macro avg       0.74      0.67      0.70      1600
    weighted avg       0.83      0.85      0.83      1600

LDA Confusion Matrix:
 [[ 116  1

Using Quadratic Discriminant Analysis (QDA) and tuning the regularization parameter (`reg_param`): the best result was obtained with `reg_param` = 0.5.

In [90]:


# Regularization strengths to compare for QDA
qda_params = [{"reg_param": value} for value in (0.0, 0.1, 0.5, 1.0)]

# Evaluate every setting on the same K-fold splits from `cv`.
for params in qda_params:
    # Per-fold accuracies, plus predictions/labels pooled over all folds.
    accuracy_scores, all_predictions, all_true_labels = [], [], []
    print(f"\n Evaluating QDA with {params}")

    for train_idx, test_idx in cv.split(X):
        # Positional row selection for this fold
        X_train, y_train = X.iloc[train_idx], Y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], Y.iloc[test_idx]

        # Fit a fresh model on the training rows, then predict the held-out rows
        qda = QuadraticDiscriminantAnalysis(**params).fit(X_train, y_train)
        y_pred = qda.predict(X_test)

        # Record the fold accuracy and pool the fold's predictions and labels
        accuracy_scores.append(accuracy_score(y_test, y_pred))
        all_predictions += list(y_pred)
        all_true_labels += list(y_test)

    # Mean of the per-fold accuracies
    qda_mean_accuracy = np.mean(accuracy_scores)
    print(f" QDA Average_Accuracy:  {qda_mean_accuracy:.4f}")

    # The report and confusion matrix are computed on the pooled predictions
    qda_report = classification_report(all_true_labels, all_predictions)
    print(" QDA Classification _Report: \n", qda_report)
    qda_confusion = confusion_matrix(all_true_labels, all_predictions)
    print(" QDA Confusion-matrix:\n", qda_confusion)



 Evaluating QDA with {'reg_param': 0.0}
 QDA Average_Accuracy:  0.5000
 QDA Classification _Report: 
                   precision    recall  f1-score   support

high_bike_demand       0.21      0.64      0.32       288
 low_bike_demand       0.86      0.47      0.61      1312

        accuracy                           0.50      1600
       macro avg       0.53      0.56      0.46      1600
    weighted avg       0.74      0.50      0.55      1600

 QDA Confusion-matrix:
 [[185 103]
 [697 615]]

 Evaluating QDA with {'reg_param': 0.1}
 QDA Average_Accuracy:  0.8187




 QDA Classification _Report: 
                   precision    recall  f1-score   support

high_bike_demand       0.50      0.84      0.63       288
 low_bike_demand       0.96      0.81      0.88      1312

        accuracy                           0.82      1600
       macro avg       0.73      0.83      0.75      1600
    weighted avg       0.88      0.82      0.83      1600

 QDA Confusion-matrix:
 [[ 243   45]
 [ 245 1067]]

 Evaluating QDA with {'reg_param': 0.5}
 QDA Average_Accuracy:  0.8550
 QDA Classification _Report: 
                   precision    recall  f1-score   support

high_bike_demand       0.58      0.68      0.63       288
 low_bike_demand       0.93      0.89      0.91      1312

        accuracy                           0.85      1600
       macro avg       0.76      0.79      0.77      1600
    weighted avg       0.87      0.85      0.86      1600

 QDA Confusion-matrix:
 [[ 197   91]
 [ 141 1171]]

 Evaluating QDA with {'reg_param': 1.0}
 QDA Average_Accuracy