In [6]:
import os
import sys
# Make project-local modules importable from this notebook by extending
# sys.path with the directory above the working directory and with
# <two levels up>/src/utils.
parent_dir = os.path.dirname(os.getcwd())
grandparent_dir = os.path.dirname(parent_dir)

# os.path.join drops every component that precedes an absolute one, so the
# former leading `parent_dir` argument never contributed to the result.
# Build the path from grandparent_dir directly.
utils_dir = os.path.join(grandparent_dir, "src", "utils")

# Append only when missing so re-running the cell does not pile up duplicate
# entries; the original append order (parent_dir, then utils_dir) is kept.
for extra_path in (parent_dir, utils_dir):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from weighted_accuracy_and_tools import decompose_y, reconstruct_y, weighted_accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

import pandas as pd

In [7]:
# Both training tables sit in the same enriched-input folder, two levels up
# from the working directory.
enriched_dir = os.path.join("..", "..", "data", "enriched_input")

X_path = os.path.join(enriched_dir, "X_train.csv")
y_path = os.path.join(enriched_dir, "y_train.csv")

# The comma separator is spelled out explicitly, as in the original cell.
X = pd.read_csv(X_path, sep=',')
y = pd.read_csv(y_path, sep=',')

In [8]:
# Index both frames by delivery start time, parsed as timezone-aware UTC.
# The column check keeps the cell re-runnable: once DELIVERY_START has been
# moved into the index, a second unconditional set_index would raise KeyError.
for frame in (X, y):
    if "DELIVERY_START" in frame.columns:
        frame.set_index("DELIVERY_START", inplace=True)
    elif frame.index.name != "DELIVERY_START":
        # Neither a column nor the current index: fail as the original did.
        raise KeyError("DELIVERY_START")
    frame.index = pd.to_datetime(frame.index, utc=True)
X.shape

(10605, 16)

In [9]:
# decompose_y yields two values here; only the first is used afterwards, as
# the classification target passed to train_test_split.
# The second value is bound to a named throwaway instead of `_`: in IPython
# `_` holds the previous cell output, and assigning to it stops that
# automatic update for the rest of the session.
y_direction, _y_unused = decompose_y(y['spot_id_delta'])

In [10]:
# 80/20 hold-out split with a fixed seed so the partition is reproducible.
# NOTE(review): the rows are indexed by DELIVERY_START timestamps and this
# split shuffles them by default; confirm that mixing earlier and later rows
# between train and test is intended.
split = train_test_split(
    X,
    y_direction,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

In [11]:
# Base learners for the ensemble. Every estimator that accepts a
# random_state is seeded with 42; KNN is configured with 5 neighbours.
knn = KNeighborsClassifier(n_neighbors=5)
dtree = DecisionTreeClassifier(random_state=42)
logreg = LogisticRegression(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
gradient_boosting = GradientBoostingClassifier(random_state=42)

# (name, estimator) pairs handed to the voting ensemble.
base_estimators = [
    ('knn', knn),
    ('dtree', dtree),
    ('logreg', logreg),
    ('random_forest', random_forest),
    ('gradient_boosting', gradient_boosting),
]

# Hard voting: the ensemble predicts the majority class label.
extended_voting_clf = VotingClassifier(estimators=base_estimators, voting='hard')

# Fit on the training split exactly as produced by train_test_split.
# NOTE(review): no feature scaling is applied in this cell, although the
# earlier comment here said "scaled data"; confirm whether scaling was intended.
extended_voting_clf.fit(X_train, y_train)

# Score the ensemble on the held-out split.
y_pred_extended = extended_voting_clf.predict(X_test)
accuracy_extended = accuracy_score(y_test, y_pred_extended)

In [12]:
# Display the hold-out accuracy computed for the hard-voting ensemble.
accuracy_extended

0.768033946251768

In [13]:
# Fit each base classifier on its own and record its hold-out accuracy, for
# comparison against the voting ensemble.
#
# The previous version branched on "models that benefit from scaled data",
# but both branches ran the same fit/predict on the same unscaled split, so
# the conditional is collapsed into a single path.
# NOTE(review): if KNN and Logistic Regression were meant to see scaled
# features, that scaling step still has to be added; it is not present here.
models = [knn, dtree, logreg, random_forest, gradient_boosting]
model_names = ['KNN', 'Decision Tree', 'Logistic Regression', 'Random Forest', 'Gradient Boosting']

accuracies = {}
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracies[name] = accuracy_score(y_test, y_pred)

accuracies

{'KNN': 0.7024988213107025,
 'Decision Tree': 0.7392739273927392,
 'Logistic Regression': 0.5464403583215465,
 'Random Forest': 0.8208392267798208,
 'Gradient Boosting': 0.66996699669967}