In [51]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn import tree

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import graphviz

import sklearn.discriminant_analysis as skl_da

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
# Data pre-processing
#
# Builds the modelling frame `df` from the raw training CSV: fills gaps, encodes the
# label, removes redundant columns, standardises the continuous features, encodes the
# periodic features as sin/cos pairs and finally places the target in the last column.

# 1. Load the dataset
df = pd.read_csv('training_data_fall2024.csv')

# 2. Forward-fill missing values (each gap takes the value of the previous row).
#    The result has to be assigned back; calling fillna/ffill without assignment
#    (and without inplace=True) returns a new frame and leaves `df` unchanged.
df = df.ffill()

# 3. Label encoding: map 'low_bike_demand' to 0 and 'high_bike_demand' to 1
df['increase_stock'] = df['increase_stock'].map({'low_bike_demand': 0, 'high_bike_demand': 1})

# 4. Feature selection, guided by the pairwise Pearson correlations
correlation_matrix = df.corr(method='pearson')

# 'snow' carries no information when it takes a single value, so it is dropped in that case.
is_constant = df['snow'].nunique() == 1
if is_constant:
    df = df.drop(columns=['snow'])

# Redundant columns: 'weekday' is correlated with 'day_of_week',
# 'summertime' with 'temp', and 'dew' with 'temp'.
df = df.drop(columns=['weekday', 'summertime', 'dew'])

# 5. Standardise the continuous features (subtract the mean, divide by the standard deviation).
numerical_features = ['temp', 'humidity', 'precip', 'snowdepth', 'windspeed', 'cloudcover', 'visibility']
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row of `df`, including rows that a later
# cell may hold out for evaluation - confirm this is acceptable for the reported scores.
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# 6. Cyclic encoding: map each periodic feature onto the unit circle so that the last
#    value of a cycle ends up next to the first one (e.g. hour 23 next to hour 0).
#    Each entry is (source column, period, prefix of the new columns).
cyclic_features = [
    ('hour_of_day', 24, 'hour'),
    ('day_of_week', 7, 'weekday'),
    ('month', 12, 'month'),
]
for source_column, period, prefix in cyclic_features:
    angle = df[source_column] * (2 * np.pi / period)
    df[f'{prefix}_sin'] = np.sin(angle)
    df[f'{prefix}_cos'] = np.cos(angle)

# The raw periodic columns are no longer needed once encoded.
df = df.drop(columns=[source_column for source_column, _, _ in cyclic_features])

# 7. Put the target in the last column, selecting by name.
#    Reordering by label works regardless of how many columns were dropped above and
#    keeps the column index consistent (no in-place edits of the column labels).
feature_columns = [column for column in df.columns if column != 'increase_stock']
df = df[feature_columns + ['increase_stock']]

In [54]:

# Split into a training part (first N_TRAIN rows) and a held-out test part (the rest).
N_TRAIN = 1280
TARGET_COLUMN = 'increase_stock'

# The split relies on the target being the last column of `df`; fail loudly if the
# layout ever changes instead of silently training on the wrong column.
if df.columns[-1] != TARGET_COLUMN:
    raise ValueError(
        f"Expected '{TARGET_COLUMN}' to be the last column of df, found '{df.columns[-1]}'."
    )

# Everything except the last column is a feature; the last column is the label.
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]

X_train = features.iloc[:N_TRAIN]
X_test = features.iloc[N_TRAIN:]

# Labels as 1-D integer arrays, the form the scikit-learn estimators below are fed with.
y_train = labels.iloc[:N_TRAIN].to_numpy().astype(int)
y_test = labels.iloc[N_TRAIN:].to_numpy().astype(int)

In [None]:
# 1. Linear Discriminant Analysis (LDA)
def implement_lda():
    """Tune Linear Discriminant Analysis and report its held-out performance.

    Runs a 5-fold, accuracy-scored grid search over the LDA solver on the
    module-level X_train / y_train, then evaluates the best estimator on
    X_test / y_test and prints a summary.

    Returns:
        tuple: (best estimator from the grid search, accuracy on the test split).
    """
    search = GridSearchCV(
        skl_da.LinearDiscriminantAnalysis(solver='svd'),
        {'solver': ['svd', 'lsqr']},
        cv=5,
        scoring='accuracy',
    )
    search.fit(X_train, y_train)
    best_lda = search.best_estimator_

    # Label 0 is assigned when the first probability column reaches 0.5, otherwise 1.
    # This presumes the fitted classes_ are ordered [0, 1] (printed below for checking).
    probabilities = best_lda.predict_proba(X_test)
    first_class_wins = probabilities[:, 0] >= 0.5
    prediction = np.where(first_class_wins, 0, 1)

    # Summary of the search and a peek at the first few outputs
    print("\nLDA Results:")
    print('Best parameters:', search.best_params_)
    print('The class order in the model:', best_lda.classes_)
    print('\nFirst five predicted probabilities:')
    with np.printoptions(suppress=True, precision=3):
        print(probabilities[:5])

    print("\nFirst five predictions:")
    print(prediction[:5])

    # Rows are predicted labels, columns are true labels.
    print("\nConfusion matrix:")
    print(pd.crosstab(prediction, y_test))

    hits = prediction == y_test
    accuracy = np.mean(hits)
    print(f"\nAccuracy: {accuracy:.3f}")

    return best_lda, accuracy

# 2. Quadratic Discriminant Analysis (QDA)
def implement_qda():
    """Tune Quadratic Discriminant Analysis and report its held-out performance.

    Grid-searches the covariance regularisation strength (reg_param) with 5-fold
    cross-validation scored by accuracy on the module-level X_train / y_train,
    then evaluates the winning estimator on X_test / y_test and prints a summary.

    Returns:
        tuple: (best estimator from the grid search, accuracy on the test split).
    """
    reg_param_candidates = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
    tuner = GridSearchCV(
        skl_da.QuadraticDiscriminantAnalysis(),
        {'reg_param': reg_param_candidates},
        cv=5,
        scoring='accuracy',
    )
    tuner.fit(X_train, y_train)
    best_qda = tuner.best_estimator_

    # Threshold the first probability column at 0.5: at or above -> label 0, below -> label 1.
    # This presumes the fitted classes_ are ordered [0, 1] (printed below for checking).
    class_probs = best_qda.predict_proba(X_test)
    prediction = np.where(class_probs[:, 0] >= 0.5, 0, 1)

    print("\nQDA Results:")
    print('Best parameters:', tuner.best_params_)
    print('The class order in the model:', best_qda.classes_)
    print('\nFirst five predicted probabilities:')
    with np.printoptions(suppress=True, precision=3):
        print(class_probs[:5])

    print("\nFirst five predictions:")
    print(prediction[:5])

    # Rows are predicted labels, columns are true labels.
    print("\nConfusion matrix:")
    confusion_table = pd.crosstab(prediction, y_test)
    print(confusion_table)

    accuracy = np.mean(prediction == y_test)
    print(f"\nAccuracy: {accuracy:.3f}")

    return best_qda, accuracy

# Fit and evaluate both discriminant-analysis models
best_lda, lda_accuracy = implement_lda()
best_qda, qda_accuracy = implement_qda()

# Side-by-side test accuracies
print("\nModel Comparison:")
print(f"LDA Accuracy: {lda_accuracy:.3f}")
print(f"QDA Accuracy: {qda_accuracy:.3f}")

# Announce the winner; anything that is not a strict win for either side counts as a tie.
verdict = (
    "LDA performed better" if lda_accuracy > qda_accuracy
    else "QDA performed better" if qda_accuracy > lda_accuracy
    else "Both models performed equally"
)
print(verdict)

In [None]:
# Decision tree: tune with a 5-fold grid search, export the fitted tree, then evaluate
# it on the held-out split.
# random_state is fixed so that re-running the cell yields the same tree.
model = tree.DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid
param_grid = {
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'criterion': ['gini', 'entropy']
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
model = grid_search.best_estimator_
print(model)

# Tree visualisation (graphviz needs plain strings for feature and class labels)
feature_names = [str(name) for name in X_train.columns]
class_names = [str(name) for name in model.classes_]
dot_data = tree.export_graphviz(model, out_file=None, feature_names=feature_names, class_names=class_names, filled=True, rounded=True, leaves_parallel=True, proportion=True)
graph = graphviz.Source(dot_data)
# NOTE: Jupyter only renders a bare expression when it is the last statement of a cell,
# so this line shows nothing here; evaluate `graph` in its own cell to see the tree.
graph

# Evaluation on the held-out split
y_predict = model.predict(X_test)

accuracy = accuracy_score(y_test, y_predict)
print('Accuracy rate is %2f' % accuracy)

precision = precision_score(y_test, y_predict)
print('Precision is %2f' % precision)

recall = recall_score(y_test, y_predict)
print('Recall is %2f' % recall)

# Cross-tabulation with predicted labels as rows and true labels as columns
print(pd.crosstab(y_predict, y_test))

# Confusion matrix with true labels as rows and predicted labels as columns
conf_matrix = confusion_matrix(y_test, y_predict)
print("Confusion Matrix:\n", conf_matrix)

# Per-class accuracy: correctly classified samples of a class divided by the number of
# true samples of that class (row sums). Numerically this equals the per-class recall.
per_class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)
for idx, class_accuracy in enumerate(per_class_accuracy):
    print(f"Accuracy for Class {idx}: {class_accuracy:.2f}")

# Per-class precision, recall and F1 via scikit-learn (average=None returns one value
# per class). The loop variables deliberately do not reuse the names `accuracy`,
# `precision`, `recall` or `f1_score`, so the scalar metrics above and the imported
# f1_score function stay intact after this cell has run.
precision_per_class = precision_score(y_test, y_predict, average=None)
for idx, class_precision in enumerate(precision_per_class):
    print(f"Precision for Class {idx}: {class_precision:.2f}")

recall_per_class = recall_score(y_test, y_predict, average=None)
for idx, class_recall in enumerate(recall_per_class):
    print(f"Recall for Class {idx}: {class_recall:.2f}")

f1_per_class = f1_score(y_test, y_predict, average=None)
for idx, class_f1 in enumerate(f1_per_class):
    print(f"f1-score for Class {idx}: {class_f1:.2f}")

# Full classification report
report = classification_report(y_test, y_predict)
print(report)

In [None]:
# k-nearest-neighbours: tune k, the vote weighting and the distance metric with a
# 5-fold grid search, then evaluate the best estimator on the held-out split.
knn = KNeighborsClassifier()

# Search grid.
# 'minkowski' is intentionally not listed: with the classifier's default p=2 it is the
# Euclidean distance, so it would only repeat every 'euclidean' candidate and add
# 100 redundant candidates (x5 folds) to the search without changing the outcome.
param_grids = {
    'n_neighbors': np.arange(1, 51),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

# Grid search scored by accuracy, using all available cores
grid_search = GridSearchCV(estimator=knn, param_grid=param_grids, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best configuration and its mean cross-validated accuracy
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Best estimator (refitted on the whole training split by GridSearchCV)
best_knn = grid_search.best_estimator_

# Held-out evaluation
y_pred = best_knn.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
# Logistic regression with nested cross-validation.
#
# The outer loop estimates how well "logistic regression + hyper-parameter tuning"
# generalises; the inner grid search does the tuning. The model that is finally used
# is tuned and fitted once more on the complete training split.

log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Hyper-parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  # Required for 'l1' penalty
}

# Stratified, shuffled folds for both levels
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

nested_scores = []  # accuracy of each outer fold's tuned model on that fold's validation part
best_models = []    # the tuned model of each outer fold (each fitted on 4/5 of the training split)

for train_idx, val_idx in outer_cv.split(X_train, y_train):
    # Outer train/validation split (X_train is a DataFrame, y_train a NumPy array)
    X_outer_train, X_outer_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_outer_train, y_outer_val = y_train[train_idx], y_train[val_idx]

    # Inner cross-validation: hyper-parameter tuning on the outer training part only
    grid_search = GridSearchCV(
        estimator=log_reg,
        param_grid=param_grid,
        cv=inner_cv,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_outer_train, y_outer_train)
    fold_model = grid_search.best_estimator_
    best_models.append(fold_model)

    # Score the tuned model on data it has not seen during tuning
    val_predictions = fold_model.predict(X_outer_val)
    val_accuracy = accuracy_score(y_outer_val, val_predictions)
    nested_scores.append(val_accuracy)

# Summary of nested cross-validation
print(f"Nested Cross-Validation Scores (on training set): {nested_scores}")
print(f"Mean Nested Cross-Validation Score: {np.mean(nested_scores)}")

# Final model.
# The outer-fold scores are an estimate of generalisation, not a criterion for picking
# one of the fold models: choosing the fold with the highest score would select on that
# estimate and keep a model trained on only 4/5 of the data. Instead the same tuning
# procedure is run once on the whole training split.
final_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=inner_cv,
    scoring='accuracy',
    n_jobs=-1
)
final_search.fit(X_train, y_train)
final_model = final_search.best_estimator_
print("Best parameters (full training split):", final_search.best_params_)

# Alias kept because this name is referenced outside this cell.
logitic_best_model = final_model

# Final evaluation on the held-out test split
test_predictions = final_model.predict(X_test)
print(classification_report(y_test, test_predictions))
print("Test Accuracy:", accuracy_score(y_test, test_predictions))

In [None]:
# Implement the Naive Models  
class AlwaysHighDemandClassifier:
    """Naive baseline that labels every sample as high demand (1)."""

    def predict(self, X):
        """Return a list containing one 1 for each element of X."""
        return [1 for _ in range(len(X))]

class AlwaysLowDemandClassifier:
    """Naive baseline that labels every sample as low demand (0)."""

    def predict(self, X):
        """Return a list containing one 0 for each element of X."""
        return [0 for _ in range(len(X))]


# Score both constant-prediction baselines on the held-out split.
naive_high = AlwaysHighDemandClassifier()
naive_high_predictions = naive_high.predict(X_test)

naive_low = AlwaysLowDemandClassifier()
naive_low_predictions = naive_low.predict(X_test)

# Same report for each baseline: heading, accuracy, then the per-class breakdown.
for heading, baseline_predictions in (
    ("Always Predict High Results:", naive_high_predictions),
    ("Always Predict Low Results:", naive_low_predictions),
):
    print(heading)
    print(f"Accuracy: {accuracy_score(y_test, baseline_predictions)}")
    print(classification_report(y_test, baseline_predictions))

In [None]:
# Model prediction on the unlabeled test file.
#
# The test data must go through exactly the same transformation as the training data:
# same scaling parameters, same encoded columns, same column order.

# Load the dataset
dff = pd.read_csv('test_data_fall2024.csv')

# Forward-fill missing values (each gap takes the value of the previous row).
# The result has to be assigned back, otherwise the frame stays unchanged.
dff = dff.ffill()

# 1. Scale the continuous features with the scaler that was fitted during training
#    (`scaler` and `numerical_features` come from the training pre-processing cell).
#    Fitting a fresh scaler here would standardise the test data with different
#    means/variances than the ones the models were trained on.
dff[numerical_features] = scaler.transform(dff[numerical_features])

# 2. Cyclic encoding: (source column, period, prefix of the new columns)
cyclic_features = [
    ('hour_of_day', 24, 'hour'),
    ('day_of_week', 7, 'weekday'),
    ('month', 12, 'month'),
]
for source_column, period, prefix in cyclic_features:
    angle = dff[source_column] * (2 * np.pi / period)
    dff[f'{prefix}_sin'] = np.sin(angle)
    dff[f'{prefix}_cos'] = np.cos(angle)

# 3. Keep exactly the training features, in the training order, selected BY NAME.
#    This also discards the columns that were removed during training ('weekday',
#    'summertime', 'dew', the raw periodic columns and, if it was dropped, 'snow').
#    Column labels must not be overwritten positionally: that would silently attach
#    the wrong name to a feature whenever the two frames are ordered differently.
missing_columns = [column for column in X_train.columns if column not in dff.columns]
if missing_columns:
    raise KeyError(f"Test data is missing feature columns: {missing_columns}")
dff = dff[X_train.columns]

Xx_test = dff.iloc[:, :]

# Predict with final_model, the estimator the logistic-regression cell reports
# its test metrics for.
predictions = final_model.predict(Xx_test).astype(int)

print("\nFirst five predictions:")
print(predictions[0:5])

# Verify the number of predictions
expected_length = 400
actual_length = len(predictions)
if actual_length != expected_length:
    print(f"Warning: Expected {expected_length} predictions, but got {actual_length}.")

# Convert predictions to a single-row DataFrame
predictions_dff = pd.DataFrame([predictions])

# Save the predictions to 'predictions.csv' as one comma-separated row, no header or index
predictions_dff.to_csv('predictions.csv', header=False, index=False)