In [3]:
#importing all libraries necessary for machine learning
import pandas as pd
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
import pickle
from datetime import datetime

In [2]:
# i need to load in all 5 datasets which is the one dataset combined

# Attempt to get the script directory; fallback to os.getcwd() if __file__ is not defined
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

# Assuming your notebook is running from "COMP30830-SE-Group11-Dublin-Bike-Sharing-System/app/machine learning",
# we move up two directories to reach the project root.
project_root = os.path.abspath(os.path.join(script_dir, "..", ".."))

# Define the correct path to the 'database' folder inside 'app'
data_folder = os.path.join(project_root, "app", "machine learning")

# Get all CSV file paths that match the pattern in the 'database' folder
file_paths = glob.glob(os.path.join(data_folder, "MachineLearningData_*.csv"))

# Read and concatenate all CSV files into one DataFrame
data = pd.concat([pd.read_csv(file) for file in file_paths], ignore_index=True)

# Handle missing values (drop rows with NaN in lagged features)
data.dropna(inplace=True)

ValueError: No objects to concatenate

In [8]:
# Load the dataset
data = pd.read_csv("bike_weather_data.csv")

In [9]:
# Handle missing values (drop rows with NaN in lagged features)
data.dropna(inplace=True)

In [10]:
# Define features and target
features = ['station_id','temperature', 'humidity', 'pressure', 'hour', 'day']
target = 'num_bikes_available'

X = data[features]
y = data[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")

Mean Absolute Error: 8.143022150821498
R² Score: -4.569177093749488e-05


In [12]:
# Display model coefficients
print("\nModel Coefficients:")
for feature, coef in zip(features, model.coef_):
    print(f"{feature}: {coef}")
print(f"Intercept: {model.intercept_}")


Model Coefficients:
station_id: -0.00023217307731367119
temperature: 0.0030231075501709185
humidity: 0.010717418693589288
pressure: -0.002050169756292047
hour: -0.005113680877276946
day: 0.004264426528796764
Intercept: 13.383509842413048


In [13]:
# Save the model to a file
model_filename = "bike_availability_model.joblib"
joblib.dump(model, model_filename)

print(f"Model saved to {model_filename}")

# Save the model to a .pkl file
model_filename = "bike_availability_model.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(model, file)

print(f"Model saved to {model_filename}")

Model saved to bike_availability_model.joblib
Model saved to bike_availability_model.pkl


# Make a prediction

In [15]:
# Load the saved model
with open("bike_availability_model.pkl", "rb") as file:
    model = pickle.load(file)

# Define new input data for prediction
new_data = pd.DataFrame({
    'station_id': [32],
    'temperature': [20],
    'humidity': [60],
    'pressure': [1002.94],
    'hour': [9],
    'day': [2]  # Example: 0 = Monday, 1 = Tuesday, etc.
})

# Make prediction
prediction = model.predict(new_data)
# Output prediction
print(f"Predicted number of available bikes: {prediction[0]}")

Predicted number of available bikes: 11.985896046344342


# Predice_based_on_weather

In [17]:

# Load the trained model
with open("bike_availability_model.pkl", "rb") as file:
    model = pickle.load(file)

def get_weather_forecast(city, date):
    """Stub function for weather forecast. Returns fixed weather data: REPLACE WITH CALL TO OPENWEATHER API
    """
    return {
        'temperature': 20.0,
        'humidity': 60.0,
        'wind_speed': 10.0,
        'precipitation': 0.0,
        'pressure': 1001.10
    }

def predict_bike_availability(station_id, city, date_str, time_str):
    """Predict the number of available bikes for a given city, date, and time."""
    # Parse input date and time
    date_time = datetime.strptime(f"{date_str} {time_str}", "%Y-%m-%d %H:%M")
    hour = date_time.hour
    day_of_week = date_time.weekday()

    # Use the function for weather forecast
    weather_features = get_weather_forecast(city, date_str)
    
    # Prepare input data for the model
    input_data = pd.DataFrame([{
        'station_id': station_id,
        'temperature': weather_features['temperature'],
        'humidity': weather_features['humidity'],
        'pressure': weather_features['pressure'],
        'hour': hour,
        'day': day_of_week
    }])

    # Make prediction
    prediction = model.predict(input_data)
    return prediction[0]

# Example usage
city = "Dublin"
date_str = "2024-02-25"
time_str = "09:00"
station_id = 50

predicted_bikes = predict_bike_availability(station_id, city, date_str, time_str)
print(f"Predicted number of available bikes in {city} on {date_str} at {time_str}: {predicted_bikes}")


Predicted number of available bikes in Dublin on 2024-02-25 at 09:00: 12.002546949419461
