In [None]:
# Importing necessary modules
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# Location of the semicolon-separated Ruter passenger data.
# If this raises a URLError, download the CSV file manually and point `url`
# at the local copy instead.
url = 'https://raw.githubusercontent.com/atikagondal/Assignment-2-dave3625-202323/main/Ruter-data.csv'

# Fixed origin used to turn calendar dates into a plain day count.
reference_date = pd.to_datetime('01/01/2000', format='%d/%m/%Y')

# Load the file, then keep only rows with a strictly positive passenger
# count (rows with a zero or negative count are dropped).
data = pd.read_csv(url, sep=';', engine='python')
data = data.loc[data['Passasjerer_Ombord'].gt(0)]

# Parse the 'Dato' column as day/month/year dates.
data['Dato'] = pd.to_datetime(data['Dato'], format='%d/%m/%Y')

# Numeric date feature: whole days elapsed since the reference date.
data['Days_Since_Reference'] = data['Dato'].sub(reference_date).dt.days

# One-hot encode 'Linjenavn' into a dense array; drop='first' omits the
# column of the first category.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current
# name first and fall back to the old one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# The encoder expects a 2-D input, hence the single-column reshape.
line_names = data['Linjenavn'].to_numpy().reshape(-1, 1)
encoded_linjenavn = encoder.fit_transform(line_names)

# Feature matrix: the one-hot line columns followed by the day count as the
# last column. Target: number of passengers on board.
days_column = data['Days_Since_Reference'].to_numpy().reshape(-1, 1)
X = np.hstack([encoded_linjenavn, days_column])
y = data['Passasjerer_Ombord']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Fit a linear regression model on the training rows only.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Metrics on the training rows: these show how well the model fits the data
# it has already seen.
y_pred_train = linreg.predict(X_train)
mae = mean_absolute_error(y_train, y_pred_train)
r2 = r2_score(y_train, y_pred_train)

print(f"Mean Absolute Error: {mae}")
print(f"R2 Score: {r2}")

# Metrics on the held-out rows: the model never saw these during fitting, so
# they are the figures to use when judging whether the model is any good.
y_pred_test = linreg.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

print(f"Test Mean Absolute Error: {mae_test}")
print(f"Test R2 Score: {r2_test}")

# Predict the passenger count for one line name on one date.
def predict_passengers(date_str, linjenavn):
    """Predict the number of passengers on board for a line on a given date.

    Builds a single feature row in the same column order used for training
    (one-hot encoded line name first, day count last) and feeds it to the
    module-level ``linreg`` model.

    Args:
        date_str: Date written as ``DD/MM/YYYY``.
        linjenavn: Line name, handed to the module-level ``encoder``.
            NOTE(review): presumably it must be a value the encoder saw
            while fitting -- confirm.

    Returns:
        The model's prediction converted with ``int()``, i.e. with the
        fractional part discarded.
    """
    parsed_date = pd.to_datetime(date_str, format='%d/%m/%Y')
    day_offset = (parsed_date - reference_date).days

    # Both pieces are shaped as one row so they can be joined column-wise.
    line_columns = encoder.transform(np.array([[linjenavn]]))
    day_column = np.array([[day_offset]])
    feature_row = np.hstack([line_columns, day_column])

    estimate = linreg.predict(feature_row)[0]
    return int(estimate)

# Example run: the date is given as 'DD/MM/YYYY' and the second argument is
# a 'Linjenavn' value.
example_prediction = predict_passengers('03/08/2023', '450')
print(example_prediction)