In [72]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pickle
import os

In [73]:
# Read CSV files into pandas DataFrames
stations_dynamic_df = pd.read_csv("ExportedData/stations_dynamic.csv")
stations_static_df = pd.read_csv("ExportedData/stations_static.csv")
weather_data_df = pd.read_csv("ExportedData/weather_data.csv")

# Display the first few rows of each DataFrame to understand the data structure
print("Stations Dynamic Data:")
print(stations_dynamic_df.head())
print("\nStations Static Data:")
print(stations_static_df.head())
print("\nWeather Data:")
print(weather_data_df.head())

# Perform some simple analysis
# Example: Calculate the average temperature from weather data
average_temperature = weather_data_df["temperature"].mean()
print("\nAverage Temperature:", average_temperature)

# Example: Count the number of bike stations
num_stations = len(stations_static_df)
print("Number of Bike Stations:", num_stations)

# Example: Find the most common main events in weather data
common_main_events = weather_data_df["main_event"].value_counts().head()
print("\nMost Common Main Events:")
print(common_main_events)


Stations Dynamic Data:
   id  number                  name  banking  bonus status  \
0   1      42      SMITHFIELD NORTH        0      0   OPEN   
1   2      30  PARNELL SQUARE NORTH        0      0   OPEN   
2   3      54        CLONMEL STREET        0      0   OPEN   
3   4     108         AVONDALE ROAD        0      0   OPEN   
4   5      20     JAMES STREET EAST        0      0   OPEN   

           last_update           api_update  available_bikes  \
0  2024-02-21 00:59:33  2024-02-21 01:00:25               30   
1  2024-02-21 00:51:51  2024-02-21 01:00:25                0   
2  2024-02-21 00:50:28  2024-02-21 01:00:25                3   
3  2024-02-21 00:51:03  2024-02-21 01:00:25               25   
4  2024-02-21 00:52:14  2024-02-21 01:00:25                0   

   available_bike_stands  bike_stands  
0                      0           30  
1                     20           20  
2                     30           33  
3                     10           35  
4                  

In [74]:
stations_dynamic_df.drop(["banking", "bonus", "status", "api_update"], axis=1, inplace=True)
stations_dynamic_df['last_update'] = pd.to_datetime(stations_dynamic_df['last_update'])
stations_dynamic_df['last_update'] = stations_dynamic_df['last_update'].dt.round('H')

weather_data_df = weather_data_df.dropna(subset=["main_event"]) #drop problematic rows
weather_data_df = weather_data_df[weather_data_df["temperature"] <= 60]
weather_data_df.drop(["description"], axis=1, inplace=True)


weather_data_df['timestamp'] = pd.to_datetime(weather_data_df['timestamp'])
weather_data_df['timestamp'] = weather_data_df['timestamp'].dt.round('H')

  stations_dynamic_df['last_update'] = stations_dynamic_df['last_update'].dt.round('H')
  weather_data_df['timestamp'] = weather_data_df['timestamp'].dt.round('H')


In [75]:
stations_dynamic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1091094 entries, 0 to 1091093
Data columns (total 7 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   id                     1091094 non-null  int64         
 1   number                 1091094 non-null  int64         
 2   name                   1091094 non-null  object        
 3   last_update            1091094 non-null  datetime64[ns]
 4   available_bikes        1091094 non-null  int64         
 5   available_bike_stands  1091094 non-null  int64         
 6   bike_stands            1091094 non-null  int64         
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 58.3+ MB


In [76]:
weather_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9109 entries, 11 to 9119
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             9109 non-null   int64         
 1   timestamp      9109 non-null   datetime64[ns]
 2   temperature    9109 non-null   float64       
 3   main_event     9109 non-null   object        
 4   rain_hour_day  9109 non-null   float64       
 5   feels_like     9109 non-null   float64       
 6   humidity       9109 non-null   int64         
 7   wind_speed     9109 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 640.5+ KB


In [77]:
# One-hot encode main_event and description
weather_data_df = pd.get_dummies(weather_data_df, columns=['main_event'])

# Confirming changes
weather_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9109 entries, 11 to 9119
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   id                  9109 non-null   int64         
 1   timestamp           9109 non-null   datetime64[ns]
 2   temperature         9109 non-null   float64       
 3   rain_hour_day       9109 non-null   float64       
 4   feels_like          9109 non-null   float64       
 5   humidity            9109 non-null   int64         
 6   wind_speed          9109 non-null   float64       
 7   main_event_Clear    9109 non-null   bool          
 8   main_event_Clouds   9109 non-null   bool          
 9   main_event_Drizzle  9109 non-null   bool          
 10  main_event_Mist     9109 non-null   bool          
 11  main_event_Rain     9109 non-null   bool          
 12  main_event_Snow     9109 non-null   bool          
dtypes: bool(6), datetime64[ns](1), float64(4), int64(2)


In [78]:
print("Stations Dynamic Data:")
print(stations_dynamic_df.head(150))
print("\nWeather Data:")
print(weather_data_df.head(150))

Stations Dynamic Data:
      id  number                  name         last_update  available_bikes  \
0      1      42      SMITHFIELD NORTH 2024-02-21 01:00:00               30   
1      2      30  PARNELL SQUARE NORTH 2024-02-21 01:00:00                0   
2      3      54        CLONMEL STREET 2024-02-21 01:00:00                3   
3      4     108         AVONDALE ROAD 2024-02-21 01:00:00               25   
4      5      20     JAMES STREET EAST 2024-02-21 01:00:00                0   
..   ...     ...                   ...                 ...              ...   
145  146     113  MERRION SQUARE SOUTH 2024-02-21 01:00:00                0   
146  147      91       SOUTH DOCK ROAD 2024-02-21 01:00:00               21   
147  148      99             CITY QUAY 2024-02-21 01:00:00               15   
148  149       9      EXCHEQUER STREET 2024-02-21 01:00:00                0   
149  150      67             THE POINT 2024-02-21 01:00:00               10   

     available_bike_stands  

In [79]:
merged_df = pd.merge_asof(stations_dynamic_df.sort_values('last_update'),
                          weather_data_df.sort_values('timestamp'),
                          left_on='last_update',
                          right_on='timestamp',
                          direction='nearest')

In [80]:
merged_df.drop(['timestamp'], axis=1, inplace=True)

In [81]:
print(merged_df.head(500))

     id_x  number                  name         last_update  available_bikes  \
0       1      42      SMITHFIELD NORTH 2024-02-21 01:00:00               30   
1    1030     108         AVONDALE ROAD 2024-02-21 01:00:00               25   
2    1029      54        CLONMEL STREET 2024-02-21 01:00:00                3   
3    1028      30  PARNELL SQUARE NORTH 2024-02-21 01:00:00                0   
4    1027      42      SMITHFIELD NORTH 2024-02-21 01:00:00               30   
..    ...     ...                   ...                 ...              ...   
495  1523      97       KILMAINHAM GAOL 2024-02-21 01:00:00               40   
496  1522       5      CHARLEMONT PLACE 2024-02-21 01:00:00               37   
497  1519     116            BROADSTONE 2024-02-21 01:00:00               11   
498  1516      99             CITY QUAY 2024-02-21 01:00:00               15   
499  1515      91       SOUTH DOCK ROAD 2024-02-21 01:00:00               21   

     available_bike_stands  bike_stands

In [82]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1091094 entries, 0 to 1091093
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   id_x                   1091094 non-null  int64         
 1   number                 1091094 non-null  int64         
 2   name                   1091094 non-null  object        
 3   last_update            1091094 non-null  datetime64[ns]
 4   available_bikes        1091094 non-null  int64         
 5   available_bike_stands  1091094 non-null  int64         
 6   bike_stands            1091094 non-null  int64         
 7   id_y                   1091094 non-null  int64         
 8   temperature            1091094 non-null  float64       
 9   rain_hour_day          1091094 non-null  float64       
 10  feels_like             1091094 non-null  float64       
 11  humidity               1091094 non-null  int64         
 12  wind_speed             10910

In [83]:
def predict_bike_availability(dataframe, number):
    # Filter DataFrame based on the provided station number
    station_df = dataframe[dataframe['number'] == number]
    
    # Features and target variable
    X = station_df.drop(columns=['available_bikes', 'last_update', 'id_x', 'id_y', 'name', 'number'])
    y = station_df['available_bikes']
    
    # Splitting the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=42)
    
    # Initialize and fit the model
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predicting on the test set
    y_pred = model.predict(X_test)
    
    # Model evaluation
    # mse = mean_squared_error(y_test, y_pred)
    # rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    # print("Mean Squared Error:", mse)
    # print("Root Mean Squared Error:", rmse)
    print("R^2 Score:", r2)
    
    # Returning the trained model, coefficients, and intercept
    return model, model.coef_, model.intercept_


In [84]:
# List of all station numbers 
station_numbers = merged_df['number'].unique()

# Loop through each station number
i = 0
while i < len(station_numbers):
    # Get the current station number
    number = station_numbers[i]
    
    # Filter merged_df to get the row for the current station number
    station_row = merged_df[merged_df['number'] == number].iloc[0]  # Assuming each number corresponds to a unique station
    
    # Get the name of the station
    name = station_row['name']
    
    # Call the predict_bike_availability function for the current station number
    print(f"Station Number: {number}, Station Name: {name}")
    trained_model, coefficients, intercept = predict_bike_availability(merged_df, number)
    
    # Increment the index to move to the next station number
    i += 1

Station Number: 42, Station Name: SMITHFIELD NORTH
R^2 Score: 0.9987622139429126
Station Number: 108, Station Name: AVONDALE ROAD
R^2 Score: 0.9996992008433023
Station Number: 54, Station Name: CLONMEL STREET
R^2 Score: 0.9992315973013252
Station Number: 30, Station Name: PARNELL SQUARE NORTH
R^2 Score: 0.4088721653663133
Station Number: 88, Station Name: BLACKHALL PLACE
R^2 Score: 0.9993802664230343
Station Number: 21, Station Name: LEINSTER STREET SOUTH
R^2 Score: 0.9994950418451146
Station Number: 92, Station Name: HEUSTON BRIDGE (NORTH)
R^2 Score: 0.9998716963714894
Station Number: 83, Station Name: EMMET ROAD
R^2 Score: 0.9989150567824596
Station Number: 39, Station Name: WILTON TERRACE
R^2 Score: 0.9980540938154983
Station Number: 20, Station Name: JAMES STREET EAST
R^2 Score: 0.9996744989705384
Station Number: 28, Station Name: MOUNTJOY SQUARE WEST
R^2 Score: 0.9996474327583483
Station Number: 29, Station Name: ORMOND QUAY UPPER
R^2 Score: 0.9986818242855903
Station Number: 40, 

In [85]:
folder_name = "pickles"

def save_model(model, filename):
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

# Iterate over each station number
for number in merged_df['number'].unique():
    # Filter the DataFrame for the current station number
    station_df = merged_df[merged_df['number'] == number]
    
    # Train the model for the current station
    trained_model, _, _ = predict_bike_availability(station_df, number)
    
    # Define the filename including the folder path
    filename = os.path.join(folder_name, f'linear_regression_model_station_{number}.pkl')
    
    # Save the trained model object as a pickle file
    save_model(trained_model, filename)

R^2 Score: 0.9987622139429126
R^2 Score: 0.9996992008433023
R^2 Score: 0.9992315973013252
R^2 Score: 0.4088721653663133
R^2 Score: 0.9993802664230343
R^2 Score: 0.9994950418451146
R^2 Score: 0.9998716963714894
R^2 Score: 0.9989150567824596
R^2 Score: 0.9980540938154983
R^2 Score: 0.9996744989705384
R^2 Score: 0.9996474327583483
R^2 Score: 0.9986818242855903
R^2 Score: 0.9995873906017734
R^2 Score: 0.9991003356060395
R^2 Score: 0.997643297325462
R^2 Score: 0.9998037785446872
R^2 Score: 0.9978154255964078
R^2 Score: 0.9997170590029028
R^2 Score: 0.999933012671789
R^2 Score: 0.9983214362564337
R^2 Score: 0.9989091963144962
R^2 Score: 0.9998471886051027
R^2 Score: 0.999437422104936
R^2 Score: 0.9986366735237407
R^2 Score: 0.9996028207680561
R^2 Score: 0.9976756647906116
R^2 Score: 0.9993695256971753
R^2 Score: 0.9989381210217585
R^2 Score: 0.9992118975543537
R^2 Score: 0.5059976651393172
R^2 Score: 0.9991602821755856
R^2 Score: 0.9981555543234379
R^2 Score: 0.9997993474790101
R^2 Score: 0.