LINEAR REGRESSION AND RANDOM FOREST

In [2]:
# FlowCastAI - Traffic Volume Prediction
# --------------------------------------
# Models used: Linear Regression, Random Forest

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load datasets
traffic_df = pd.read_csv('../datasets_cleaned/trafficData.csv')
weather_df = pd.read_csv('../datasets_cleaned/weatherData.csv')

# Convert date and hour columns to usable format
traffic_df['date'] = pd.to_datetime(traffic_df[['year', 'month', 'day']])

# Clean and convert weather dates safely
weather_df['date'] = weather_df['date'].astype(str).str.strip()
weather_df['date'] = pd.to_datetime(weather_df['date'], errors='coerce', dayfirst=True)

In [6]:
print(weather_df.head())
print(weather_df.shape)

   lat_rounded  lon_rounded       date  daily_rain  max_temp  min_temp
0       -32.65       151.85 2016-01-06         4.8      19.0       9.6
1       -32.65       151.85 2016-06-06        24.8      18.4      12.5
2       -32.65       151.85 2016-06-06        24.8      18.4      12.5
3       -33.80       151.00        NaT         0.0      32.6      20.4
4       -32.65       151.85 2015-11-10         0.0      28.2      13.0
(3465422, 6)


In [7]:
print(traffic_df.head())
print(traffic_df.shape)

   station_key  traffic_direction_seq  cardinal_direction_seq  \
0     15934005                      0                       5   
1     15934004                      0                       5   
2     15934005                      0                       5   
3        57052                      0                       7   
4     15934004                      0                       5   

   classification_seq  year  month  day  day_of_week  public_holiday  \
0                   2  2016      6    1            3           False   
1                   2  2016      6    6            1           False   
2                   2  2016      6    6            1           False   
3                   3  2011      2   17            4           False   
4                   2  2015     10   11            7           False   

   school_holiday  ...  hour_15  hour_16  hour_17  hour_18  hour_19  hour_20  \
0           False  ...        0        2        0        0        0        0   
1           Fals

In [5]:
# Merge datasets on the date
df = pd.merge(traffic_df, weather_df, on='date', how='left')

# Drop incomplete records
df.dropna(subset=['traffic_volume', 'daily_rain', 'max_temp', 'min_temp'], inplace=True)

# Select input features and target
feature_cols = ['hour', 'day_of_week', 'public_holiday', 'school_holiday', 'daily_rain', 'max_temp', 'min_temp']
X = df[feature_cols]
y = df['traffic_volume']

# Convert categorical binary fields to dummy variables
X = pd.get_dummies(X, columns=['public_holiday', 'school_holiday'], drop_first=True)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---- Train models ----

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# ---- Evaluation ----

def get_scores(name, y_true, y_pred):
    return {
        'Model': name,
        'R2 Score': round(r2_score(y_true, y_pred), 4),
        'MSE': round(mean_squared_error(y_true, y_pred), 2)
    }

results = [
    get_scores("Linear Regression", y_test, y_pred_lr),
    get_scores("Random Forest", y_test, y_pred_rf)
]

results_df = pd.DataFrame(results)

# Print results
print("Model Evaluation Summary:")
print(results_df)

MemoryError: Unable to allocate 197. GiB for an array with shape (33, 799833965) and data type int64