In [8]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

In [9]:
def prepare_data(data, window_size):
    """Cut a frame into overlapping row windows plus next-row targets.

    Parameters
    ----------
    data : object supporting ``len()`` and ``.iloc`` (e.g. a pandas DataFrame)
        Observations ordered along the row axis.
    window_size : int
        Number of consecutive rows in each window.

    Returns
    -------
    features : list
        ``data.iloc[k:k + window_size]`` for every start position ``k`` from 0
        up to and including ``len(data) - window_size``.
    labels
        ``.values`` of every row from position ``window_size`` onward.

    Notes
    -----
    Whenever ``len(data) >= window_size``, ``features`` holds one more entry
    than ``labels``: the final window ends on the last row, so no row follows
    it to act as its target. Callers have to drop that trailing window before
    pairing windows with labels.
    """
    last_start = len(data) - window_size
    windows = [data.iloc[start:start + window_size] for start in range(last_start + 1)]
    targets = data.iloc[window_size:].values
    return windows, targets


In [10]:
# Read the comma-separated traffic file; header=None means the first line is
# treated as data and the columns receive integer labels.
# NOTE(review): the path is absolute and machine-specific — consider pointing it
# at a configurable data directory.
data = pd.read_csv(
    '/home/dotronghiep/Downloads/traffic.txt',
    header=None,
    sep=',',
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,852,853,854,855,856,857,858,859,860,861
0,0.0048,0.0146,0.0289,0.0142,0.0064,0.0232,0.0162,0.0242,0.0341,0.0375,...,0.0051,0.0051,0.0074,0.0079,0.0051,0.0051,0.0339,0.0051,0.01,0.0121
1,0.0072,0.0148,0.035,0.0174,0.0084,0.024,0.0201,0.0338,0.0434,0.0381,...,0.0036,0.0036,0.0107,0.0058,0.0036,0.0036,0.0348,0.0036,0.0087,0.0136
2,0.004,0.0101,0.0267,0.0124,0.0049,0.017,0.0127,0.0255,0.0332,0.0309,...,0.003,0.003,0.0043,0.005,0.003,0.003,0.0327,0.003,0.0061,0.0107
3,0.0039,0.006,0.0218,0.009,0.0029,0.0118,0.0088,0.0163,0.0211,0.0199,...,0.0033,0.0033,0.0019,0.0052,0.0033,0.0033,0.0292,0.0033,0.004,0.0071
4,0.0042,0.0055,0.0191,0.0082,0.0024,0.0095,0.0064,0.0087,0.0144,0.0226,...,0.0049,0.0049,0.0011,0.0071,0.0049,0.0049,0.0264,0.0049,0.004,0.0039


In [11]:
# Display the (rows, columns) dimensions of the frame that was just loaded.
data.shape

(17544, 862)

In [12]:
# Keep only the leading 5000 rows, then show the reduced dimensions.
# NOTE(review): presumably a cap to keep model fitting tractable — confirm.
data = data.iloc[:5000]
data.shape

(5000, 862)

In [13]:
# Build the supervised-learning matrices: every sample is window_size consecutive
# rows flattened into one vector, and its target is the row right after the window.
window_size = 10
features, labels = prepare_data(data, window_size)
features = np.array(features)  # stack the per-window frames into one 3-D array

# Flatten each window so the estimators receive a 2-D (samples, values) matrix.
n_samples, n_timesteps, n_features = features.shape
features = features.reshape((n_samples, n_timesteps * n_features))

# prepare_data returns one trailing window that has no following row to predict;
# keep only the windows that have a label, and fail loudly if the two ever drift
# out of alignment.
features = features[:len(labels)]
assert len(features) == len(labels), "every feature window needs exactly one label"

In [14]:
# Hold out the final 20% of windows for testing. The split is chronological
# (shuffle=False) because consecutive windows overlap in all but one row; a
# shuffled split would place near-copies of the test windows in the training set
# and make the test metrics look better than they are.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, shuffle=False
)

# One 5-fold grid search per model family. The targets are 2-D (one column per
# series), which LinearRegression and RandomForestRegressor handle natively.
# SVR only accepts a 1-D target, so it is wrapped in MultiOutputRegressor (one
# SVR per target column); its grid keys therefore carry the "estimator__" prefix.
models = {
    "Linear Regression": GridSearchCV(
        LinearRegression(),
        cv=5,
        param_grid={"fit_intercept": [True, False]},
    ),
    "Random Forest": GridSearchCV(
        RandomForestRegressor(random_state=42),  # seeded so reruns give the same forest
        cv=5,
        param_grid={"n_estimators": [10, 20], "max_depth": [10, 15]},
    ),
    "SVR": GridSearchCV(
        MultiOutputRegressor(SVR()),
        cv=5,
        param_grid={"estimator__C": [0.1, 1], "estimator__kernel": ['linear', 'rbf']},
    ),
}

In [None]:
# Run every grid search on the training split and report the winning
# hyper-parameter combination for each model.
for name in models:
    model = models[name]
    model.fit(features_train, labels_train)
    print(f"Best parameters for {name}: {model.best_params_}")

In [None]:
# Score each fitted search object on the held-out split and print its
# RMSE, MAE and R2.
for name in models:
    model = models[name]
    predictions = model.predict(features_test)
    r2 = r2_score(labels_test, predictions)
    mae = mean_absolute_error(labels_test, predictions)
    rmse = np.sqrt(mean_squared_error(labels_test, predictions))
    print(f"{name} - RMSE: {rmse}, MAE: {mae}, R2: {r2}")

In [None]:
# Overlay predictions and ground truth for one target column per model.
# labels_test and the predictions are 2-D (one column per target series), so
# handing them to plot() whole would draw every column in the same colour and
# repeat the legend entry once per column.
series_idx = 0  # which target column to display
for name, model in models.items():
    predictions = model.predict(features_test)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(labels_test[:, series_idx], color='blue', label='Actual')
    ax.plot(predictions[:, series_idx], color='red', label='Predicted')
    ax.set_title(f"{name} - Actual vs Predicted")
    ax.set_xlabel("Test sample")
    ax.set_ylabel(f"Target column {series_idx}")
    ax.legend()
    plt.show()