In [173]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBClassifier
import keras_tuner as kt
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler

In [174]:
# Load Data
df = pd.read_csv('web_traffic.csv') 
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [175]:
# Create new timestamp
df['Hour'] = df['Timestamp'].dt.hour
df['DayOfWeek'] = df['Timestamp'].dt.dayofweek
df['Month'] = df['Timestamp'].dt.month

In [176]:
# Select features (independent variables)
X = df[['Hour', 'DayOfWeek', 'Month']]

# Set the target variable
# We'll convert 'TrafficCount' to a binary classification: 1 = high traffic, 0 = low traffic
y = (df['TrafficCount'] > df['TrafficCount'].median()).astype(int)

In [177]:
# Keeping the split sequencial
df = df.sort_index()
split_index = int(len(df) * 0.7)
train = df.iloc[:split_index]
test = df.iloc[split_index:]

In [178]:
# Split data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [179]:
# Making sure the split remained sequential 
print("Train range:", train.index.min(), "to", train.index.max())
print("Test range:", test.index.min(), "to", test.index.max())

Train range: 0 to 1954
Test range: 1955 to 2792


In [180]:
train

Unnamed: 0,Timestamp,TrafficCount,Hour,DayOfWeek,Month
0,2020-01-20 00:00:00,487.0,0,0,1
1,2020-01-20 00:30:00,385.0,0,0,1
2,2020-01-20 01:30:00,269.0,1,0,1
3,2020-01-20 02:30:00,517.0,2,0,1
4,2020-01-20 03:30:00,129.0,3,0,1
...,...,...,...,...,...
1950,2020-04-12 15:30:00,1573.0,15,6,4
1951,2020-04-12 16:30:00,2079.0,16,6,4
1952,2020-04-12 17:30:00,2152.0,17,6,4
1953,2020-04-12 18:30:00,3790.0,18,6,4


In [181]:
# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

In [182]:
print(X_train)
print(y_train)

      Hour  DayOfWeek  Month
1942     7          6      4
1200    10          1      3
2465    23          6      5
911     11          3      2
2228     3          4      4
...    ...        ...    ...
1638    16          0      3
1095     2          4      3
1130    13          5      3
1294     8          5      3
860      8          1      2

[1955 rows x 3 columns]
1942    0
1200    1
2465    0
911     1
2228    0
       ..
1638    1
1095    0
1130    0
1294    0
860     1
Name: TrafficCount, Length: 1955, dtype: int64


In [183]:
print(test)

               Timestamp  TrafficCount  Hour  DayOfWeek  Month
1955 2020-04-12 20:30:00        2878.0    20          6      4
1956 2020-04-12 21:30:00        1663.0    21          6      4
1957 2020-04-12 22:30:00        1633.0    22          6      4
1958 2020-04-12 23:30:00        1651.0    23          6      4
1959 2020-04-13 00:00:00        1651.0     0          0      4
...                  ...           ...   ...        ...    ...
2788 2020-05-17 08:30:00         521.0     8          6      5
2789 2020-05-17 09:30:00        1003.0     9          6      5
2790 2020-05-17 10:30:00        1442.0    10          6      5
2791 2020-05-17 11:30:00        1334.0    11          6      5
2792 2020-05-17 12:30:00         632.0    12          6      5

[838 rows x 5 columns]


In [184]:
# Predict using the test set
y_pred = model.predict(X_test)

# Display model performance metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.80      0.80       430
           1       0.79      0.77      0.78       408

    accuracy                           0.79       838
   macro avg       0.79      0.79      0.79       838
weighted avg       0.79      0.79      0.79       838



**Arima**


**LSTM**

In [187]:
traffic = df[['TrafficCount']].values

SEQ_LEN = 10  #verify if 10 works
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X), np.array(y)

X_lstm, y_lstm = create_sequences(traffic_scaled, SEQ_LEN)

print(X_train_lstm.shape)  
print(y_train_lstm.shape)  

(1948, 10, 1)
(1948, 1)


In [206]:
# Using best found parameters
best_units = 64
best_dropout = 0.2
best_learning_rate = 0.001

# Build the model
model = Sequential()
model.add(Input(shape=(10, 1)))
model.add(LSTM(units=best_units))
model.add(Dropout(rate=best_dropout))
model.add(Dense(1))

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=best_learning_rate),
    loss='mse'
)

# Fit the model
history = model.fit(
    X_train_lstm, y_train_lstm,
    epochs=20,
    validation_split=0.2,
    verbose=1
)

# Evaluate or predict as needed
y_pred = model.predict(X_test_lstm)

Epoch 1/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0220 - val_loss: 0.0064
Epoch 2/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0122 - val_loss: 0.0050
Epoch 3/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0095 - val_loss: 0.0041
Epoch 4/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0083 - val_loss: 0.0038
Epoch 5/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0087 - val_loss: 0.0034
Epoch 6/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0076 - val_loss: 0.0032
Epoch 7/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0084 - val_loss: 0.0030
Epoch 8/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0064 - val_loss: 0.0031
Epoch 9/20
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [208]:
# Assume y_test_lstm contains your true values and y_pred your model's predictions
mse_lstm = mean_squared_error(y_test_lstm, y_pred)
rmse_lstm = np.sqrt(mse_lstm)
print("RMSE:", rmse_lstm)

RMSE: 0.04133585585641599
