In [16]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

from catboost import CatBoostRegressor

import time


In [14]:

df = pd.read_csv('dane/2023/curr_lct_ul.csv',
                   sep=',',
                   header=0
                  )




# Wybór parametru docelowego (bytes_sec)
target = 'bytes_sec'

# Feature engineering - ekstrakcja cech czasowych
df['dtime'] = pd.to_datetime(df['dtime'])
df['hour'] = df['dtime'].dt.hour
df['day_of_week'] = df['dtime'].dt.dayofweek
df['day_of_month'] = df['dtime'].dt.day

# Usuwanie zbędnych kolumn
df = df.drop(['ddate', 'dtime', 'unit_id', 'packets_received', 'packets_sent', 'successes', 'failures',], axis=1)

# Podział na cechy i zmienną docelową
X = df.drop([target], axis=1)
y = df[target]

# Transformacje danych
numeric_features = ['packet_size', 
                   'bytes_total', 'duration', 
                   'hour', 'day_of_week', 'day_of_month']

categorical_features = ['target', 'address', 'error_code']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


# Model 1: Random Forest
model_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=10,
        max_depth=5,
        random_state=42
    ))
])

# Model 2: XGBoost
model_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(
        n_estimators=9,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    ))
])

# Model 3: CatBoostRegressor
model_cbr = CatBoostRegressor(
    cat_features=['target', 'address', 'error_code'],
    iterations=20,      
    depth=6,
    verbose=0  # Wyłączenie logów
)

# Lista modeli do ewaluacji
models = {
    'Random Forest': model_rf, #0.94
    'XGBoost': model_xgb, #0.76
    'cat' : model_cbr,#0.95
}


<h2>Pojedynczo</h2>

In [15]:
# Podział na zbiór treningowy i testowy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Trenowanie i ewaluacja modeli
results = {}
name = 'Random Forest'
model = models[name]

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

results[name] = {
    'MAE': mean_absolute_error(y_test, y_pred),
    'Mean': y_test.mean(),
    'R2': r2_score(y_test, y_pred)
}

# Prezentacja wyników
for model_name, metrics in results.items():
    print(f"\n{model_name} Metrics:")
    print(f"MAE: {metrics['MAE']:.2f}")
    print(f"Mean: {metrics['Mean']:.2f}")
    print(f"R2 Score: {metrics['R2']:.2f}")


Random Forest Metrics:
MAE: 771445.97
Mean: 6009712.15
R2 Score: 0.94


<h3>wszystkie na raz</h3>

In [23]:
results = []

for i in range(2):
    # Podział na zbiór treningowy i testowy
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    
    for name, model in models.items():
        start = time.time()
        model.fit(X_train, y_train)
        stop = time.time()
        
        y_pred = model.predict(X_test)
        
        results.append({
            'Run': i+1,
            'Model': name,
            'Mean': y_test.mean(),
            'R2': r2_score(y_test, y_pred),
            'Time': stop - start
        })



In [26]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=['Model', 'Run'])
print(results_df)

   Run          Model          Mean        R2       Time
0    1  Random Forest  6.030637e+06  0.935001  18.116797
3    2  Random Forest  5.971373e+06  0.945683  17.766494
1    1        XGBoost  6.030637e+06  0.774336   1.140022
4    2        XGBoost  5.971373e+06  0.785781   1.065281
2    1            cat  6.030637e+06  0.947823   3.005976
5    2            cat  5.971373e+06  0.961607   2.935042


<h1>testy dla przykładowych danych</h1>

In [137]:

sample_data = {
    'unit_id': [90021129],
    'ddate': ['2023-01-30'],
    'dtime': ['2023-01-31 01:37:24'],
    'target': ['sp1-vm-miami-us.samknows.com'],
    'address': ['151.139.187.1'],
    'packets_received': [100],
    'packets_sent': [100],
    'packet_size': [1400],
    'bytes_total': [140000],
    'duration': [63061],
    'error_code': ['NO_ERROR'],
    'successes': [1],
    'failures': [0]
}
sample_df = pd.DataFrame(sample_data)
sample_df['dtime'] = pd.to_datetime(sample_df['dtime'])
sample_df['hour'] = sample_df['dtime'].dt.hour
sample_df['day_of_week'] = sample_df['dtime'].dt.dayofweek
sample_df['day_of_month'] = sample_df['dtime'].dt.day
sample_df = sample_df.drop(['ddate', 'dtime'], axis=1)

# Predykcja
rf_prediction = model.predict(sample_df)

In [141]:
# Wizualizacja wyniku
print("\n=== Predykcja dla przykładowego rekordu ===")
print(f"\nPredykcja : {rf_prediction[0]:.2f}")
print("Powinno wyjść: 2220073")


=== Predykcja dla przykładowego rekordu ===

Predykcja : 3697630.75
Powinno wyjść: 2220073
