In [219]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [220]:
train_df = pd.read_csv("training.csv", sep="\t")
valid_df = pd.read_csv("validation.csv", sep="\t")

In [221]:
train_df.head()

Unnamed: 0,DepartureDate,DepartureYear,DepartureMonth,DepartureDay,FlightNumber,DepartureAirport,ArrivalAirport,Route,ActualFlightTime,ActualTotalFuel,ActualTOW,FLownPassengers,BagsCount,FlightBagsWeight
0,01/10/2016,2016,10,1,1145,MAN,SXF,MAN-SXF,91,3660,64016,175,61,440
1,01/10/2016,2016,10,1,1160,CTA,FCO,CTA-FCO,68,3280,66138,182,27,350
2,01/10/2016,2016,10,1,1183,LGW,SNN,LGW-SNN,64,2720,58447,127,(null),(null)
3,01/10/2016,2016,10,1,1220,ATH,CHQ,ATH-CHQ,35,1682,60587,163,12,150
4,01/10/2016,2016,10,1,1225,CHQ,ATH,CHQ-ATH,34,1877,63090,178,37,490


In [222]:
valid_df.head()

Unnamed: 0,DepartureDate,DepartureYear,DepartureMonth,DepartureDay,FlightNumber,DepartureAirport,ArrivalAirport,Route,ActualFlightTime,ActualTotalFuel,FLownPassengers,BagsCount,FlightBagsWeight
0,01/11/2016,2016,11,1,6003,MRS,BES,MRS-BES,84,3630,185,43,560
1,01/11/2016,2016,11,1,6004,BES,MRS,BES-MRS,86,3690,188,(null),(null)
2,01/11/2016,2016,11,1,6542,MRS,STN,MRS-STN,100,4390,168,55,720
3,01/11/2016,2016,11,1,6541,STN,MRS,STN-MRS,100,4690,185,35,460
4,01/11/2016,2016,11,1,5542,STN,REU,STN-REU,112,4900,167,31,400


In [223]:
print(f"Training data: {len(train_df)}")
print(f"Validation data: {len(valid_df)}")

Training data: 29731
Validation data: 1878


In [224]:
print(train_df.columns)

Index(['DepartureDate', 'DepartureYear', 'DepartureMonth', 'DepartureDay',
       'FlightNumber', 'DepartureAirport', 'ArrivalAirport', 'Route',
       'ActualFlightTime', 'ActualTotalFuel', 'ActualTOW', 'FLownPassengers',
       'BagsCount', 'FlightBagsWeight'],
      dtype='object')


In [225]:
train_df.rename(columns={'FLownPassengers': 'FlownPassengers'}, inplace=True)

In [226]:
print(valid_df.columns)

Index(['DepartureDate', 'DepartureYear', 'DepartureMonth', 'DepartureDay',
       'FlightNumber', 'DepartureAirport', 'ArrivalAirport', 'Route',
       'ActualFlightTime', 'ActualTotalFuel', 'FLownPassengers', 'BagsCount',
       'FlightBagsWeight'],
      dtype='object')


In [227]:
valid_df.rename(columns={'FLownPassengers': 'FlownPassengers'}, inplace=True)

In [228]:
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29731 entries, 0 to 29730
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   DepartureDate     29731 non-null  object
 1   DepartureYear     29731 non-null  int64 
 2   DepartureMonth    29731 non-null  int64 
 3   DepartureDay      29731 non-null  int64 
 4   FlightNumber      29731 non-null  int64 
 5   DepartureAirport  29731 non-null  object
 6   ArrivalAirport    29731 non-null  object
 7   Route             29731 non-null  object
 8   ActualFlightTime  29731 non-null  int64 
 9   ActualTotalFuel   29731 non-null  int64 
 10  ActualTOW         29731 non-null  object
 11  FlownPassengers   29731 non-null  object
 12  BagsCount         29731 non-null  object
 13  FlightBagsWeight  29731 non-null  object
dtypes: int64(6), object(8)
memory usage: 3.2+ MB
None


In [229]:
numeric_columns = ['ActualFlightTime', 'ActualTotalFuel', 'ActualTOW',
                   'FlownPassengers', 'BagsCount', 'FlightBagsWeight']

for col in numeric_columns:
    if col in train_df.columns:
        train_df[col] = pd.to_numeric(train_df[col], errors='coerce')

In [230]:
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29731 entries, 0 to 29730
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   DepartureDate     29731 non-null  object 
 1   DepartureYear     29731 non-null  int64  
 2   DepartureMonth    29731 non-null  int64  
 3   DepartureDay      29731 non-null  int64  
 4   FlightNumber      29731 non-null  int64  
 5   DepartureAirport  29731 non-null  object 
 6   ArrivalAirport    29731 non-null  object 
 7   Route             29731 non-null  object 
 8   ActualFlightTime  29731 non-null  int64  
 9   ActualTotalFuel   29731 non-null  int64  
 10  ActualTOW         29298 non-null  float64
 11  FlownPassengers   29636 non-null  float64
 12  BagsCount         27447 non-null  float64
 13  FlightBagsWeight  27253 non-null  float64
dtypes: float64(4), int64(6), object(4)
memory usage: 3.2+ MB
None


In [231]:
# Konwersja daty
train_df["DepartureDate"] = pd.to_datetime(train_df["DepartureDate"], format="%d/%m/%Y")
train_df["DayOfWeek"] = train_df["DepartureDate"].dt.weekday

# Inżynieria cech
train_df["FuelPerMinute"] = train_df["ActualTotalFuel"] / train_df["ActualFlightTime"]
train_df["PassengersPerBag"] = train_df["FlownPassengers"] / train_df["BagsCount"]

In [232]:
n_missing_rows = train_df.isnull().any(axis=1).sum()
n_complete_rows = train_df.shape[0] - n_missing_rows

print(f"Missing rows: {n_missing_rows}")
print(f"Full rows: {n_complete_rows}")

Missing rows: 2931
Full rows: 26800


In [233]:
print(train_df.isna().sum())

DepartureDate          0
DepartureYear          0
DepartureMonth         0
DepartureDay           0
FlightNumber           0
DepartureAirport       0
ArrivalAirport         0
Route                  0
ActualFlightTime       0
ActualTotalFuel        0
ActualTOW            433
FlownPassengers       95
BagsCount           2284
FlightBagsWeight    2478
DayOfWeek              0
FuelPerMinute          0
PassengersPerBag    2284
dtype: int64


In [234]:
train_df = train_df.dropna()

In [235]:
print(train_df.isna().sum())

DepartureDate       0
DepartureYear       0
DepartureMonth      0
DepartureDay        0
FlightNumber        0
DepartureAirport    0
ArrivalAirport      0
Route               0
ActualFlightTime    0
ActualTotalFuel     0
ActualTOW           0
FlownPassengers     0
BagsCount           0
FlightBagsWeight    0
DayOfWeek           0
FuelPerMinute       0
PassengersPerBag    0
dtype: int64


In [236]:
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 26800 entries, 0 to 29730
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   DepartureDate     26800 non-null  datetime64[ns]
 1   DepartureYear     26800 non-null  int64         
 2   DepartureMonth    26800 non-null  int64         
 3   DepartureDay      26800 non-null  int64         
 4   FlightNumber      26800 non-null  int64         
 5   DepartureAirport  26800 non-null  object        
 6   ArrivalAirport    26800 non-null  object        
 7   Route             26800 non-null  object        
 8   ActualFlightTime  26800 non-null  int64         
 9   ActualTotalFuel   26800 non-null  int64         
 10  ActualTOW         26800 non-null  float64       
 11  FlownPassengers   26800 non-null  float64       
 12  BagsCount         26800 non-null  float64       
 13  FlightBagsWeight  26800 non-null  float64       
 14  DayOfWeek         26800 non

In [237]:
train_df.drop(columns=["DepartureDate"], inplace=True)

In [238]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

# Wszystkie dostępne cechy (bez targetu)
all_features = [
    'DepartureYear', 'DepartureMonth', 'DepartureDay',
    'FlightNumber', 'DepartureAirport', 'ArrivalAirport', 'Route',
    'ActualFlightTime', 'ActualTotalFuel', 'FlownPassengers',
    'BagsCount', 'FlightBagsWeight', 'DayOfWeek', 'FuelPerMinute',
    'PassengersPerBag'
]

X = train_df[all_features].copy()
y = train_df['ActualTOW']

# Konwersja kolumn typu object na category
for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].astype('category')

def objective(trial):
    # Wybór cech do użycia w danej próbie
    selected_features = [f for f in all_features if trial.suggest_categorical(f'use_{f}', [True, False])]
    if len(selected_features) == 0:
        return float('inf')  # nie trenuj bez żadnych cech

    X_selected = X[selected_features]

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'tree_method': 'hist',
        'enable_categorical': True,
        'random_state': 42,
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    y_bins = pd.qcut(y, q=10, duplicates='drop', labels=False)
    mse_scores = []

    for train_idx, val_idx in skf.split(X_selected, y_bins):
        X_train, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        y_pred = model.predict(X_val)
        mse_scores.append(mean_squared_error(y_val, y_pred))

    return np.mean(mse_scores)

# Optymalizacja
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print("Best parameters:")
print(study.best_params)
print(f"Best MSE: {study.best_value:.2f}")

# Wybrane cechy
best_features = [f for f in all_features if study.best_params.get(f'use_' + f, False)]
print("Features:")
print(best_features)

[I 2025-05-28 23:40:05,175] A new study created in memory with name: no-name-23441133-ddbf-414c-8b9a-c8b0f3a1d221
[I 2025-05-28 23:40:13,631] Trial 0 finished with value: 866821.6000431341 and parameters: {'use_DepartureYear': False, 'use_DepartureMonth': True, 'use_DepartureDay': True, 'use_FlightNumber': False, 'use_DepartureAirport': True, 'use_ArrivalAirport': False, 'use_Route': False, 'use_ActualFlightTime': False, 'use_ActualTotalFuel': True, 'use_FlownPassengers': True, 'use_BagsCount': True, 'use_FlightBagsWeight': True, 'use_DayOfWeek': True, 'use_FuelPerMinute': True, 'use_PassengersPerBag': True, 'n_estimators': 108, 'max_depth': 10, 'learning_rate': 0.1944708296363903, 'subsample': 0.8844808174428412, 'colsample_bytree': 0.9283070067843833, 'gamma': 3.4846593425534342, 'reg_alpha': 3.6792253641522654, 'reg_lambda': 2.385180850211572}. Best is trial 0 with value: 866821.6000431341.
[I 2025-05-28 23:40:17,997] Trial 1 finished with value: 2013166.5517884206 and parameters: {

Najlepsze parametry:
{'use_DepartureYear': True, 'use_DepartureMonth': False, 'use_DepartureDay': False, 'use_FlightNumber': True, 'use_DepartureAirport': True, 'use_ArrivalAirport': False, 'use_Route': True, 'use_ActualFlightTime': True, 'use_ActualTotalFuel': False, 'use_FlownPassengers': True, 'use_BagsCount': False, 'use_FlightBagsWeight': True, 'use_DayOfWeek': False, 'use_FuelPerMinute': True, 'use_PassengersPerBag': True, 'n_estimators': 268, 'max_depth': 4, 'learning_rate': 0.11415274816071591, 'subsample': 0.6213755475324894, 'colsample_bytree': 0.6007337769941724, 'gamma': 3.814335337714401, 'reg_alpha': 2.1140963044701433, 'reg_lambda': 3.711107728940596}
Najniższe MSE: 513417.10
Wybrane cechy:
['DepartureYear', 'FlightNumber', 'DepartureAirport', 'Route', 'ActualFlightTime', 'FlownPassengers', 'FlightBagsWeight', 'FuelPerMinute', 'PassengersPerBag']


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

features = list(train_df.columns.difference(['ActualTOW']))


X = train_df[features].copy()
y = train_df['ActualTOW']

for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].astype('category')

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'tree_method': 'hist',
        'enable_categorical': True,
        'random_state': 42,
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = []

    y_bins = pd.qcut(y, q=10, duplicates='drop', labels=False)

    for train_idx, val_idx in skf.split(X, y_bins):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )


        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mse_scores.append(mse)

    return np.mean(mse_scores)


# -------------------------
# Uruchomienie Optuny
# -------------------------
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

print("Najlepsze parametry:")
print(study.best_params)
print(f"Najniższe MSE: {study.best_value:.2f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

KeyboardInterrupt: 

In [None]:
#-------------------------


ull_df = pd.concat([training_df, validation_df], ignore_index=True)

len(full_df)

31609

In [119]:
print(full_df.columns)

Index(['DepartureDate', 'DepartureYear', 'DepartureMonth', 'DepartureDay',
       'FlightNumber', 'DepartureAirport', 'ArrivalAirport', 'Route',
       'ActualFlightTime', 'ActualTotalFuel', 'ActualTOW', 'FLownPassengers',
       'BagsCount', 'FlightBagsWeight'],
      dtype='object')


In [120]:
full_df.rename(columns={'FLownPassengers': 'FlownPassengers'}, inplace=True)

In [121]:
print(full_df.columns)

Index(['DepartureDate', 'DepartureYear', 'DepartureMonth', 'DepartureDay',
       'FlightNumber', 'DepartureAirport', 'ArrivalAirport', 'Route',
       'ActualFlightTime', 'ActualTotalFuel', 'ActualTOW', 'FlownPassengers',
       'BagsCount', 'FlightBagsWeight'],
      dtype='object')


In [115]:
n_missing_rows = full_df.isnull().any(axis=1).sum()

n_complete_rows = full_df.shape[0] - n_missing_rows

print(f"Wiersze z brakami: {n_missing_rows}")
print(f"Wiersze kompletne: {n_complete_rows}")

Wiersze z brakami: 1878
Wiersze kompletne: 29731


In [138]:
full_df = full_df.dropna()

In [134]:
print(full_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 29731 entries, 0 to 29730
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   DepartureYear     29731 non-null  int64  
 1   DepartureMonth    29731 non-null  int64  
 2   DepartureDay      29731 non-null  int64  
 3   DepartureAirport  29731 non-null  object 
 4   ArrivalAirport    29731 non-null  object 
 5   Route             29731 non-null  object 
 6   ActualFlightTime  29731 non-null  int64  
 7   ActualTotalFuel   29731 non-null  int64  
 8   ActualTOW         29298 non-null  float64
 9   FlownPassengers   29636 non-null  float64
 10  BagsCount         27447 non-null  float64
 11  FlightBagsWeight  27253 non-null  float64
 12  DayOfWeek         29731 non-null  int32  
 13  FuelPerMinute     29731 non-null  float64
 14  PassengersPerBag  27447 non-null  float64
dtypes: float64(6), int32(1), int64(5), object(3)
memory usage: 3.5+ MB
None


In [130]:
numeric_columns = ['ActualFlightTime', 'ActualTotalFuel', 'ActualTOW',
                   'FlownPassengers', 'BagsCount', 'FlightBagsWeight']

for col in numeric_columns:
    if col in full_df.columns:
        full_df[col] = pd.to_numeric(full_df[col], errors='coerce')

In [133]:
# Konwersja daty
full_df["DepartureDate"] = pd.to_datetime(full_df["DepartureDate"], format="%d/%m/%Y")
full_df["DayOfWeek"] = full_df["DepartureDate"].dt.weekday

# Inżynieria cech
full_df["FuelPerMinute"] = full_df["ActualTotalFuel"] / full_df["ActualFlightTime"]
full_df["PassengersPerBag"] = full_df["FlownPassengers"] / full_df["BagsCount"]

full_df.drop(columns=["DepartureDate", "FlightNumber"], inplace=True)

In [139]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

features = list(full_df.columns)

X = full_df[features]
y = full_df['ActualTOW']

for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].astype('category')

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'tree_method': 'hist',
        'enable_categorical': True,
        'random_state': 42,
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = []

    y_bins = pd.qcut(y, q=10, duplicates='drop', labels=False)

    for train_idx, val_idx in skf.split(X, y_bins):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )


        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mse_scores.append(mse)

    return np.mean(mse_scores)


# -------------------------
# Uruchomienie Optuny
# -------------------------
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print("\n📌 Najlepsze parametry:")
print(study.best_params)
print(f"📉 Najniższe MSE: {study.best_value:.2f}")

[I 2025-05-28 22:48:02,694] A new study created in memory with name: no-name-a5f21e2b-89db-4434-98fc-508cc6984922


[I 2025-05-28 22:48:56,738] Trial 0 finished with value: 8869.071750793457 and parameters: {'n_estimators': 305, 'max_depth': 9, 'learning_rate': 0.1520578228363123, 'subsample': 0.9436184952043354, 'colsample_bytree': 0.6667658279732775, 'gamma': 4.605290923589854, 'reg_alpha': 1.925003338024283, 'reg_lambda': 2.7646539941354495}. Best is trial 0 with value: 8869.071750793457.
[I 2025-05-28 22:49:15,286] Trial 1 finished with value: 7753.900690333238 and parameters: {'n_estimators': 121, 'max_depth': 7, 'learning_rate': 0.0694720307540821, 'subsample': 0.9703266268389491, 'colsample_bytree': 0.9564295528602588, 'gamma': 1.4233242221482667, 'reg_alpha': 4.270323110160215, 'reg_lambda': 1.6427397459648552}. Best is trial 1 with value: 7753.900690333238.
[I 2025-05-28 22:49:51,828] Trial 2 finished with value: 7355.442410266364 and parameters: {'n_estimators': 274, 'max_depth': 7, 'learning_rate': 0.18159548030678874, 'subsample': 0.9495147512215804, 'colsample_bytree': 0.964639660089963


📌 Najlepsze parametry:
{'n_estimators': 196, 'max_depth': 5, 'learning_rate': 0.19318316832512794, 'subsample': 0.7417459375178047, 'colsample_bytree': 0.6012945945430963, 'gamma': 3.5472908040724995, 'reg_alpha': 0.9320517025766648, 'reg_lambda': 1.9233715233771256}
📉 Najniższe MSE: 5207.80


In [66]:
numeric_columns = ['ActualFlightTime', 'ActualTotalFuel', 'ActualTOW',
                   'FlownPassengers', 'BagsCount', 'FlightBagsWeight']

for col in numeric_columns:
    if col in full_df.columns:
        full_df[col] = pd.to_numeric(full_df[col], errors='coerce')

In [68]:
# Liczba wierszy z co najmniej jednym NaN
n_missing_rows = full_df.isnull().any(axis=1).sum()

# Liczba pełnych wierszy (bez NaN)
n_complete_rows = full_df.shape[0] - n_missing_rows

print(f"Wiersze z brakami: {n_missing_rows}")
print(f"Wiersze kompletne: {n_complete_rows}")

Wiersze z brakami: 4809
Wiersze kompletne: 26800


In [69]:
full_df = full_df.dropna()

In [70]:
print(full_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 26800 entries, 0 to 29730
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   DepartureDate     26800 non-null  object 
 1   DepartureYear     26800 non-null  int64  
 2   DepartureMonth    26800 non-null  int64  
 3   DepartureDay      26800 non-null  int64  
 4   FlightNumber      26800 non-null  int64  
 5   DepartureAirport  26800 non-null  object 
 6   ArrivalAirport    26800 non-null  object 
 7   Route             26800 non-null  object 
 8   ActualFlightTime  26800 non-null  int64  
 9   ActualTotalFuel   26800 non-null  int64  
 10  ActualTOW         26800 non-null  float64
 11  FlownPassengers   26800 non-null  float64
 12  BagsCount         26800 non-null  float64
 13  FlightBagsWeight  26800 non-null  float64
dtypes: float64(4), int64(6), object(4)
memory usage: 3.1+ MB
None


In [72]:
full_df['DepartureDate'] = pd.to_datetime(full_df['DepartureDate'], dayfirst=True)
full_df['DayOfWeek'] = full_df['DepartureDate'].dt.dayofweek

In [76]:
print(full_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 26800 entries, 0 to 29730
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   DepartureYear     26800 non-null  int64  
 1   DepartureMonth    26800 non-null  int64  
 2   DepartureDay      26800 non-null  int64  
 3   FlightNumber      26800 non-null  int64  
 4   Route             26800 non-null  object 
 5   ActualFlightTime  26800 non-null  int64  
 6   ActualTotalFuel   26800 non-null  int64  
 7   ActualTOW         26800 non-null  float64
 8   FlownPassengers   26800 non-null  float64
 9   BagsCount         26800 non-null  float64
 10  FlightBagsWeight  26800 non-null  float64
 11  DayOfWeek         26800 non-null  int32  
dtypes: float64(4), int32(1), int64(6), object(1)
memory usage: 2.6+ MB
None


In [75]:
full_df = full_df.drop(columns=['DepartureAirport', 'ArrivalAirport', 'DepartureDate'])

In [77]:
for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].astype('category')

In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

features = [
    'DepartureYear', 'DepartureMonth', 'DepartureDay', 'DayOfWeek',
    'ActualFlightTime', 'ActualTotalFuel', 'FlownPassengers',
    'BagsCount', 'FlightBagsWeight'
]

X = full_df[features]
y = full_df['ActualTOW']

# Inicjalizacja modelu regresyjnego
#model = XGBRegressor(enable_categorical=True, tree_method="hist")

model = XGBRegressor(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    enable_categorical=True,
    random_state=42
)


# Zwykła walidacja k-krotna
kf = KFold(n_splits=5, shuffle=True, random_state=42)

scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    mse = mean_squared_error(y_val, y_pred)
    scores.append(mse)
    print(f"Fold {fold + 1} MSE: {mse:.2f}")

print(f"\nŚredni błąd MSE: {np.mean(scores):.2f}")


Fold 1 MSE: 793179.01
Fold 2 MSE: 837396.65
Fold 3 MSE: 836821.61
Fold 4 MSE: 823613.83
Fold 5 MSE: 780310.27

Średni błąd MSE: 814264.27


In [81]:
print(f"Min TOW: {y.min():.0f}, Max TOW: {y.max():.0f}, Średni TOW: {y.mean():.0f}")

Min TOW: 49322, Max TOW: 74283, Średni TOW: 65084


In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor



In [88]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

features = [
    'DepartureYear', 'DepartureMonth', 'DepartureDay', 'DayOfWeek',
    'ActualFlightTime', 'ActualTotalFuel', 'FlownPassengers',
    'BagsCount', 'FlightBagsWeight'
]

X = full_df[features]
y = full_df['ActualTOW']


def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'tree_method': 'hist',
        'enable_categorical': True,
        'random_state': 42,
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = []

    y_bins = pd.qcut(y, q=10, duplicates='drop', labels=False)

    for train_idx, val_idx in skf.split(X, y_bins):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )


        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mse_scores.append(mse)

    return np.mean(mse_scores)


# -------------------------
# Uruchomienie Optuny
# -------------------------
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print("\n📌 Najlepsze parametry:")
print(study.best_params)
print(f"📉 Najniższe MSE: {study.best_value:.2f}")


[I 2025-05-28 21:58:06,458] A new study created in memory with name: no-name-a9dbe207-0965-4bad-910c-65e5b33a2f59


[I 2025-05-28 21:58:08,556] Trial 0 finished with value: 818975.247753096 and parameters: {'n_estimators': 183, 'max_depth': 5, 'learning_rate': 0.1871097085334947, 'subsample': 0.9042959834088742, 'colsample_bytree': 0.7498192823830607, 'gamma': 2.981989317274975, 'reg_alpha': 2.7227131089617163, 'reg_lambda': 4.224067016392835}. Best is trial 0 with value: 818975.247753096.
[I 2025-05-28 21:58:12,856] Trial 1 finished with value: 806993.824973017 and parameters: {'n_estimators': 366, 'max_depth': 6, 'learning_rate': 0.039937061239950264, 'subsample': 0.73845597918649, 'colsample_bytree': 0.9295915019243176, 'gamma': 4.083115955179538, 'reg_alpha': 3.355729959298141, 'reg_lambda': 3.0397308905776836}. Best is trial 1 with value: 806993.824973017.
[I 2025-05-28 21:58:16,972] Trial 2 finished with value: 812906.324994175 and parameters: {'n_estimators': 483, 'max_depth': 4, 'learning_rate': 0.04297111651943098, 'subsample': 0.9509715460272262, 'colsample_bytree': 0.9239411831558368, 'ga


📌 Najlepsze parametry:
{'n_estimators': 411, 'max_depth': 5, 'learning_rate': 0.07502314337610583, 'subsample': 0.8590781412062897, 'colsample_bytree': 0.8523982747742388, 'gamma': 0.11062947175871786, 'reg_alpha': 2.1276494541663147, 'reg_lambda': 4.8181391597355905}
📉 Najniższe MSE: 803392.01


In [90]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

full_df['FuelPerPassenger'] = full_df['ActualTotalFuel'] / (full_df['FlownPassengers'] + 1e-5)
full_df['BagsWeightPerPassenger'] = full_df['FlightBagsWeight'] / (full_df['FlownPassengers'] + 1e-5)
full_df['AvgBagWeight'] = full_df['FlightBagsWeight'] / (full_df['BagsCount'] + 1e-5)
full_df['LogFuel'] = np.log1p(full_df['ActualTotalFuel'])
full_df['LogPassengers'] = np.log1p(full_df['FlownPassengers'])
full_df['LogBagsWeight'] = np.log1p(full_df['FlightBagsWeight'])

features = [
    'DepartureYear', 'DepartureMonth', 'DepartureDay', 'DayOfWeek',
    'ActualFlightTime', 'ActualTotalFuel', 'FlownPassengers',
    'BagsCount', 'FlightBagsWeight',
    'FuelPerPassenger', 'BagsWeightPerPassenger', 'AvgBagWeight',
    'LogFuel', 'LogPassengers', 'LogBagsWeight'
]

X = full_df[features]
y = full_df['ActualTOW']


def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'tree_method': 'hist',
        'enable_categorical': True,
        'random_state': 42,
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = []

    y_bins = pd.qcut(y, q=10, duplicates='drop', labels=False)

    for train_idx, val_idx in skf.split(X, y_bins):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )


        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mse_scores.append(mse)

    return np.mean(mse_scores)


# -------------------------
# Uruchomienie Optuny
# -------------------------
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

print("\n📌 Najlepsze parametry:")
print(study.best_params)
print(f"📉 Najniższe MSE: {study.best_value:.2f}")

[I 2025-05-28 22:08:12,384] A new study created in memory with name: no-name-d06bcea5-e0ac-44fd-881e-d628a79154b2


[I 2025-05-28 22:08:15,984] Trial 0 finished with value: 829954.340929941 and parameters: {'n_estimators': 354, 'max_depth': 3, 'learning_rate': 0.0969591085379226, 'subsample': 0.6216135651619114, 'colsample_bytree': 0.6171583069502731, 'gamma': 4.162770894732966, 'reg_alpha': 4.601536203636961, 'reg_lambda': 4.466017686034642}. Best is trial 0 with value: 829954.340929941.
[I 2025-05-28 22:08:20,704] Trial 1 finished with value: 882154.4939544324 and parameters: {'n_estimators': 124, 'max_depth': 9, 'learning_rate': 0.1718100112124136, 'subsample': 0.9347605479960591, 'colsample_bytree': 0.7026673077455887, 'gamma': 4.760438625164005, 'reg_alpha': 0.09434147432140605, 'reg_lambda': 4.066794370043077}. Best is trial 0 with value: 829954.340929941.
[I 2025-05-28 22:08:23,266] Trial 2 finished with value: 829268.074120743 and parameters: {'n_estimators': 172, 'max_depth': 6, 'learning_rate': 0.13302756266121507, 'subsample': 0.7021152452548937, 'colsample_bytree': 0.7174583172706741, 'g


📌 Najlepsze parametry:
{'n_estimators': 494, 'max_depth': 4, 'learning_rate': 0.08001822679499129, 'subsample': 0.908993824486113, 'colsample_bytree': 0.9934794062364312, 'gamma': 1.433857815889707, 'reg_alpha': 3.1108513953725323, 'reg_lambda': 3.811128890324066}
📉 Najniższe MSE: 795008.93


In [None]:
Najlepsze parametry:
{'n_estimators': 411, 'max_depth': 5, 'learning_rate': 0.07502314337610583, 'subsample': 0.8590781412062897, 'colsample_bytree': 0.8523982747742388, 'gamma': 0.11062947175871786, 'reg_alpha': 2.1276494541663147, 'reg_lambda': 4.8181391597355905}
📉 Najniższe MSE: 803392.01

In [8]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [10]:
from sklearn.impute import SimpleImputer

# Uzupełnianie braków medianą
imputer = SimpleImputer(strategy="median")
X_val_imputed = imputer.fit_transform(X_val)


In [11]:
y_pred = model.predict(X_val_imputed)



In [12]:
# Stworzenie dataframe'u z przewidywaniami
output = validation_df.copy()
output['PredictedTOW'] = y_pred

# Zapisz tylko potrzebne kolumny
output[['DepartureDate', 'FlightNumber', 'PredictedTOW']].to_csv("predictions.csv", index=False)