In [4]:
%%time
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
data = pd.read_csv(r"D:\users\m_ozdemir15\Desktop\Okul\Y.Lisans\2.Dönem\Makine Öğrenmesi\proje\AirlinesDelayed.csv")
data.head()

# Data preprocessing
data.dropna(inplace=True)

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Airline', 'AirportFrom', 'AirportTo']
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Split the data into X (features) and y (target)
X = data.drop('Delay', axis=1)
y = data['Delay']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Time Series Forecasting Models
# Assuming you have a time-based column (e.g., 'Time' or 'Date') in the dataset
# You can use Exponential Smoothing and ARIMA for time series forecasting

time_column = 'Time'
data[time_column] = pd.to_datetime(data[time_column])  # Convert to datetime if not already

# Set frequency if data is collected at regular intervals (e.g., daily)
# Skip this step if data is not collected at regular intervals
data.set_index(time_column, inplace=True)
data = data.asfreq('D')  # Use 'M' for monthly data, 'H' for hourly data, etc.

# Ensure date index is monotonic
data.sort_index(inplace=True)

# Create a time series for delays
time_series = data['Delay']

# Exponential Smoothing Model
model_es = ExponentialSmoothing(time_series, seasonal='add', seasonal_periods=7)
model_es = model_es.fit()

# ARIMA Model
model_arima = ARIMA(time_series, order=(5,1,0))
model_arima = model_arima.fit()

# Make forecasts for the test period
forecast_es = model_es.forecast(len(X_test))
forecast_arima, _, _ = model_arima.forecast(steps=len(X_test))

# Time Series Model Evaluation
mae_es = mean_absolute_error(y_test, forecast_es)
mse_es = mean_squared_error(y_test, forecast_es)
print(f'Exponential Smoothing Model MAE: {mae_es}')
print(f'Exponential Smoothing Model MSE: {mse_es}')

mae_arima = mean_absolute_error(y_test, forecast_arima)
mse_arima = mean_squared_error(y_test, forecast_arima)
print(f'ARIMA Model MAE: {mae_arima}')
print(f'ARIMA Model MSE: {mse_arima}')


ValueError: cannot reindex on an axis with duplicate labels