In [None]:
import pandas as pd

In [None]:
# Load the semicolon-separated CSV used throughout this notebook.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
flight_df_anova = pd.read_csv(
    r'C:\Users\macie\OneDrive\Pulpit\Studia PJATK\flight_df_ANOVA.csv',
    sep=';',
)

In [None]:
# Show the loaded frame as the cell output for a quick visual sanity check.
flight_df_anova

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [None]:
# Columns referenced by the ANOVA models below; rows missing any of them are
# removed from the frame in place.
anova_required_columns = ['arr_delay_new', 'day_of_week', 'prcp', 'snow', 'tmax']
flight_df_anova.dropna(subset=anova_required_columns, inplace=True)

In [None]:
# Bucket precipitation into three ordered categories.
# pd.cut builds right-closed intervals by default, so without include_lowest=True
# the first bin is (0, 0.1] and every row with prcp == 0 (no precipitation) would
# get NaN and silently drop out of the weather ANOVA and the bar plot.
bins = [0, 0.1, 0.3, float('inf')]
labels = ['low', 'medium', 'high']
flight_df_anova['PRCP_CAT'] = pd.cut(
    flight_df_anova['prcp'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

In [None]:
# One-way ANOVA: arr_delay_new explained by day_of_week as a categorical factor.
day_formula = 'arr_delay_new ~ C(day_of_week)'
model_day = ols(day_formula, data=flight_df_anova).fit()
# typ=2 requests the Type II ANOVA table.
anova_day = sm.stats.anova_lm(model_day, typ=2)

In [None]:
# Multi-factor ANOVA: arr_delay_new explained by three weather terms, each wrapped
# in C(...) so it enters the model as a categorical factor.
# NOTE(review): C(snow) and C(tmax) create one level per distinct value; if these
# columns are continuous measurements this yields a very large design matrix —
# confirm that treating them as categorical is intended.
weather_formula = 'arr_delay_new ~ C(PRCP_CAT) + C(snow) + C(tmax)'
model_weather = ols(weather_formula, data=flight_df_anova).fit()
anova_weather = sm.stats.anova_lm(model_weather, typ=2)

In [None]:
# Print both ANOVA tables, each preceded by its heading.
for heading, table in (
    ("ANOVA dla dni tygodnia:\n", anova_day),
    ("ANOVA dla warunków pogodowych:\n", anova_weather),
):
    print(heading, table)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sns.set(style="whitegrid")

# (x column, figure title, x-axis label) for each bar chart of arr_delay_new.
delay_barplots = [
    ('day_of_week', 'Średnie opóźnienia w zależności od dnia tygodnia', 'Dzień tygodnia'),
    ('PRCP_CAT', 'Średnie opóźnienia w zależności od intensywności opadów', 'Kategoria opadów'),
    ('snow', 'Średnie opóźnienia w zależności od obecności śniegu', 'Śnieg (tak/nie)'),
]

# One figure per grouping column; ci=None turns off the error bars.
for column, title, xlabel in delay_barplots:
    plt.figure(figsize=(10, 6))
    sns.barplot(x=column, y='arr_delay_new', data=flight_df_anova, ci=None)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Średnie opóźnienie (minuty)')
    plt.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np


# Illustrative normal curves, one subplot per weekday.
# The means and standard deviations are hard-coded literals, not values
# computed from flight_df_anova.
days = ["Poniedziałek", "Wtorek", "Środa", "Czwartek", "Piątek", "Sobota", "Niedziela"]
means = [10, 15, 13, 12, 16, 18, 14]  # mean delay per day
std_devs = [5, 3, 4, 4, 5, 6, 5]  # standard deviation per day

# Single row with one column per weekday
fig = make_subplots(rows=1, cols=len(days), subplot_titles=days)

x_vals = np.linspace(-10, 40, 100)
for col_idx, (day_name, mean, std) in enumerate(zip(days, means, std_devs), start=1):
    # Normal probability density evaluated on the shared x grid
    exponent = -(x_vals - mean)**2 / (2 * std**2)
    normalisation = std * np.sqrt(2 * np.pi)
    y_vals = np.exp(exponent) / normalisation
    fig.add_trace(go.Scatter(x=x_vals, y=y_vals, name=day_name), row=1, col=col_idx)

fig.update_layout(
    height=600,
    width=1200,
    title_text="Rozkłady opóźnień lotów dla różnych dni tygodnia",
)
fig.show()

In [None]:
import numpy as np
from scipy import stats

In [None]:

# Keep only rows that have both an arrival delay and a day of week.
flight_data = flight_df_anova.dropna(subset=['arr_delay_new', 'day_of_week'])

# One array of arrival delays per day-of-week code 1..7.
# Built from the cleaned `flight_data` frame; previously that frame was computed
# and then ignored in favour of `flight_df_anova`.
data_by_day = [
    flight_data.loc[flight_data['day_of_week'] == day, 'arr_delay_new'].values
    for day in range(1, 8)
]

# Funkcja obliczająca statystykę F
def f_statistic(data_groups):
    """Compute the one-way ANOVA F statistic for a sequence of sample groups.

    Parameters
    ----------
    data_groups : sequence of array-like
        One 1-D collection of numeric observations per group.

    Returns
    -------
    float
        Ratio of the between-group mean square (MSTr) to the within-group
        mean square (MSE).

    Raises
    ------
    ValueError
        If fewer than two groups are given, or if the total number of
        observations does not exceed the number of groups (no within-group
        degrees of freedom).
    """
    groups = [np.asarray(group, dtype=float) for group in data_groups]
    n_groups = len(groups)
    if n_groups < 2:
        raise ValueError("f_statistic requires at least two groups")

    sizes = np.array([group.size for group in groups])
    n_total = int(sizes.sum())
    if n_total <= n_groups:
        raise ValueError("f_statistic requires more observations than groups")

    group_means = np.array([group.mean() for group in groups])

    # The grand mean is the mean of ALL observations, i.e. the size-weighted
    # mean of the group means. A plain mean of group means is only correct
    # when every group has the same size.
    grand_mean = np.sum(sizes * group_means) / n_total

    # Between-group mean square (MSTr)
    ss_between = np.sum(sizes * (group_means - grand_mean) ** 2)
    MSTr = ss_between / (n_groups - 1)

    # Within-group mean square (MSE); vectorised per group rather than
    # iterating over individual observations in Python.
    ss_within = sum(
        np.sum((group - group_mean) ** 2)
        for group, group_mean in zip(groups, group_means)
    )
    MSE = ss_within / (n_total - n_groups)

    return MSTr / MSE

# Obliczanie statystyki F
# F statistic from the hand-written implementation
F_value = f_statistic(data_by_day)
print("F-statistic:", F_value)

# Same test via SciPy's one-way ANOVA, which also yields the p-value
scipy_anova = stats.f_oneway(*data_by_day)
F, p_value = scipy_anova
print("F-statistic (SciPy):", F)
print("P-value:", p_value)


In [None]:
import plotly.graph_objects as go
import numpy as np
from scipy.stats import f

 
# Plot the F-distribution density for several (v1, v2) degree-of-freedom pairs.
dfs = [(2, 20), (2, 40), (5, 20), (5, 40)]
x = np.linspace(0.01, 5, 500)  # the grid starts just above zero

fig = go.Figure()
for v1, v2 in dfs:
    y = f.pdf(x, v1, v2)
    curve = go.Scatter(x=x, y=y, mode='lines', name=f'v1={v1}, v2={v2}')
    fig.add_trace(curve)

layout_options = dict(
    title='F-distributions',
    xaxis_title='x',
    yaxis_title='f(x)',
    legend_title='Parameters v1, v2',
)
fig.update_layout(**layout_options)

fig.show()


In [None]:
import plotly.graph_objects as go
import numpy as np
from scipy.stats import f


# F density for fixed degrees of freedom, with a dashed vertical marker at a
# fixed F value.
# NOTE(review): df_within and the marker position are hard-coded literals,
# presumably copied from an earlier ANOVA result — confirm they still match the
# data, or derive them from the computed values.
df_between = 6
df_within = 704753
observed_f = 132.748

x = np.linspace(0.01, 150, 1000)
y = f.pdf(x, df_between, df_within)

fig = go.Figure()

density_curve = go.Scatter(
    x=x,
    y=y,
    mode='lines',
    name=f'df_between={df_between}, df_within={df_within}',
)
fig.add_trace(density_curve)

# Vertical dashed line from 0 up to the peak of the plotted density
statistic_marker = go.Scatter(
    x=[observed_f, observed_f],
    y=[0, y.max()],
    mode='lines',
    line=dict(dash='dash'),
    name='Twoja F-statystyka',
)
fig.add_trace(statistic_marker)

fig.update_layout(
    title='F-distributions with Your F-statistic',
    xaxis_title='Wartość F',
    yaxis_title='Gęstość prawdopodobieństwa',
    xaxis_range=[0, 150],
    legend_title='Legenda',
)

fig.show()


In [None]:
# Summary statistics of the frame. The method must be called: the bare
# attribute only echoes the bound-method repr instead of the statistics.
flight_df_anova.describe()

In [None]:
# Column dtypes and non-null counts. The method must be called: the bare
# attribute only echoes the bound-method repr instead of the summary.
flight_df_anova.info()

In [None]:
# List the column labels available for feature selection below.
flight_df_anova.columns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# Użyj mniejszej próby danych
# Work on a reproducible 10% sample of the data.
flight_df_sample = flight_df_anova.sample(frac=0.1, random_state=42)

# Treat infinities as missing, then fill missing values with per-column medians.
# numeric_only=True is required: the frame also contains non-numeric columns
# (e.g. the categorical PRCP_CAT), for which DataFrame.median() raises on
# pandas >= 2.0, where non-numeric columns are no longer silently skipped.
# NOTE(review): medians are computed before the train/test split, so test rows
# influence the imputed values.
flight_df_sample.replace([np.inf, -np.inf], np.nan, inplace=True)
flight_df_sample.fillna(flight_df_sample.median(numeric_only=True), inplace=True)

# Model inputs and target
feature_cols = ['month', 'day_of_month', 'day_of_week', 'crs_dep_time', 'crs_arr_time',
                'op_unique_carrier', 'origin_airport_id', 'dest_airport_id', 'distance',
                'prcp', 'snow', 'tmax', 'awnd', 'dep_delay', 'is_weekend']
X = flight_df_sample[feature_cols]
y = flight_df_sample['arr_delay_new']

numeric_features = ['month', 'day_of_month', 'day_of_week', 'crs_dep_time', 'crs_arr_time',
                    'distance', 'prcp', 'snow', 'tmax', 'awnd', 'dep_delay']
categorical_features = ['op_unique_carrier', 'origin_airport_id', 'dest_airport_id', 'is_weekend']

# Scale numeric columns; one-hot encode categorical ones, ignoring categories
# that were not seen during fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Preprocessing and a 10-tree random forest chained in one estimator
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=10, random_state=42))
])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

# Evaluate on the held-out set with root-mean-squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

# Przygotowanie próbki danych (używamy 10% danych dla szybszego przetwarzania)
# Prepare a data sample (10% of the rows, fixed seed) for faster processing.
flight_df_sample = flight_df_anova.sample(frac=0.1, random_state=42)

# Replace infinities with NaN, then impute with per-column medians.
# numeric_only=True restricts the median to numeric columns; without it
# DataFrame.median() raises on pandas >= 2.0 because the frame also holds
# non-numeric columns (e.g. the categorical PRCP_CAT).
# NOTE(review): imputation happens before the train/test split, so the medians
# include test rows.
flight_df_sample.replace([np.inf, -np.inf], np.nan, inplace=True)
flight_df_sample.fillna(flight_df_sample.median(numeric_only=True), inplace=True)

# Feature matrix and target vector
feature_cols = ['month', 'day_of_month', 'day_of_week', 'crs_dep_time', 'crs_arr_time',
                'op_unique_carrier', 'origin_airport_id', 'dest_airport_id', 'distance',
                'prcp', 'snow', 'tmax', 'awnd', 'dep_delay', 'is_weekend']
X = flight_df_sample[feature_cols]
y = flight_df_sample['arr_delay_new']

numeric_features = ['month', 'day_of_month', 'day_of_week', 'crs_dep_time', 'crs_arr_time',
                    'distance', 'prcp', 'snow', 'tmax', 'awnd', 'dep_delay']
categorical_features = ['op_unique_carrier', 'origin_airport_id', 'dest_airport_id', 'is_weekend']

# Define the preprocessor: standardise numeric columns, one-hot encode the rest
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define the model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=50, random_state=42))  # more trees than the first run
])

# Hold out 20% of the sample for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

# Show a few predictions next to the actual values for comparison
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(comparison_df.head(10))


In [None]:
# Show the frame's column labels as the cell output.
flight_df_anova.columns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd


# Draw a reproducible 10% sample of the frame.
flight_df_sample = flight_df_anova.sample(frac=0.1, random_state=42)

# Infinite values become NaN and are then filled with column medians.
# The median is limited to numeric columns (numeric_only=True): the frame also
# carries non-numeric columns such as the categorical PRCP_CAT, and on
# pandas >= 2.0 DataFrame.median() raises instead of skipping them.
# NOTE(review): the medians are taken over the whole sample, before the
# train/test split below.
flight_df_sample.replace([np.inf, -np.inf], np.nan, inplace=True)
flight_df_sample.fillna(flight_df_sample.median(numeric_only=True), inplace=True)

# Predictors and target
feature_cols = ['month', 'day_of_month', 'day_of_week', 'crs_dep_time', 'crs_arr_time',
                'op_unique_carrier', 'origin_airport_id', 'dest_airport_id', 'distance',
                'prcp', 'snow', 'tmax', 'awnd', 'dep_delay', 'is_weekend']
X = flight_df_sample[feature_cols]
y = flight_df_sample['arr_delay_new']

numeric_features = ['month', 'day_of_month', 'day_of_week', 'crs_dep_time', 'crs_arr_time',
                    'distance', 'prcp', 'snow', 'tmax', 'awnd', 'dep_delay']
categorical_features = ['op_unique_carrier', 'origin_airport_id', 'dest_airport_id', 'is_weekend']

# Column-wise preprocessing: scaling for numeric inputs, one-hot encoding for
# categorical inputs (unknown categories at predict time are ignored).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Preprocessing followed by a seeded 50-tree random forest
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=50, random_state=42))
])

# Seeded 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

# RMSE on the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

# First ten actual/predicted pairs
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(comparison_df.head(10))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot: Rzeczywiste opóźnienia vs przewidywane opóźnienia
# Scatter plot of actual vs. predicted delays; points on the dashed red
# diagonal are predictions equal to the actual value.
actual_range = [y_test.min(), y_test.max()]
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.3)
plt.plot(actual_range, actual_range, '--r', lw=2)
plt.title('Rzeczywiste vs Przewidywane opóźnienia')
plt.xlabel('Rzeczywiste opóźnienia (minuty)')
plt.ylabel('Przewidywane opóźnienia (minuty)')
plt.show()

# Residual plot: actual minus predicted, against the predicted value,
# with a dashed reference line at zero.
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.3)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Reszty vs Przewidywane opóźnienia')
plt.xlabel('Przewidywane opóźnienia (minuty)')
plt.ylabel('Reszty (minuty)')
plt.show()

# Histogram of the residuals with a KDE overlay
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True)
plt.title('Histogram reszt')
plt.xlabel('Reszty (minuty)')
plt.ylabel('Częstotliwość')
plt.show()



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Actual and predicted delays side by side, indexed like y_test.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Box plot: one horizontal box per column
plt.figure(figsize=(12, 6))
sns.boxplot(data=comparison_df, orient='h')
plt.title('Box Plot: Rzeczywiste vs Przewidywane opóźnienia')
plt.xlabel('Opóźnienia (minuty)')
plt.show()

# Density plot. `fill=True` replaces the deprecated `shade=True` keyword;
# it is available in the seaborn versions that provide `histplot`, which this
# notebook already uses.
plt.figure(figsize=(12, 6))
sns.kdeplot(comparison_df['Actual'], label='Rzeczywiste', fill=True)
sns.kdeplot(comparison_df['Predicted'], label='Przewidywane', fill=True)
plt.title('Density Plot: Rzeczywiste vs Przewidywane opóźnienia')
plt.xlabel('Opóźnienia (minuty)')
plt.legend()
plt.show()

# Line plot: rows sorted by actual delay, so the x axis is the rank of each
# sample rather than its original index.
plt.figure(figsize=(12, 6))
comparison_df_sorted = comparison_df.sort_values(by='Actual').reset_index(drop=True)
plt.plot(comparison_df_sorted['Actual'], label='Rzeczywiste')
plt.plot(comparison_df_sorted['Predicted'], label='Przewidywane', alpha=0.7)
plt.title('Line Plot: Rzeczywiste vs Przewidywane opóźnienia')
plt.xlabel('Próbki')
plt.ylabel('Opóźnienia (minuty)')
plt.legend()
plt.show()
