In [1]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import plot_importance
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import pandas as pd
import numpy as np
import itertools
import time

In [2]:
# Read the challenge-set CSV into a DataFrame; every later cell derives its
# data from `challenge_set_df`.
challenge_set_path = '../../data/encoded_challenge_set.csv'
challenge_set_df = pd.read_csv(challenge_set_path)
# Quick inspection helpers (uncomment as needed):
#   display(challenge_set_df)
#   display(challenge_set_df.describe())
#   display(challenge_set_df.info())

In [3]:
def rmse(y_true, y_pred):
    """Return the root of the mean squared error between `y_true` and `y_pred`.

    Both arguments are forwarded unchanged to
    `sklearn.metrics.mean_squared_error`; the square root of its result is
    returned.
    """
    mean_sq_err = mean_squared_error(y_true, y_pred)
    return np.sqrt(mean_sq_err)
    
def test_model(data_df, features_columns, target_column, n, n_splits=3, random_state=42):
    """Cross-validate a k-nearest-neighbours regressor and return its mean RMSE.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    features_columns : list of str
        Names of the columns used as model inputs.
    target_column : str
        Name of the column to predict.
    n : int
        Value passed as `n_neighbors` to `KNeighborsRegressor`.
    n_splits : int, default 3
        Number of shuffled K-fold splits.
    random_state : int, default 42
        Seed for the K-fold shuffle, so repeated calls use the same folds.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.

    Notes
    -----
    Only the regressor is fitted per fold. Any scaling applied to `data_df`
    before this call was fitted on all rows, including those that end up in
    the held-out fold.
    """
    features = data_df[features_columns]
    target = data_df[target_column]

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_rmse = []
    # KFold.split only needs the number of samples, so the DataFrame can be
    # passed directly; the returned indices are positional, hence `.iloc`.
    for train_index, test_index in splitter.split(features):
        model = KNeighborsRegressor(n_neighbors=n)
        model.fit(features.iloc[train_index], target.iloc[train_index])
        y_pred = model.predict(features.iloc[test_index])
        fold_rmse.append(rmse(target.iloc[test_index], y_pred))
    return np.mean(fold_rmse)

def scale_data(dataframe, features, scale_type):
    """Return a copy of `dataframe` with the `features` columns rescaled.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    features : list of str
        Columns to rescale. All other columns are copied through unchanged.
    scale_type : str
        "Normalization" uses `MinMaxScaler(feature_range=(0, 1))`;
        "Standardization" uses `StandardScaler()`.

    Returns
    -------
    pandas.DataFrame
        A copy of `dataframe` whose `features` columns hold the scaler output.

    Raises
    ------
    ValueError
        If `scale_type` is not one of the two supported values.
    """
    # Pick the scaler first so an unsupported `scale_type` fails with a clear
    # message, instead of reaching `fit_transform` with `scaler` unbound.
    if scale_type == "Normalization":
        scaler = MinMaxScaler(feature_range=(0, 1))
    elif scale_type == "Standardization":
        scaler = StandardScaler()
    else:
        raise ValueError(
            f"Unknown scale_type {scale_type!r}; "
            "expected 'Normalization' or 'Standardization'"
        )

    result_df = dataframe.copy()
    result_df[features] = scaler.fit_transform(result_df[features])
    return result_df

In [4]:
# Columns currently used as model inputs.
base_feature_columns = [
    'month-day', 'month', 'day_in_month', 'adep', 'country_code_adep', 'ades', 'country_code_ades', 'aircraft_type',
    'wtc', 'airline', 'flight_duration', 'taxiout_time', 'flown_distance', 'actual_offblock_time_hour', 'actual_offblock_time_minute',
    'actual_offblock_time_hour_minute', 'arrival_time_hour', 'arrival_time_minute', 'arrival_time_hour_minute'
]

# Extra columns that are NOT part of the active feature set. They are kept
# under their own name so the wider list is not silently overwritten by a
# second assignment to `feature_columns`.
optional_feature_columns = [
    'latitude_median', 'longitude_median', 'altitude_mean', 'groundspeed_mean', 'track_mean', 'vertical_rate_mean',
    'track_unwrapped_mean', 'u_component_of_wind_mean', 'v_component_of_wind_mean', 'temperature_mean',
    'specific_humidity_mean'
]

# Active feature set. To include the extra columns, use
# `base_feature_columns + optional_feature_columns` instead.
feature_columns = list(base_feature_columns)

target_column = 'tow'

# Restrict the frame to the model inputs plus the target.
df = challenge_set_df[feature_columns + [target_column]]
#display(df)
#display(df.describe())
#display(df.info())

In [5]:
# Copy of `df` with the feature columns rescaled via the "Normalization"
# branch of `scale_data`; the target column is left as-is.
normalized_df = scale_data(df, feature_columns, "Normalization")
# Quick inspection helpers (uncomment as needed):
#   display(normalized_df)
#   display(normalized_df.describe())
#   display(normalized_df.info())

In [6]:
# Copy of `df` with the feature columns rescaled via the "Standardization"
# branch of `scale_data`; the target column is left as-is.
standardized_df = scale_data(df, feature_columns, "Standardization")
# Quick inspection helpers (uncomment as needed):
#   display(standardized_df)
#   display(standardized_df.describe().round())
#   display(standardized_df.info())

In [7]:
# Sweep n_neighbors from 1 to 9 on the min-max-scaled frame, reporting the
# cross-validated RMSE and how long each evaluation took.
print("KNN with Normalization")
for n in range(1, 10):
    print(f"{n = }")
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments during these multi-minute runs.
    start_time = time.perf_counter()
    rmse_score = test_model(normalized_df, feature_columns, target_column, n)
    elapsed = time.perf_counter() - start_time
    print(f"{rmse_score = }")
    print(f"Execution time: {elapsed} seconds")
    print()

KNN with Normalization
n = 1


found 0 physical cores < 1
  File "C:\Users\MOHAMMEDG\Anaconda3\envs\prc\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


rmse_score = 9542.174884865412
Execution time: 126.1447913646698 seconds

n = 2
rmse_score = 8548.650932308681
Execution time: 135.5787901878357 seconds

n = 3
rmse_score = 8277.415254238866
Execution time: 122.00680828094482 seconds

n = 4
rmse_score = 8219.39652872981
Execution time: 121.18334221839905 seconds

n = 5
rmse_score = 8207.934188597848
Execution time: 125.52816104888916 seconds

n = 6
rmse_score = 8223.588717174773
Execution time: 131.73410773277283 seconds

n = 7
rmse_score = 8256.897976718734
Execution time: 143.79837131500244 seconds

n = 8
rmse_score = 8311.202968174945
Execution time: 153.47965550422668 seconds

n = 9
rmse_score = 8374.86285669401
Execution time: 153.14995622634888 seconds



In [None]:
# Sweep n_neighbors from 1 to 9 on the standardized frame, reporting the
# cross-validated RMSE and how long each evaluation took.
print("KNN with Standarization")
for n in range(1, 10):
    print(f"{n = }")
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments during these multi-minute runs.
    start_time = time.perf_counter()
    rmse_score = test_model(standardized_df, feature_columns, target_column, n)
    elapsed = time.perf_counter() - start_time
    print(f"{rmse_score = }")
    print(f"Execution time: {elapsed} seconds")
    print()

KNN with Standarization
n = 1
rmse_score = 8955.534460265557
Execution time: 115.42830061912537 seconds

n = 2
rmse_score = 8033.001826710592
Execution time: 148.0676429271698 seconds

n = 3
rmse_score = 7809.959302066108
Execution time: 137.72832322120667 seconds

n = 4
rmse_score = 7727.740588677475
Execution time: 223.73109197616577 seconds

n = 5
