In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import pandas as pd
import numpy as np
import random
import time

In [2]:
# --- Column roles -------------------------------------------------------------
identifier_name = 'flight_id'   # key column written to the submission files
target_name = 'tow'             # column the regressors are trained to predict

# Seed handed to every XGBoost model as `random_state`.
global_random_state = 123

# Every model input column, in the order the feature frames are built with.
features_names = [
    'wtc', 'aircraft_type',
    'flown_distance', 'groundspeed_max', 'latitude_min', 'altitude_25percentile',
    'airline',
    'flight_duration', 'longitude_max', 'vertical_rate_75percentile', 'altitude_median',
    'ades',
    'longitude_mean', 'altitude_75percentile', 'latitude_75percentile', 'vertical_rate_std',
    'adep',
    'latitude_std', 'vertical_rate_max', 'latitude_max', 'longitude_std',
    'vertical_rate_25percentile', 'longitude_min', 'longitude_75percentile',
    'altitude_mean', 'groundspeed_75percentile',
    'country_code_adep',
    'latitude_median', 'longitude_25percentile', 'groundspeed_min',
    'country_code_ades',
    'latitude_25percentile',
    'actual_offblock_time_hour',
    'longitude_median',
    'month',
    'altitude_std', 'latitude_count',
    'taxiout_time',
    'vertical_rate_median',
    'month_day',
]

# Columns that get one-hot encoded before the cosine-similarity step.
categorical_columns = [
    'wtc', 'aircraft_type', 'airline',
    'ades', 'adep',
    'country_code_adep', 'country_code_ades',
    'actual_offblock_time_hour', 'month', 'taxiout_time', 'month_day',
]

# Everything else is passed through unchanged. Deriving the list from
# `features_names` keeps the three lists consistent and preserves the
# feature order.
numerical_columns = [
    column for column in features_names if column not in categorical_columns
]

In [3]:
# Labelled ("challenge") set: read the pre-encoded CSV, replace every missing
# value with 0, then split it into model inputs and the regression target.
encoded_challenge_set_df = pd.read_csv('./data/encoded_challenge_set.csv').fillna(0)
challenge_features = encoded_challenge_set_df.loc[:, features_names]
challenge_target = encoded_challenge_set_df.loc[:, target_name]

# Unlabelled ("submission") set: same treatment, but only features are taken.
encoded_submission_set_df = pd.read_csv('./data/encoded_submission_set.csv').fillna(0)
submission_features = encoded_submission_set_df.loc[:, features_names]

In [4]:
# Baseline regressor trained on the complete challenge set. The prediction
# loop uses it whenever a submission flight has too few similar challenge
# flights to train a dedicated model.
general_model_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=91,
    max_depth=10,
    learning_rate=0.1,
    subsample=1.0,
    colsample_bytree=1.0,
    enable_categorical=True,
    random_state=global_random_state,
)
general_model = xgb.XGBRegressor(**general_model_params)

general_model.fit(challenge_features, challenge_target)

In [5]:
# Encoder that produces the vectors used for cosine similarity:
# numerical columns are forwarded untouched, categorical columns are one-hot
# encoded.
numeric_passthrough = ('num', 'passthrough', numerical_columns)
categorical_one_hot = ('cat', OneHotEncoder(), categorical_columns)

preprocessor = ColumnTransformer(
    transformers=[numeric_passthrough, categorical_one_hot]
)

In [6]:
# Fit the encoder on challenge + submission rows together so both halves share
# one set of one-hot columns, then cut the encoded matrix back apart at the
# challenge/submission boundary.
split_at = len(challenge_features)
all_features = pd.concat((challenge_features, submission_features), ignore_index=True)
processed_features = preprocessor.fit_transform(all_features)
processed_challenge_data, processed_submission_data = (
    processed_features[:split_at],
    processed_features[split_at:],
)

In [7]:
# Sanity check: both encoded matrices must have the same number of columns.
print(processed_challenge_data.shape, processed_submission_data.shape, sep="\n")

(369013, 1617)
(105959, 1617)


In [None]:
# Similarity-based TOW prediction, processed in chunks of submission rows.
#
# For every submission flight:
#   1. take its cosine similarity to every challenge flight,
#   2. keep the challenge flights whose similarity reaches the threshold,
#   3. if enough of them exist, fit a dedicated XGBoost model on just those
#      flights and predict with it; otherwise fall back to `general_model`.
# One CSV per chunk is written to ./submissions/ so a long run can be resumed.
#
# Fixes relative to the previous version of this cell:
#   * the last chunk end is derived from the data instead of the hard-coded
#     105958, which silently skipped the final submission row;
#   * flight ids and prediction rows are looked up with the global row
#     position (start_index + index); previously every chunk after the first
#     reused the ids/features of rows 0..chunk_size-1;
#   * `general_model_used` now counts across all chunks, not just the last;
#   * the progress interval can no longer be 0 for chunks shorter than 10 rows;
#   * the per-row copy + sort of the whole challenge frame is replaced by a
#     boolean mask on the similarity row, and the chunk DataFrame is built once
#     per chunk instead of once per row.
start_at, end_at = 0, len(submission_features)

chunk_size = 1000              # submission rows per similarity matrix / output file
similarity_threshold = 0.999   # minimum cosine similarity to count as "similar"
min_similar_flights = 1000     # fewer similar flights than this -> general model

# Flight ids by row position, so lookups inside the loop are plain array reads.
submission_flight_ids = encoded_submission_set_df[identifier_name].to_numpy()

general_model_used = 0  # total fallbacks to `general_model` over the whole run

# Defined up front so the final display() works even if the range is empty.
submission_df = pd.DataFrame({identifier_name: [], target_name: []})

for start_index in range(start_at, end_at, chunk_size):
    # Exclusive end, clamped so the final (short) chunk covers the last row.
    end_index = min(start_index + chunk_size, end_at)
    print("start_index: ", start_index, ", end_index: ", end_index)

    start_time = time.time()

    # Rows: submission flights of this chunk; columns: all challenge flights.
    similarity_matrix = cosine_similarity(
        processed_submission_data[start_index:end_index], processed_challenge_data
    )

    flight_ids, estimated_tow = [], []

    chunk_length = end_index - start_index
    # Progress is reported roughly every 10% of the chunk; never 0 (modulo).
    info_intervall = max(1, chunk_length // 10)

    for index in range(chunk_length):
        if index % info_intervall == 0:
            print(f"Running for between {index} and {index+info_intervall}")

        # `index` is local to the chunk (it addresses similarity_matrix);
        # the submission frames must be addressed with the global position.
        global_index = start_index + index
        flight_ids.append(submission_flight_ids[global_index])
        submission_row = submission_features.iloc[[global_index]]

        # Positions of the challenge flights that are similar enough.
        similarity_values = np.asarray(similarity_matrix[index]).ravel()
        similar_positions = np.flatnonzero(similarity_values >= similarity_threshold)

        if len(similar_positions) < min_similar_flights:
            # Not enough neighbours to train on: use the global model.
            y_pred = general_model.predict(submission_row)[0]
            general_model_used += 1
        else:
            # Rows are taken by position from both frames so features and
            # targets always stay aligned. They are kept in challenge-set
            # order rather than sorted by similarity.
            similarity_based_training_data_features = challenge_features.iloc[similar_positions]
            similarity_based_training_data_target = challenge_target.iloc[similar_positions]

            # NOTE(review): unlike `general_model`, this model is built without
            # enable_categorical=True - kept as in the original; confirm intent.
            similarity_based_model = xgb.XGBRegressor(
                colsample_bytree=1.0,
                learning_rate=0.1,
                max_depth=10,
                n_estimators=91,
                subsample=1.0,
                objective='reg:squarederror',
                eval_metric='rmse',
                random_state=global_random_state
            )

            similarity_based_model.fit(
                similarity_based_training_data_features,
                similarity_based_training_data_target,
            )

            y_pred = similarity_based_model.predict(submission_row)[0]

        estimated_tow.append(y_pred)

        if index % info_intervall == 0:
            elapsed_time = time.time() - start_time
            hours, minutes, seconds = int(elapsed_time // 3600), int((elapsed_time % 3600) // 60), int(elapsed_time % 60)
            print(f"elapsed_time = {hours:02}:{minutes:02}:{seconds:02}")

    # Build and persist this chunk's predictions once all rows are done.
    submission_df = pd.DataFrame({
        identifier_name: flight_ids,
        target_name: estimated_tow
    })
    submission_df[[identifier_name, target_name]].to_csv(f'./submissions/my_submission_v14_{start_index}_{end_index}.csv', index=False)

print("done!\n")
print(f"Used general model {general_model_used} times\n")

# Shows the predictions of the last processed chunk.
display(submission_df)

start_index:  0 , end_index:  1000
Running for between 0 and 100
elapsed_time = 00:00:39
Running for between 100 and 200
elapsed_time = 00:05:12
