In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from scipy.stats import randint, uniform
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# Load data
logger.info('Loading training and validation data')
train_data = pd.read_csv('/kaggle/input/f1nalyze-datathon-ieeecsmuj/train.csv', low_memory=False)
val_data = pd.read_csv('/kaggle/input/f1nalyze-datathon-ieeecsmuj/validation.csv', low_memory=False)
test_data = pd.read_csv('/kaggle/input/f1nalyze-datathon-ieeecsmuj/test.csv', low_memory=False)

# Define mixed type columns
dtype_dict = {'number': str, 'positionText_x': str, 'time_x': str, 'fastestLapTime': str, 'status': str}

# Reload data with specified dtype
logger.info('Reloading data with specified dtypes')
train_data = pd.read_csv('/kaggle/input/f1nalyze-datathon-ieeecsmuj/train.csv', dtype=dtype_dict, low_memory=False)
val_data = pd.read_csv('/kaggle/input/f1nalyze-datathon-ieeecsmuj/validation.csv', dtype=dtype_dict, low_memory=False)

# Convert to numeric and fill missing values
logger.info('Converting to numeric and filling missing values')
numeric_columns = ['points', 'laps', 'timetaken_in_millisec', 'fastestLap', 'max_speed', 'rank', 'grid', 'positionOrder']
for col in numeric_columns:
    train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
    val_data[col] = pd.to_numeric(val_data[col], errors='coerce')

train_data.replace([np.inf, -np.inf], np.nan, inplace=True)
val_data.replace([np.inf, -np.inf], np.nan, inplace=True)
train_data.fillna(method='ffill', inplace=True)
train_data.fillna(method='bfill', inplace=True)
val_data.fillna(method='ffill', inplace=True)
val_data.fillna(method='bfill', inplace=True)

# Feature engineering
logger.info('Performing feature engineering')
train_data['age'] = pd.to_datetime(train_data['date']).dt.year - pd.to_datetime(train_data['dob']).dt.year
val_data['age'] = pd.to_datetime(val_data['date']).dt.year - pd.to_datetime(val_data['dob']).dt.year

train_data['avg_speed'] = train_data['timetaken_in_millisec'] / train_data['laps']
val_data['avg_speed'] = val_data['timetaken_in_millisec'] / val_data['laps']

train_data['start_grid_diff'] = train_data['grid'] - train_data['positionOrder']
val_data['start_grid_diff'] = val_data['grid'] - val_data['positionOrder']

# Label encoding for categorical variables
logger.info('Label encoding categorical variables')
label_encoders = {}
categorical_columns = ['driverRef', 'constructorRef']

for col in categorical_columns:
    le = LabelEncoder()
    le.fit(train_data[col])  # Fit on training data
    train_data[col] = le.transform(train_data[col])
    val_data[col] = val_data[col].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)
    label_encoders[col] = le

# Define features
features = [
    'grid', 'points', 'laps', 'timetaken_in_millisec', 'fastestLap', 'max_speed', 
    'age', 'avg_speed', 'start_grid_diff', 'rank', 'year', 'round', 'circuitId', 
    'driverRef', 'constructorRef'
]

# Prepare training and validation sets
logger.info('Preparing training and validation sets')
X_train = train_data[features]
y_train = train_data['position']
X_val = val_data[features]
y_val = val_data['position']

# Model tuning with RandomizedSearchCV for LightGBM
logger.info('Starting RandomizedSearchCV for LightGBM')
model = LGBMRegressor(random_state=42)
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 7),
    'learning_rate': uniform(0.01, 0.1),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4)
}
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=10, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Best model and prediction
logger.info('Best model and prediction')
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val)
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
logger.info(f'Validation RMSE: {rmse}')

# Test data preparation and prediction
logger.info('Preparing test data and making predictions')
for col in numeric_columns:
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

test_data.fillna(method='ffill', inplace=True)
test_data.fillna(method='bfill', inplace=True)

test_data['age'] = pd.to_datetime(test_data['date']).dt.year - pd.to_datetime(test_data['dob']).dt.year
test_data['avg_speed'] = test_data['timetaken_in_millisec'] / test_data['laps']
test_data['start_grid_diff'] = test_data['grid'] - test_data['positionOrder']

for col in categorical_columns:
    test_data[col] = test_data[col].map(lambda s: label_encoders[col].transform([s])[0] if s in label_encoders[col].classes_ else -1)

X_test = test_data[features]
y_test_pred = best_model.predict(X_test)

# Submission
logger.info('Creating submission file')
submission = pd.DataFrame({'result_driver_standing': test_data['result_driver_standing'], 'position': y_test_pred})
submission.to_csv('submission_lgbm.csv', index=False)
logger.info('Submission file created')


  train_data.fillna(method='ffill', inplace=True)
  train_data.fillna(method='bfill', inplace=True)
  val_data.fillna(method='ffill', inplace=True)
  val_data.fillna(method='bfill', inplace=True)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.698918 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1428
[LightGBM] [Info] Number of data points in the train set: 1886734, number of used features: 15
[LightGBM] [Info] Start training from score 11.105223
[CV] END colsample_bytree=0.749816047538945, learning_rate=0.10507143064099161, max_depth=5, n_estimators=121, subsample=0.8394633936788146; total time=  46.3s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.679822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1428
[LightGBM] [Info] Number of data points in the train set: 1886734, number of used features: 15
[LightGBM] [Info] Start training from score 11.105223
[CV] END colsample_bytree=0.6571467271687763, learning_rate=0.07508884729488528, max_dept



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.680992 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1428
[LightGBM] [Info] Number of data points in the train set: 1886734, number of used features: 15
[LightGBM] [Info] Start training from score 11.105223
[CV] END colsample_bytree=0.6624074561769746, learning_rate=0.025599452033620268, max_depth=5, n_estimators=137, subsample=0.7334834444556088; total time=  51.9s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.691539 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1701
[LightGBM] [Info] Number of data points in the train set: 1886734, number of used features: 15
[LightGBM] [Info] Start training from score 12.031026
[CV] END colsample_bytree=0.6571467271687763, learning_rate=0.07508884729488528, max_depth=3, n_estimators=51, subsample=0.8887995089067299; total t

  test_data.fillna(method='ffill', inplace=True)
  test_data.fillna(method='bfill', inplace=True)
  test_data['age'] = pd.to_datetime(test_data['date']).dt.year - pd.to_datetime(test_data['dob']).dt.year
  test_data['age'] = pd.to_datetime(test_data['date']).dt.year - pd.to_datetime(test_data['dob']).dt.year


