In [8]:
import numpy as np
import pandas as pd

# Read the three CSV files in a fixed order. The first row of each file is skipped and
# no header is inferred, so every column is addressed by position.
X_train_df, y_train_df, X_test_df = (
    pd.read_csv(csv_path, skiprows=1, header=None)
    for csv_path in ('./data/X_train.csv', './data/y_train.csv', './data/X_test.csv')
)

# Drop column 0 of every table (presumably a row id -- TODO confirm against the files)
# and keep the remaining columns as NumPy arrays; the target is flattened to 1-D.
X_train = X_train_df.values[:, 1:]
y_train = y_train_df.values[:, 1:].ravel()
X_test = X_test_df.values[:, 1:]

from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split
# reproducible. X_train / y_train are rebound to the remaining 80%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)
print("X_train: ", X_train.shape, "X_val: ", X_val.shape)

X_train:  (969, 832) X_val:  (243, 832)


Imputing with mean values

In [10]:
from sklearn.impute import SimpleImputer

# Fill every missing entry with its column mean, learned from the training rows only.
# This dense copy is what the outlier-detection and feature-scoring cells operate on.
mean_imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = mean_imputer.transform(X_train)

Outlier Detection with LOF

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

# Standardise the mean-imputed training features before running LOF on them.
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)

# fit_predict labels every row; rows labelled 1 are kept as inliers below.
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
outlier_labels = lof.fit_predict(X_train_scaled)
inlier_mask = outlier_labels == 1

# NOTE: X_train and y_train are rebound to the filtered rows here, while X_train_imputed
# keeps its pre-filter length. Re-running this cell without first re-running the cells
# above therefore applies a mask of the old length to the already-filtered arrays.
X_train_cleaned = X_train_imputed[inlier_mask]  # mean-imputed rows that were kept
X_train = X_train[inlier_mask]  # original (un-imputed) rows that were kept
y_train = y_train[inlier_mask]

print("Filtered X_train shape:", X_train_cleaned.shape)
print("Filtered y_train shape:", y_train.shape)

Filtered X_train shape: (920, 832)
Filtered y_train shape: (920,)


Feature selection with univariate F-test p-values (p < 0.05)

In [14]:
from sklearn.feature_selection import SelectKBest, f_regression

# Score every feature against the target with f_regression. k='all' keeps all columns,
# so the selector is used here only to obtain per-feature scores and p-values.
selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X_train_cleaned, y_train.ravel())

scores = selector.scores_
p_values = selector.pvalues_

# Positions of the columns whose p-value is below 0.05.
is_significant = p_values < 0.05
significant_feature_indices = np.where(is_significant)[0]

print(f"Number of significant features: {len(significant_feature_indices)}")

# Take the selected columns from the original (un-imputed) arrays of every split.
X_train_selected, X_val_selected, X_test_selected = (
    split[:, significant_feature_indices] for split in (X_train, X_val, X_test)
)

Number of significant features: 230


  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)


Impute missing values in the selected features

In [15]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Impute the selected columns: the imputer is fitted on the training rows only and then
# reused unchanged for the validation and test rows.
iterative_imputer = IterativeImputer()
X_train_selected_imputed = iterative_imputer.fit_transform(X_train_selected)
X_val_selected_imputed, X_test_selected_imputed = (
    iterative_imputer.transform(split) for split in (X_val_selected, X_test_selected)
)

Scaling

In [17]:
from sklearn.preprocessing import StandardScaler

# Standardise with statistics learned from the training rows only, then apply the same
# fitted scaler to all three splits. This rebinds `scaler` and `X_train_scaled`, which
# the outlier-detection cell also assigns.
scaler = StandardScaler().fit(X_train_selected_imputed)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split)
    for split in (X_train_selected_imputed, X_val_selected_imputed, X_test_selected_imputed)
)

In [18]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Hyperparameters for the boosted-tree regressor, collected in one place so they are
# easy to tune. random_state fixes the seed used by the model.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_alpha=0.5,
    reg_lambda=1.2,
    random_state=42,
)

xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)

# Predict on every split with the fitted model; only train and validation have targets
# to score against.
y_train_pred, y_val_pred, y_test_pred = (
    xgb_model.predict(split) for split in (X_train_scaled, X_val_scaled, X_test_scaled)
)

print("Train R^2 Score:", r2_score(y_train, y_train_pred))
print("Validation R^2 Score:", r2_score(y_val, y_val_pred))

Train R^2 Score: 0.9997811445232354
Validation R^2 Score: 0.547337378272673


The model overfits severely (train R² ≈ 1.00 vs. validation R² ≈ 0.55), but no better configuration has been found yet.

Export predictions to CSV

In [None]:
from pathlib import Path

# Write the test-set predictions as a two-column table: `id` runs 0..n-1 in the row
# order of y_test_pred, and `y` holds the flattened predictions.
predictions_path = Path('./data/predictions/y_test_pred.csv')
# DataFrame.to_csv does not create missing directories, so make sure the target folder
# exists before writing (a no-op when it is already there).
predictions_path.parent.mkdir(parents=True, exist_ok=True)

table = pd.DataFrame({'id': np.arange(0, y_test_pred.shape[0]), 'y': y_test_pred.flatten()})
print(table.shape)
table.to_csv(predictions_path, index=False)