## Data Preprocessing and Feature Selection

In [7]:
import os
import random
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, root_mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [8]:
with open('dropped_features.txt', 'w') as f:
    f.write("=== Dropped Features Log ===\n\n")

# Load training data
train_data = pd.read_csv('train.csv')
X_train = train_data.iloc[:, :-1]  # All columns except target
Y_train = train_data.iloc[:, -1]   # Target column

# Load test data
test_data = pd.read_csv('test.csv')
test_ids = test_data['Id']             # Save IDs for later
X_test = test_data.drop('Id', axis=1)  # Remove ID column

In [9]:
# Print the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Target variable shape: {Y_train.shape}")

Training data shape: (8250, 40)
Test data shape: (5500, 40)
Target variable shape: (8250,)


In [10]:
# Check size
n_samples, n_features = X_train.shape
print(f"Training set: {n_samples} samples, {n_features} features")

n_samples_test, n_features_test = X_test.shape
print(f"Test set: {n_samples_test} samples, {n_features_test} features")

# Check missing values
missing_values = X_train.isnull().sum()
if missing_values[missing_values > 0].empty:
    print("No missing values in training data.")
else:
    print("Missing values in training data:\n", missing_values[missing_values > 0])

missing_values_test = X_test.isnull().sum()
if missing_values_test[missing_values_test > 0].empty:
    print("No missing values in test data.")
else:
    print("Missing values in test data:\n", missing_values_test[missing_values_test > 0])

# Check duplicate rows
duplicate_rows = X_train.duplicated().sum()
print(f"Number of duplicate rows in training data: {duplicate_rows}")

duplicate_rows_test = X_test.duplicated().sum()
print(f"Number of duplicate rows in test data: {duplicate_rows_test}")

# Check constant features
constant_features = X_train.columns[X_train.nunique() <= 1]
if constant_features.empty:
    print("No constant features in training data.")
else:
    print("Constant features in training data:\n", constant_features.tolist())

constant_features_test = X_test.columns[X_test.nunique() <= 1]
if constant_features_test.empty:
    print("No constant features in test data.")
else:
    print("Constant features in test data:\n", constant_features_test.tolist())

Training set: 8250 samples, 40 features
Test set: 5500 samples, 40 features
No missing values in training data.
No missing values in test data.
Number of duplicate rows in training data: 0
Number of duplicate rows in test data: 0
No constant features in training data.
No constant features in test data.


In [11]:
# Feature selection based on training data only
p_values = []
correlations = []
for col in X_train.columns:
    corr, p_val = stats.pearsonr(X_train[col], Y_train)
    correlations.append(corr)
    p_values.append(p_val)
print("Feature correlations with target variable:")
print(correlations)
print("Feature p-values with target variable:")
print(p_values)

Feature correlations with target variable:
[0.07964190054974579, -0.06128885266004798, 0.33442013907309565, 0.3969744453356405, 0.3080799621532907, 0.10601459861929727, -0.7045147870681104, -0.056970294145277905, -0.052774188141616725, -0.07356026312089133, 0.6413120025165503, 0.6404046428202611, 0.6404046428202611, 0.6403982947247879, 0.6404133411853214, 0.6388169578630332, 0.6388288800726144, 0.6363738722472071, 0.6363738722472071, 0.6335668974934874, 0.6337070768152012, 0.630812268208842, 0.630761169874003, 0.6283347894625324, 0.03530695033232792, -0.009046901236544908, 0.016616528910079087, -0.019433185595649934, 0.010640925795167937, -0.019433185595649934, 0.03211506117122081, -0.017202457805639306, 0.0402020337875995, -0.0028748441628541066, 0.027329701051732613, -0.008917829140455918, 0.025826204301336295, -0.024296226752424314, 0.6159202211674143, 0.6282719851321914]
Feature p-values with target variable:
[4.348468950928982e-13, 2.5294625218109232e-08, 1.0073647425945063e-214, 

In [12]:
# # Plot the correlation matrix
# import seaborn as sns
# import matplotlib.pyplot as plt
# import matplotlib.dates as mdates

# Xy_train = X_train.copy()
# Xy_train['target'] = Y_train

# plt.figure(figsize=(14, 10))
# sns.heatmap(Xy_train.corr(), annot=False, fmt=".2f", cmap='coolwarm', cbar=True, square=True)
# plt.title('Correlation Matrix')
# plt.xticks(rotation=45, ha='right')
# plt.yticks(rotation=0)
# plt.tight_layout()
# plt.savefig('correlation_matrix.png')
# plt.show()

In [13]:
feature_stats = pd.DataFrame({
    'Feature': X_train.columns,
    'Correlation': correlations,
    'P-value': p_values
})

feature_stats['Abs_Correlation'] = abs(feature_stats['Correlation'])
feature_stats = feature_stats.sort_values('Abs_Correlation', ascending=False)
feature_stats = feature_stats.drop('Abs_Correlation', axis=1)

with open('dropped_features.txt', 'a') as f:
    f.write("Feature Correlation Analysis:\n")
    f.write(feature_stats.to_string())
    f.write("\n\n")

print("Feature Correlation Analysis:")
print(feature_stats)

Feature Correlation Analysis:
             Feature  Correlation        P-value
6     absoluate_roll    -0.704515   0.000000e+00
10             time1     0.641312   0.000000e+00
14             time5     0.640413   0.000000e+00
11             time2     0.640405   0.000000e+00
12             time3     0.640405   0.000000e+00
13             time4     0.640398   0.000000e+00
16             time7     0.638829   0.000000e+00
15             time6     0.638817   0.000000e+00
18             time9     0.636374   0.000000e+00
17             time8     0.636374   0.000000e+00
20            time11     0.633707   0.000000e+00
19            time10     0.633567   0.000000e+00
21            time12     0.630812   0.000000e+00
22            time13     0.630761   0.000000e+00
23            time14     0.628335   0.000000e+00
39               set     0.628272   0.000000e+00
38             omega     0.615920   0.000000e+00
3                  n     0.396974  1.608501e-309
2                  m     0.334420  1.00

In [14]:
# Step 1: Drop features with p-value > 0.05
cols_to_drop_1 = X_train.columns[np.array(p_values) > 0.05] # Drop features with p-value > 0.05
with open('dropped_features.txt', 'a') as f:
    f.write("Step 1 - Dropped features (p-value > 0.05):\n")
    f.write(", ".join(cols_to_drop_1) + "\n\n")

X_train_filtered = X_train.drop(columns=cols_to_drop_1)
feature_names = X_train_filtered.columns.tolist()

# Step 2: Analyze feature correlations
p_value_matrix = np.zeros((X_train_filtered.shape[1], X_train_filtered.shape[1]))
for i in range(X_train_filtered.shape[1]):
    for j in range(X_train_filtered.shape[1]):
        _, p_val = stats.pearsonr(X_train_filtered.iloc[:, i], X_train_filtered.iloc[:, j])
        p_value_matrix[i, j] = p_val

p_value_counts = np.sum(p_value_matrix == 0, axis=1)

with open('dropped_features.txt', 'a') as f:
    f.write("Step 2 - Count of zero p-values between features:\n")
    for name, count in zip(feature_names, p_value_counts):
        f.write(f"{name}: {count}\n")
    f.write("\n")

cols_to_drop_indices = np.where(p_value_counts >= 10)[0]
cols_to_drop_2 = [
    feature_names[i] 
    for i in cols_to_drop_indices 
    if feature_names[i] != 'time1'
]

with open('dropped_features.txt', 'a') as f:
    f.write("Step 2 - Dropped features:\n")
    f.write(", ".join(cols_to_drop_2) + "\n\n")

# Apply the same feature dropping to both train and test
X_train_final = X_train_filtered.drop(columns=cols_to_drop_2)
X_test_final = X_test.drop(columns=list(cols_to_drop_1) + cols_to_drop_2)

# Save the list of final features for future reference
final_features = X_train_final.columns.tolist()
with open('final_features.txt', 'w') as f:
    f.write('\n'.join(final_features))

In [15]:
# Feature scaling
standard_scaler_features = ['acc_rate', 'track', 'current_roll', 'climb_delta', 'roll_rate_delta', 'climb_delta_diff']
robust_scaler_features = ['m', 'absoluate_roll', 'time1']
minmax_scaler_features = ['time1_delta', 'time7_delta', 'time9_delta', 'time11_delta', 'time13_delta', 'time14_delta']

# Filter scaling feature lists to only include columns that remain after feature selection
standard_scaler_features = [f for f in standard_scaler_features if f in X_train_final.columns]
robust_scaler_features = [f for f in robust_scaler_features if f in X_train_final.columns]
minmax_scaler_features = [f for f in minmax_scaler_features if f in X_train_final.columns]

# Create column transformer for feature scaling
column_trans = ColumnTransformer([
    ("standard", StandardScaler(), standard_scaler_features),
    ("robust", RobustScaler(), robust_scaler_features),
    ("minmax", MinMaxScaler(feature_range=(-1,1)), minmax_scaler_features)
], remainder='passthrough')

# Fit the transformer on training data only
column_trans.fit(X_train_final)

# Save the transformer for future use
with open('feature_transformer.pkl', 'wb') as f:
    pickle.dump(column_trans, f)

# Transform both train and test data
X_train_scaled = column_trans.transform(X_train_final)
X_test_scaled = column_trans.transform(X_test_final)

# Get the column names in the correct order after transformation
transformed_columns = (
    standard_scaler_features + 
    robust_scaler_features + 
    minmax_scaler_features + 
    [col for col in X_train_final.columns if col not in 
     standard_scaler_features + robust_scaler_features + minmax_scaler_features]
)

In [16]:
# Save processed datasets
pd.DataFrame(X_train_scaled, columns=transformed_columns).to_csv('X_train_processed.csv', index=False)
Y_train.to_csv('Y_train_processed.csv', index=False)
X_test_df = pd.DataFrame(X_test_scaled, columns=transformed_columns)
X_test_df.to_csv('X_test_processed.csv', index=False)

# Save full preprocessed datasets with proper IDs for training and submission
train_processed = pd.DataFrame(X_train_scaled, columns=transformed_columns)
train_processed['target'] = Y_train.values
train_processed.to_csv('train_processed.csv', index=False)

test_processed = pd.DataFrame(X_test_scaled, columns=transformed_columns)
test_processed.insert(0, 'Id', test_ids)  # Add Id column back at the beginning
test_processed.to_csv('test_processed.csv', index=False)

print(f"Processed {X_train_scaled.shape[0]} training samples with {X_train_scaled.shape[1]} features")
print(f"Processed {X_test_scaled.shape[0]} test samples with {X_test_scaled.shape[1]} features")
print("Feature selection and scaling complete. Files saved.")

Processed 8250 training samples with 15 features
Processed 5500 test samples with 15 features
Feature selection and scaling complete. Files saved.


In [17]:
# Print the size of the final datasets
print(f"Final training data shape: {X_train_scaled.shape}")
print(f"Final test data shape: {X_test_scaled.shape}")
print(f"Final features: {transformed_columns}")
print(f"Number of features after selection: {len(transformed_columns)}")
print(f"Number of features dropped: {len(X_train.columns) - len(transformed_columns)}")

Final training data shape: (8250, 15)
Final test data shape: (5500, 15)
Final features: ['acc_rate', 'track', 'current_roll', 'climb_delta', 'roll_rate_delta', 'climb_delta_diff', 'm', 'absoluate_roll', 'time1', 'time1_delta', 'time7_delta', 'time9_delta', 'time11_delta', 'time13_delta', 'time14_delta']
Number of features after selection: 15
Number of features dropped: 25


## Training Model and Hyperparameter Tuning

In [24]:
# Create best_model folder if not exists
os.makedirs('best_model', exist_ok=True)

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
train_processed = pd.read_csv('train_processed.csv')
X = train_processed.iloc[:, :-1].values
Y = train_processed.iloc[:, -1].values.reshape(-1, 1)

test_processed = pd.read_csv('test_processed.csv')
test_ids = test_processed['Id'].values
X_test = test_processed.drop('Id', axis=1).values

print(f"Training data shape: {X.shape}")
print(f"Test data shape: {X_test.shape}")

# Define model with Dropout and Batch Normalization
class RegressionANN(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.BatchNorm1d(64),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(32, 16),
            nn.BatchNorm1d(16),
            nn.Tanh(),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.layer(x)

# Random search space
def sample_hyperparams():
    return {
        'learning_rate': 10**random.uniform(-4, -2),
        'batch_size': random.choice([8, 16, 32, 64]),
        'weight_decay': 10**random.uniform(-5, -3)
    }

# Hyperparameters
total_epochs = 500
patience = 30
n_random_search = 10  # Number of random trials

best_global_rmse = np.inf
best_hyperparams = None
random_search_results = []

Using device: cpu
Training data shape: (8250, 15)
Test data shape: (5500, 15)


In [None]:
for trial in range(n_random_search):
    print(f"\nRandom Search Trial {trial+1}/{n_random_search}")
    params = sample_hyperparams()
    learning_rate = params['learning_rate']
    batch_size = params['batch_size']
    weight_decay = params['weight_decay']

    print(f"Trying hyperparams: lr={learning_rate:.5f}, batch_size={batch_size}, weight_decay={weight_decay:.5f}")

    # KFold Cross Validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold = 1
    val_rmse_list = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        Y_train, Y_val = Y[train_index], Y[val_index]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(Y_train, dtype=torch.float32))
        val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(Y_val, dtype=torch.float32))

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        model = RegressionANN(X.shape[1]).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        criterion = nn.MSELoss()

        best_val_rmse = np.inf
        patience_counter = 0

        for epoch in range(total_epochs):
            model.train()
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

            model.eval()
            val_preds = []
            val_targets = []
            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    outputs = model(inputs)
                    val_preds.append(outputs.cpu().numpy())
                    val_targets.append(targets.cpu().numpy())

            val_preds = np.vstack(val_preds)
            val_targets = np.vstack(val_targets)

            val_rmse = root_mean_squared_error(val_targets, val_preds)

            if val_rmse < best_val_rmse:
                best_val_rmse = val_rmse
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                break

        val_rmse_list.append(best_val_rmse)
        fold += 1

    avg_val_rmse = np.mean(val_rmse_list)
    random_search_results.append((params, avg_val_rmse))
    print(f"Avg Validation RMSE for this setting: {avg_val_rmse:.5f}")

    if avg_val_rmse < best_global_rmse:
        best_global_rmse = avg_val_rmse
        best_hyperparams = params

print(f"\nBest Hyperparameters Found: {best_hyperparams}")
print(f"Best Validation RMSE: {best_global_rmse:.5f}")

In [37]:
# # Plot Hyperparameters vs Validation RMSE
# lrs = [np.log10(r[0]['learning_rate']) for r in random_search_results]
# weight_decays = [np.log10(r[0]['weight_decay']) for r in random_search_results]
# batch_sizes = [r[0]['batch_size'] for r in random_search_results]
# val_rmses = [r[1] for r in random_search_results]

# plt.figure()
# plt.scatter(lrs, val_rmses)
# plt.xlabel('log10(Learning Rate)')
# plt.ylabel('Validation RMSE')
# plt.title('Learning Rate vs Validation RMSE')
# plt.show()

# plt.figure()
# plt.scatter(weight_decays, val_rmses)
# plt.xlabel('log10(Weight Decay)')
# plt.ylabel('Validation RMSE')
# plt.title('Weight Decay vs Validation RMSE')
# plt.show()

# plt.figure()
# plt.scatter(batch_sizes, val_rmses)
# plt.xlabel('Batch Size')
# plt.ylabel('Validation RMSE')
# plt.title('Batch Size vs Validation RMSE')
# plt.show()

In [None]:
# # 定义 best_hyperparams
# best_hyperparams = {
#     'learning_rate': 0.0050705147268348535,
#     'batch_size': 64,
#     'weight_decay': 2.0472230821749646e-05
# }

# # 转为 DataFrame 并保存为 CSV
# df = pd.DataFrame([best_hyperparams])
# df.to_csv('best_hyperparams.csv', index=False)

In [3]:
# # 读取 CSV
# df = pd.read_csv('best_hyperparams.csv')

# # 提取参数
# best_hyperparams = df.iloc[0].to_dict()

In [2]:
# Retrain with best hyperparameters
learning_rate = best_hyperparams['learning_rate']
batch_size = best_hyperparams['batch_size']
weight_decay = best_hyperparams['weight_decay']

kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_test_preds = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    X_train, X_val = X[train_idx], X[val_idx]
    Y_train, Y_val = Y[train_idx], Y[val_idx]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(Y_train, dtype=torch.float32))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model = RegressionANN(X.shape[1]).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.MSELoss()

    best_val_rmse = np.inf
    patience_counter = 0

    for epoch in range(total_epochs):
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            val_inputs = torch.tensor(X_val, dtype=torch.float32).to(device)
            val_targets = torch.tensor(Y_val, dtype=torch.float32).to(device)
            outputs = model(val_inputs)
            val_preds = outputs.cpu().numpy()
            val_targets = val_targets.cpu().numpy()

        val_rmse = root_mean_squared_error(val_targets, val_preds)

        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            patience_counter = 0
            torch.save(model.state_dict(), f'best_model/final_best_model_fold{fold}.pt')
        else:
            patience_counter += 1

        if patience_counter >= patience:
            break

    model.load_state_dict(torch.load(f'best_model/final_best_model_fold{fold}.pt'))
    model.eval()
    with torch.no_grad():
        X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
        test_pred = model(X_test_tensor).cpu().numpy()
        all_test_preds.append(test_pred)

# Ensemble prediction
test_pred_final = np.mean(all_test_preds, axis=0)

# Save submission
submission = pd.DataFrame({
    'Id': test_ids,
    'target': test_pred_final.flatten()
})
submission.to_csv('submission.csv', index=False)
print("\nFinal predictions saved to submission.csv")

NameError: name 'best_hyperparams' is not defined