In [1]:
# import normal libraries
import pandas as pd
import numpy as np

In [2]:
# Import datasets.
# All three competition files are read from one directory; change DATA_DIR
# (and only DATA_DIR) to point at your own copy of the data.
DATA_DIR = '/kaggle/input/volatility-smile-prediction'

# Test set: the frame whose missing values are imputed below.
test_data = pd.read_parquet(f'{DATA_DIR}/test_data.parquet')
# Training set and sample submission are loaded for reference.
train_data = pd.read_parquet(f'{DATA_DIR}/train_data.parquet')
sample_data = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [3]:
# Convert feature/target columns to float32 for memory efficiency.
# 'timestamp' is deliberately left in its original dtype: it is the row
# identifier written to the submission as an integer, and float32 can only
# represent integers exactly up to 2**24, so casting it could corrupt ids.
float32_cols = [c for c in test_data.columns if c != 'timestamp']
test_data = test_data.astype({c: 'float32' for c in float32_cols})

In [4]:
# Select predictors: X0 to X41 + underlying (timestamp is NOT used as a feature)
predictor_cols = [f"X{i}" for i in range(42)] + ["underlying"]
predictors = test_data[predictor_cols]

# Columns to impute: every non-predictor column with at least one missing value.
# Predictor columns are excluded here because they are already part of
# `predictors`; selecting them a second time would duplicate their labels
# in impute_df.
has_missing = test_data.isnull().any().to_numpy()
is_predictor = test_data.columns.isin(predictor_cols)
iv_cols = test_data.columns[has_missing & ~is_predictor].tolist()

# Combine predictors and targets (predictors first, then the columns to impute)
impute_df = pd.concat([predictors, test_data[iv_cols]], axis=1)

In [5]:
# Preview the first five rows of the matrix that is handed to the imputer
impute_df.head(n=5)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,put_iv_24600,put_iv_24700,put_iv_24800,put_iv_24900,put_iv_25000,put_iv_25100,put_iv_25200,put_iv_25300,put_iv_25400,put_iv_25500
0,-2377420.0,-0.066961,-0.064562,-0.068452,23078.869141,117442.710938,-55935.98,-0.0,-0.001486,0.0,...,0.232334,,,0.227301,0.234169,,0.250422,,,0.282229
1,6703629.0,0.030899,-0.031677,-0.033302,35119.902344,-153269.53125,-108493.2,0.0,0.002381,-0.0,...,,0.228209,,,,0.249402,,,,
2,2245442.25,-0.066279,-0.162606,-0.233426,-77723.03125,26958.333984,474096.7,0.0,0.000682,-0.0,...,0.194612,0.188052,,,0.181346,,0.19075,,,
3,-17185772.0,-0.046657,-0.497496,-0.321425,227847.546875,-114322.914062,130835.3,0.0,-0.000204,0.0,...,,0.166394,0.161561,,,0.172032,,,,0.206107
4,7473684.0,-0.440429,0.554007,-0.31975,481455.71875,9713.541992,1064944.0,0.0,0.000594,-0.0,...,,,,0.17792,,0.176,,,0.182314,


In [6]:
# The experimental flag has to be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

# Hyper-parameters of the Extra Trees model used as the per-column regressor
et_params = dict(
    n_estimators=800,
    max_depth=35,
    max_features=0.2,
    min_samples_leaf=2,
    criterion='squared_error',
    bootstrap=False,
    warm_start=False,
    random_state=0,
    n_jobs=-1,
)
reg_et = ExtraTreesRegressor(**et_params)

# Settings of the iterative (round-robin) imputer built around reg_et.
# tol=0: the verbose log reports "scaled tolerance: 0.0", so the early-stopping
# threshold is zero and presumably all max_iter rounds are executed
# (roughly 73 s per round in the recorded run) - confirm against sklearn docs.
imputer_params = dict(
    estimator=reg_et,
    max_iter=25,
    tol=0,
    imputation_order='roman',
    skip_complete=True,
    random_state=0,
    verbose=2,
)
imputer_et = IterativeImputer(**imputer_params)

# Fit on impute_df and fill in its missing entries in one pass
final = imputer_et.fit_transform(impute_df)

[IterativeImputer] Completing matrix with shape (12065, 95)
[IterativeImputer] Ending imputation round 1/25, elapsed time 72.64
[IterativeImputer] Change: 2.1457180976867676, scaled tolerance: 0.0 
[IterativeImputer] Ending imputation round 2/25, elapsed time 144.98
[IterativeImputer] Change: 0.09855930507183075, scaled tolerance: 0.0 
[IterativeImputer] Ending imputation round 3/25, elapsed time 218.02
[IterativeImputer] Change: 0.03522367775440216, scaled tolerance: 0.0 
[IterativeImputer] Ending imputation round 4/25, elapsed time 290.97
[IterativeImputer] Change: 0.028398677706718445, scaled tolerance: 0.0 
[IterativeImputer] Ending imputation round 5/25, elapsed time 364.67
[IterativeImputer] Change: 0.02082820236682892, scaled tolerance: 0.0 
[IterativeImputer] Ending imputation round 6/25, elapsed time 439.01
[IterativeImputer] Change: 0.016025185585021973, scaled tolerance: 0.0 
[IterativeImputer] Ending imputation round 7/25, elapsed time 513.19
[IterativeImputer] Change: 0.01



In [14]:
# Convert the imputer output back to a DataFrame that carries the same column
# labels AND the same row index as impute_df. Keeping the index matters
# because later steps combine this frame with test_data columns by index;
# a fresh default index could misalign rows if test_data is not 0..n-1.
final = pd.DataFrame(final, columns=impute_df.columns, index=impute_df.index)

# Display the first few rows of the final DataFrame
final.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,put_iv_24600,put_iv_24700,put_iv_24800,put_iv_24900,put_iv_25000,put_iv_25100,put_iv_25200,put_iv_25300,put_iv_25400,put_iv_25500
0,-2377420.0,-0.066961,-0.064562,-0.068452,23078.869141,117442.710938,-55935.98,-0.0,-0.001486,0.0,...,0.232334,0.226131,0.222216,0.227301,0.234169,0.244403,0.250422,0.258141,0.273116,0.282229
1,6703629.0,0.030899,-0.031677,-0.033302,35119.902344,-153269.53125,-108493.2,0.0,0.002381,-0.0,...,0.234148,0.228209,0.224931,0.230033,0.239004,0.249402,0.258144,0.264064,0.271621,0.284078
2,2245442.25,-0.066279,-0.162606,-0.233426,-77723.03125,26958.333984,474096.7,0.0,0.000682,-0.0,...,0.194612,0.188052,0.183017,0.180678,0.181346,0.185132,0.19075,0.196855,0.204597,0.212107
3,-17185772.0,-0.046657,-0.497496,-0.321425,227847.546875,-114322.914062,130835.3,0.0,-0.000204,0.0,...,0.173742,0.166394,0.161561,0.160448,0.164596,0.172032,0.181169,0.189764,0.197595,0.206107
4,7473684.0,-0.440429,0.554007,-0.31975,481455.71875,9713.541992,1064944.0,0.0,0.000594,-0.0,...,0.192943,0.186458,0.181847,0.17792,0.17575,0.176,0.177532,0.178832,0.182314,0.186403


In [16]:
# Prepare the final DataFrame for submission.
# Keep only the imputed columns by dropping every predictor (X0 to X41 +
# underlying) by name, instead of relying on a hard-coded positional slice.
df = final.drop(columns=predictor_cols)

# Convert the data types of the final DataFrame to float64 for consistency
df = df.astype('float64')

# Add timestamp as the first column of the submission, converted to int.
# `final` does not contain timestamp, so it is taken from test_data. The raw
# values are inserted positionally (rows of `final` are in test_data order),
# which avoids any index-alignment mismatch between the two frames.
timestamp_values = test_data['timestamp'].astype('int').to_numpy()
df.insert(0, 'timestamp', timestamp_values)

# Display the shape of the final DataFrame
print("Final DataFrame shape:", df.shape)

# Display the first few rows of the final DataFrame
df.head()

Final DataFrame shape: (12065, 53)


Unnamed: 0,timestamp,call_iv_24000,call_iv_24100,call_iv_24200,call_iv_24300,call_iv_24400,call_iv_24500,call_iv_24600,call_iv_24700,call_iv_24800,...,put_iv_24600,put_iv_24700,put_iv_24800,put_iv_24900,put_iv_25000,put_iv_25100,put_iv_25200,put_iv_25300,put_iv_25400,put_iv_25500
0,0,0.280939,0.267904,0.257148,0.250008,0.242149,0.238095,0.232439,0.226047,0.222997,...,0.232334,0.226131,0.222216,0.227301,0.234169,0.244403,0.250422,0.258141,0.273116,0.282229
1,1,0.270276,0.268842,0.258893,0.25023,0.244276,0.238992,0.233548,0.22794,0.225056,...,0.234148,0.228209,0.224931,0.230033,0.239004,0.249402,0.258144,0.264064,0.271621,0.284078
2,2,0.256432,0.251731,0.23665,0.224854,0.214869,0.20458,0.194604,0.18821,0.1832,...,0.194612,0.188052,0.183017,0.180678,0.181346,0.185132,0.19075,0.196855,0.204597,0.212107
3,3,0.241888,0.23019,0.220505,0.208795,0.198602,0.18619,0.174353,0.166617,0.161614,...,0.173742,0.166394,0.161561,0.160448,0.164596,0.172032,0.181169,0.189764,0.197595,0.206107
4,4,0.235328,0.230086,0.222983,0.214126,0.20614,0.199226,0.192603,0.186454,0.181901,...,0.192943,0.186458,0.181847,0.17792,0.17575,0.176,0.177532,0.178832,0.182314,0.186403


In [17]:
# Write the submission table to disk (without the index column) and report completion
submission_path = 'submission.csv'
df.to_csv(submission_path, index=False)
print("Done...")

Done...
