In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # To split the data.
from sklearn.linear_model import LinearRegression # Linear Regression model.
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the read-only Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading the data.

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e11/train.csv',
        '/kaggle/input/playground-series-s3e11/test.csv',
    )
)

# Preview the first rows of the training data.
train_data.head()

In [None]:
# Dimensions of the training frame as (rows, columns).
train_data.shape

## Visualizing the training data set.

In [None]:
def draw_histogram_for_column(dataframe, column, bins=20):
    """Plot a histogram of one column of a DataFrame and show it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the data to plot.
    column : hashable
        Label of the column to plot. Non-string labels are converted with
        ``str`` for the axis label and the title.
    bins : int, default 20
        Bin specification forwarded to ``sns.histplot``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Column labels are not guaranteed to be strings; concatenating a
    # non-string label to the title would raise a TypeError.
    label = str(column)

    # Draw on an explicitly created axes instead of the implicit pyplot state.
    _, ax = plt.subplots()
    sns.histplot(data=dataframe, x=column, bins=bins, ax=ax)
    ax.set(xlabel=label, ylabel='Frequency', title=f'Histogram of {label}')
    plt.show()

In [None]:
# for column in train_data.columns.tolist():
#     draw_histogram_for_column(train_data, column)

In [None]:
# Pairwise correlation between the columns of the training data.
correlation_matrix = train_data.corr()

# Render the matrix as an annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Correlation of each column in the matrix with the target column 'cost'.
correlation_matrix['cost']

In [None]:
# Features whose absolute correlation with the target is below this value are dropped.
threshold = 0.005  # You can adjust this threshold as needed

# Correlation of every column with the target.
correlations = correlation_matrix['cost']

# Only genuine features are candidates for the low-correlation filter:
# the target itself must never land in the drop list (test_data is expected
# to have no 'cost' column), and 'id' is handled explicitly below.
feature_correlations = correlations.drop(labels=['cost', 'id'], errors='ignore')
low_corr_features = feature_correlations[feature_correlations.abs() < threshold].index

# 'id' is a row identifier rather than a predictor, so it is always removed
# instead of being kept whenever its correlation happens to exceed the threshold.
columns_to_drop = ['id', *low_corr_features]

# Drop the same columns from both frames so their feature columns stay aligned.
df_filtered = train_data.drop(columns=columns_to_drop)
test_data_filtered = test_data.drop(columns=columns_to_drop)

# The data is later passed to the scaler/model positionally, so the feature
# columns of both frames must match in name and order.
assert list(df_filtered.columns.drop('cost')) == list(test_data_filtered.columns), \
    "train and test feature columns differ after filtering"

df_filtered.head()

In [None]:
# Separate the filtered frame into a feature matrix (every column except the
# target) and the target vector, both as NumPy arrays.
features = df_filtered.drop(columns='cost')
X = features.to_numpy()
y = df_filtered['cost'].to_numpy()

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the test features that remain after the low-correlation columns were dropped.
test_data_filtered.head()

## Scaling the dataset.

In [None]:
# NOTE: this cell overwrites X_train / X_valid with their scaled versions.
# Re-running it on its own refits the scaler on already-scaled data, which
# would leave the test features effectively unscaled -- re-run the
# train/validation split cell first.
scaler = StandardScaler()

# Fit on the training split only so that no validation or test statistics
# influence the scaling parameters.
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)

# The scaler was fitted on a plain NumPy array, so pass the test features as
# an array as well; handing it a DataFrame triggers scikit-learn's
# "X has feature names, but StandardScaler was fitted without feature names" warning.
test_data_scaler = scaler.transform(test_data_filtered.to_numpy())

## Grid search

In [None]:
# model = XGBRegressor(random_state=42)
# # Define the hyperparameter grid to search over
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5]
# }

# # Create the HalvingGridSearchCV object
# grid_search = HalvingGridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     scoring='neg_mean_squared_error',  # Choose an appropriate scoring metric
#     factor=3,  # Factor by which the size of the grid will be reduced
#     cv=5,
#     verbose=1,
#     random_state=42
# )

# # Fit the grid search to your training data
# grid_search.fit(X_train, y_train)

# # Get the best estimator and hyperparameters
# best_estimator = grid_search.best_estimator_
# best_params = grid_search.best_params_

# print("Best Estimator:")
# print(best_estimator)

# print("\nBest Hyperparameters:")
# print(best_params)

## Best parameters.

Best Hyperparameters:
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

## Using the XGBoost model.

In [None]:
# Hyperparameters are the best combination reported by the grid search above.
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
model.fit(X_train, y_train)


## Predicting for the validation dataset.

In [None]:
# Predict the target for the held-out (already scaled) validation split.
y_pred_valid = model.predict(X_valid)

In [None]:
# Shape of the validation predictions; compare with the shape of y_valid.
y_pred_valid.shape

In [None]:
# Shape of the true validation targets; compare with the shape of y_pred_valid.
y_valid.shape

In [None]:
# Score the validation predictions against the true validation targets.
mse, r2 = (
    metric(y_valid, y_pred_valid)
    for metric in (mean_squared_error, r2_score)
)

print(
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R-squared (R2): {r2:.2f}",
    sep="\n",
)

## Validation error summary (MSE, lower is better).
- XGBoost model - 836.94
- Linear regression - 879.54

## Predicting the values for the test dataset.

In [None]:
# Predict the target for the scaled test features.
y_pred_test = model.predict(test_data_scaler)

In [None]:
# Build the submission frame in one step. The ids are taken as a plain array
# so they are paired with the predictions by position; assigning the Series
# into a frame with a fresh RangeIndex would align on index labels and
# silently produce NaN ids if test_data's index were not 0..n-1.
output_df = pd.DataFrame({
    'id': test_data['id'].to_numpy(),
    'cost': y_pred_test,
})

# Show only the first rows rather than the whole frame.
output_df.head()

In [None]:
# Write the predictions to the Kaggle working directory, omitting the index column.
output_df.to_csv('/kaggle/working/output.csv', index=False)