In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!cp '/content/drive/MyDrive/kaggle.json' ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c playground-series-s3e25
!unzip playground-series-s3e25.zip

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training data; the CSV's 'id' column becomes the row index.
data = pd.read_csv('train.csv', index_col='id')
# Preview the first rows.
data.head()

In [None]:
# Per-column flag: True where the column contains at least one missing value.
print(data.isnull().any())
# True if at least one row is an exact duplicate of an earlier row.
print(f'Duplicates?: {data.duplicated().any()}')
# Disabled: uncomment to remove the duplicated rows from `data` in place.
#data.drop_duplicates(inplace=True)

In [None]:
# Histogram of every numeric column. The figure is enlarged and passed through
# tight_layout so the per-column panels and their titles do not overlap, and
# plt.show() keeps the array-of-Axes repr out of the cell output.
data.hist(figsize=(16, 12))
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

In [None]:
# Pairwise correlations between all columns, computed once and reused for
# both the annotated heatmap and the tabular display below it.
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True)
corr_matrix

In [None]:
# Disabled experiment: would remove these two features from the training frame.
# NOTE(review): if this is re-enabled, any data later passed to the fitted
# scaler/models must have the same two columns removed.
#data.drop(['atomicweight_Average', 'density_Average'], axis=1, inplace=True)
# Preview the frame that the following cells work on.
data.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw one box plot per column to eyeball outliers.
# The grid is sized from the number of columns (6 per row) instead of a fixed
# 2x6 layout, so the cell keeps working if columns are added or dropped.
# With 12 columns this yields the same 2x6 grid and 16x12 figure as before.
columns_per_row = 6
n_rows = max(1, -(-len(data.columns) // columns_per_row))  # ceiling division

plt.figure(figsize=(16, 6 * n_rows))

for i, column in enumerate(data.columns):
    # plt.subplot takes (nrows, ncols, index) with a 1-based index.
    plt.subplot(n_rows, columns_per_row, i + 1)
    sns.boxplot(y=data[column])
    plt.title(column)

plt.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

In [None]:
from scipy.stats.mstats import winsorize

# Winsorize every column except the 'Hardness' target: with limits=0.1 the
# lowest 10% and the highest 10% of a column's values are replaced by the
# nearest value that is kept.
#
# The result is assigned back to the frame explicitly. The previous
# `winsorize(data[i], ..., inplace=True)` only changed `data` when pandas
# happened to hand out a writable view of the column, which is not guaranteed
# (and is not the case when copy-on-write is active).
#
# NOTE(review): this runs on the full frame before the train/test split and is
# never applied to test.csv — confirm that is intended.
for column in data.columns:
    if column != 'Hardness':
        data[column] = np.asarray(winsorize(data[column].to_numpy(), limits=0.1))

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Separate the features from the 'Hardness' target by name rather than by
# position, so the split does not depend on the target being the last column.
X = data.drop(columns='Hardness')
y = data['Hardness']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
# Learn each feature's min/max from the training split only; the held-out
# split is scaled with those same statistics.
scaler = MinMaxScaler().fit(X_train)

# Scale both splits and wrap the resulting arrays back into DataFrames that
# keep the original column labels and row index.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Preview the scaled training features.
X_train_scaled.head()

# Predicting

### Approach 1: scikit-learn regressors

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Baseline: ordinary least squares on the min-max scaled features.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the held-out split.
y_pred = model.predict(X_test_scaled)

# Evaluate. RMSE is derived from MSE with np.sqrt because the `squared=False`
# argument of mean_squared_error is deprecated and no longer accepted by
# newer scikit-learn releases.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

def regression_metrics(y_true, y_pred):
    """Return MAE, MSE, RMSE and R2 for a set of predictions.

    RMSE is computed as sqrt(MSE) rather than with
    ``mean_squared_error(..., squared=False)``, which is deprecated and no
    longer accepted by newer scikit-learn releases.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        Metric name ('MAE', 'MSE', 'RMSE', 'R2') mapped to its value.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'MSE': mse,
        'RMSE': float(np.sqrt(mse)),
        'R2': r2_score(y_true, y_pred),
    }


# Same seed as the train/test split, so the stochastic estimators give the
# same numbers on every run.
RANDOM_STATE = 42

# Candidate models, all with default hyperparameters.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'DecisionTree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'RandomForest': RandomForestRegressor(random_state=RANDOM_STATE),
    'GradientBoosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'SVR': SVR(),
    'MLPRegressor': MLPRegressor(random_state=RANDOM_STATE)
}

# Fit each model on the scaled training split and score it on the held-out split.
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    print(f'{name} trained.')

    y_pred = model.predict(X_test_scaled)
    results[name] = regression_metrics(y_test, y_pred)

# Report the metrics per model.
for name, metrics in results.items():
    print(f'Results for {name}:')
    for metric, value in metrics.items():
        print(f'    {metric}: {value:.4f}')

# Hyperparameter tuning for Random Forest: every combination in the grid is
# cross-validated and ranked by negated MSE (higher is better).
parameters = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
rf_clf = GridSearchCV(
    RandomForestRegressor(random_state=RANDOM_STATE),
    parameters,
    scoring='neg_mean_squared_error',
)
rf_clf.fit(X_train_scaled, y_train)

# Keep the winning configuration and its fitted estimator for later cells.
print(f'Best parameters for Random Forest: {rf_clf.best_params_}')
best_rf_model = rf_clf.best_estimator_

# Score the tuned Random Forest on the held-out split.
best_rf_y_pred = best_rf_model.predict(X_test_scaled)
for metric, value in regression_metrics(y_test, best_rf_y_pred).items():
    print(f'Random Forest - {metric}: {value}')


In [None]:
import pandas as pd

# Requires `X_train`, the fitted `scaler` and `best_rf_model` from earlier cells.

# Step 1: Load the test data
test_data = pd.read_csv('test.csv')

# Step 2: Select exactly the columns the scaler and model were fitted on, by
# name and in training order. The previous version dropped
# 'atomicweight_Average' and 'density_Average' here although the training
# frame still contains them, which makes scaler.transform fail on the
# feature mismatch. A column missing from test.csv now raises a KeyError.
test_features = test_data[X_train.columns]

# Scale with the scaler fitted on the training split, and keep the column
# labels so the model receives the same feature names it was trained with.
# NOTE(review): the training frame was winsorized before fitting, test.csv is
# not — confirm that is intended.
test_features_scaled = pd.DataFrame(
    scaler.transform(test_features),
    columns=X_train.columns,
    index=test_features.index,
)

# Step 3: Predict 'Hardness' using the tuned Random Forest
test_predictions = best_rf_model.predict(test_features_scaled)

# Step 4: Create a submission DataFrame
submission = pd.DataFrame({
    'id': test_data['id'],
    'Hardness': test_predictions
})

# Step 5: Save the predictions to a CSV file
submission.to_csv('submission.csv', index=False)

# Step 6: Submit the predictions to Kaggle (run in a terminal)
# kaggle competitions submit -c playground-series-s3e25 -f submission.csv -m "Message"

### Approach 2: XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Search space: 3 values for each of 5 hyperparameters (243 candidates).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

# Squared-error XGBoost regressor configured to run on a CUDA device.
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', device='cuda')

# Exhaustive 5-fold search; candidates are ranked by negated MSE (higher is better).
# NOTE(review): n_jobs=-1 evaluates candidates in parallel workers that all
# target the same CUDA device — confirm GPU memory is sufficient.
grid_search = GridSearchCV(
    xgb_reg,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Keep the selected estimator and its hyperparameters for the cells below.
best_model = grid_search.best_estimator_
best_parameters = grid_search.best_params_

print(f'Best parameters found: {best_parameters}')

# best_model can now be evaluated on the held-out split and used for predictions.

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Requires `scaler` (fitted on the training split) and `best_model` (the
# XGBoost estimator selected by the grid search) from earlier cells.

# Read the competition test set.
test_data = pd.read_csv('test.csv')

# Disabled: drop columns here only if the same columns were dropped before training.
#test_data.drop(['atomicweight_Average', 'density_Average'], axis=1, inplace=True)

# Everything after the leading column is treated as a feature
# (assumes 'id' is the first column of test.csv — TODO confirm).
test_features = test_data.iloc[:, 1:]
test_features_scaled = scaler.transform(test_features)

# Predict 'Hardness' with the selected XGBoost model.
test_predictions = best_model.predict(test_features_scaled)

# Two-column submission frame: the ids from test.csv next to their predictions.
submission = test_data[['id']].copy()
submission['Hardness'] = test_predictions

submission.to_csv('xgb_submission_1.csv', index=False)

# To submit, run the Kaggle CLI in a terminal (not in Python), e.g.:
# kaggle competitions submit -c playground-series-s3e25 -f xgb_submission_1.csv -m "XGB Model Predictions"


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, median_absolute_error

# Requires X_train_scaled, y_train and the fitted `scaler` from earlier cells.

# Rank candidates by median absolute error. greater_is_better=False makes the
# scorer report the negated error, so "higher is better" still holds.
medae_scorer = make_scorer(median_absolute_error, greater_is_better=False)

# Search space: 3 * 4 * 3 * 3 * 3 = 324 candidates.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 5, 7, 9],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

# 5-fold exhaustive search over a squared-error XGBoost regressor on a CUDA device.
xgb_model = XGBRegressor(objective='reg:squarederror', device='cuda')
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring=medae_scorer,
    cv=5,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# best_score_ holds the negated error, so the sign is flipped for display.
print(f'Best parameters found: {grid_search.best_params_}')
print(f'Best Median Absolute Error: {-grid_search.best_score_}')

best_xgb_model = grid_search.best_estimator_

# Score the competition test set with the selected model.
test_data = pd.read_csv('test.csv')

# Everything after the leading column is treated as a feature
# (assumes 'id' is the first column of test.csv — TODO confirm).
test_features = test_data.iloc[:, 1:]
test_features_scaled = scaler.transform(test_features)

test_predictions = best_xgb_model.predict(test_features_scaled)

# Two-column submission frame: the ids from test.csv next to their predictions.
submission = test_data[['id']].copy()
submission['Hardness'] = test_predictions

submission.to_csv('xgb_submission.csv', index=False)

# To submit, run the Kaggle CLI in a terminal (not in Python):
# kaggle competitions submit -c playground-series-s3e25 -f xgb_submission.csv -m "XGBRegressor with GridSearch"
