In [96]:
import pandas as pd
import numpy as np

In [97]:
# Load the modelling dataset.
#
# Other input files tried in earlier runs (swap the name below to re-run them):
#   "top_21_features_RF.csv"
#   "train.csv"
data_file = "top_feature_names.csv"

df = pd.read_csv(data_file)


In [98]:
# Show the column labels of the loaded frame (features plus the target).
df.columns

Index(['X314', 'X315', 'X119', 'X263', 'X118', 'X127', 'X136', 'y'], dtype='object')

In [99]:
# Preview the first rows of the frame as a quick sanity check of the load.
df.head()

Unnamed: 0,X314,X315,X119,X263,X118,X127,X136,y
0,0,0,1,1,1,0,1,130.81
1,0,0,1,1,1,1,1,88.53
2,0,0,0,0,0,0,0,76.26
3,0,0,0,0,0,0,0,80.62
4,0,0,0,0,0,0,0,78.02


In [100]:
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# `df` is the frame loaded earlier in the notebook.

# Separate the regression target from the predictor columns.
target_name = 'y'
y = df[target_name]
X = df.drop(columns=[target_name])

# Partition the predictors by dtype: object-typed columns are handled as
# categorical, every other dtype as numerical.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Numerical branch: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Baseline regressor with library-default hyperparameters.
model = XGBRegressor()

# Chain preprocessing and the regressor so both are fitted together.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the full preprocessing + XGBoost pipeline on the training split.
pipeline.fit(X_train, y_train)

# Predict on both splits so the train and test scores can be compared.
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where passing it raises a TypeError. np.sqrt works on every version and
# matches how the tuned model is scored later in this notebook.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("RMSE on training set:", rmse_train)
print("RMSE on test set:", rmse_test)

# R^2 (coefficient of determination) on each split.
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

# Reported as a percentage rounded to one decimal place.
print("R^2 score on training set:", np.round(r2_train*100,1),"%")
print("R^2 score on test set:", np.round(r2_test*100,1),"%")


RMSE on training set: 8.412021198069311
RMSE on test set: 7.978246729703271
R^2 score on training set: 56.3 %
R^2 score on test set: 59.1 %




In [101]:

# Fit the preprocessing steps on the training split only, then apply the
# fitted transformer unchanged to the held-out split.
X_train_processed = preprocessor.fit_transform(X=X_train)
X_test_processed = preprocessor.transform(X=X_test)

In [102]:
# Hyperparameter values to search exhaustively (3 x 3 x 3 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
)

# Fresh regressor that the search clones for every candidate.
xgb_model = XGBRegressor()

# 5-fold cross-validated search, ranked by R^2, using all available cores.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

# Run the search on the preprocessed training data.
grid_search.fit(X_train_processed, y_train)

# Keep the winning configuration and the estimator refitted with it.
best_parameters = grid_search.best_params_
best_xgb_model = grid_search.best_estimator_

print("Best Parameters:", best_parameters)

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}


In [103]:
# Score the tuned model on the preprocessed training split.
y_train_pred = best_xgb_model.predict(X_train_processed)
RMSE_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
r2_train = r2_score(y_train, y_train_pred)

# Score the tuned model on the preprocessed hold-out split.
y_test_pred = best_xgb_model.predict(X_test_processed)
RMSE_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2_test = r2_score(y_test, y_test_pred)

# Report RMSE first, then R^2 as a percentage rounded to one decimal place.
print(f"Training RMSE score: {RMSE_train}")
print(f"Testing RMSE score: {RMSE_test}")

print(f"\nTraining R^2 score: {np.round(r2_train*100,1)}%")
print(f"Testing R^2 score: {np.round(r2_test*100,1)}%")

Training RMSE score: 8.446219907946032
Testing RMSE score: 7.905551673730968

Training R^2 score: 56.0%
Testing R^2 score: 59.8%


In [104]:
# predict test

# final_test_df = pd.read_csv("test.csv")

# X_test = final_test_df
# X_test_encoded = preprocessor.transform(X_test)

# y_test_pred = best_xgb_model.predict(X_test_encoded)
# y_test_pred


# results_df = pd.DataFrame({"ID": X_test["ID"],"y": y_test_pred})

# # Save the DataFrame to a CSV file
# results_df.to_csv("xgboost_all_features.csv", index=False)



In [105]:
# Result with all features: 
# Training RMSE score: 7.987454825841536
# Testing RMSE score: 7.8865637890695455

# Training R^2 score: 60.6%
# Testing R^2 score: 60.0%

# Result with just top 21 features: 
# Training RMSE score: 8.154930588873281
# Testing RMSE score: 7.818779802088263

# Training R^2 score: 58.9%
# Testing R^2 score: 60.7%


# Result with just top 7 features: 
# Training RMSE score: 8.446219907946032
# Testing RMSE score: 7.905551673730968

# Training R^2 score: 56.0%
# Testing R^2 score: 59.8% (but the actual submission scored only 54.65%!)

In [106]:
# Score the competition test file with the tuned model.
# NOTE: this rebinds X_test and y_test_pred, which until now referred to the
# hold-out split. Cells above that read those names would operate on the
# submission data if re-run after this cell.
final_test_df = X_test = pd.read_csv("test.csv")

# Reuse the already-fitted preprocessor; it is not refitted on this file.
X_test_encoded = preprocessor.transform(X_test)

# Predict and display the submission targets.
y_test_pred = best_xgb_model.predict(X_test_encoded)
y_test_pred

array([ 78.83769,  94.62347,  78.83769, ...,  94.39399, 112.24395,
        94.62347], dtype=float32)

In [107]:
# Pair each test-file ID with its predicted target. `final_test_df` is the
# frame read from test.csv (the same object X_test is bound to at this point).
submission_ids = final_test_df["ID"]
results_df = pd.DataFrame({"ID": submission_ids, "y": y_test_pred})

# Write the submission file without the pandas index column.
results_df.to_csv("xgboost.csv", index=False)


# Conclusion: 

# Approx. 90.03% of the variation is captured by the top 7 features

# Top 7 features: 'X314', 'X315', 'X119', 'X263', 'X118', 'X127', 'X136'