In [1]:
import pandas as pd
import numpy as np

In [48]:
# Exploratory Data Analysis (EDA)

df = pd.read_csv("top_21_features_RF.csv")
#df = pd.read_csv("train.csv")

In [43]:
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [52]:
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Assume df contains your dataframe

# Extract features and target variable from the dataframe
X = df.drop(columns=['y'])
y = df['y']

# Identify categorical columns in the dataset
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = XGBRegressor()

# Create and evaluate the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)



rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)

print("RMSE on training set:", rmse_train)
print("RMSE on test set:", rmse_test)

# Calculate R^2 score for training set
r2_train = r2_score(y_train, y_pred_train)

# Calculate R^2 score for test set
r2_test = r2_score(y_test, y_pred_test)

print("R^2 score on training set:", np.round(r2_train*100,1),"%")
print("R^2 score on test set:", np.round(r2_test*100,1),"%")


RMSE on training set: 6.190848900102161
RMSE on test set: 8.532172445682125
R^2 score on training set: 76.3 %
R^2 score on test set: 53.2 %




In [55]:
# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Create XGBoost regressor
xgb_model = XGBRegressor()

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)

# Fit GridSearchCV to the data
grid_search.fit(X_train_processed, y_train)

# Get the best model and its parameters
best_xgb_model = grid_search.best_estimator_
best_parameters = grid_search.best_params_

# Print the best parameters
print("Best Parameters:", best_parameters)

Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}


In [56]:

# Define the column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),  # numerical_cols are previously defined
        ('cat', OneHotEncoder(), categorical_cols)  # categorical_cols are previously defined
    ])

# Apply preprocessing to the data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Now, you can use the processed data for prediction
y_train_pred = best_xgb_model.predict(X_train_processed)
y_test_pred = best_xgb_model.predict(X_test_processed)

# Calculate RMSE and R^2 scores
RMSE_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
RMSE_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"Training RMSE score: {RMSE_train}")
print(f"Testing RMSE score: {RMSE_test}")

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"\nTraining R^2 score: {np.round(r2_train*100,1)}%")
print(f"Testing R^2 score: {np.round(r2_test*100,1)}%")

Training RMSE score: 8.154930588873281
Testing RMSE score: 7.818779802088263

Training R^2 score: 58.9%
Testing R^2 score: 60.7%
