# Feature Engineering

In [1]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA

# Load dataset
df = pd.read_csv("cnn_features_combined.csv")
df = df.drop(columns=["Id"], errors="ignore")

# Select only CNN feature vectors excluding metadata
cnn_features = df.filter(like="feature_", axis=1)

# Apply PCA to reduce dimensionality to 100 components
pca = PCA(n_components=100)
X_pca = pca.fit_transform(cnn_features)

# Combine PCA-transformed features with existing metadata
df_pca = pd.concat([df.drop(columns=cnn_features.columns), pd.DataFrame(X_pca)], axis=1)

# Split the dataset into training and testing sets
X_train_pca, X_test_pca, y_train, y_test = train_test_split(df_pca.drop(columns=["Pawpularity"]), df_pca["Pawpularity"], test_size=0.2, random_state=42)

# Define XGBoost model
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    tree_method="hist",  # Optimize memory usage
    enable_categorical=False,
    random_state=42
)

# Train the XGBoost model using PCA-transformed features
xgb_model.fit(X_train_pca, y_train)
y_pred_pca = xgb_model.predict(X_test_pca)

# Evaluate model performance
r2_pca = r2_score(y_test, y_pred_pca)
print(f"PCA applied XGBoost R2 Score {r2_pca:.4f}")

PCA applied XGBoost R2 Score 0.1890


In [3]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv("cnn_features_combined.csv")
df = df.drop(columns=["Id"], errors="ignore")

target_column = "Pawpularity"  # Target Variable
X = df.drop(columns=[target_column])  # Independent Variable (CNN Vectors + Metadata)
y = df[target_column]  # Target Variable (Pawpularity Score)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = XGBRegressor(
    n_estimators=100,  
    learning_rate=0.1,  
    max_depth=5,  
    random_state=42
)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost - Mean Squared Error (MSE): {mse_xgb:.4f}")
print(f"XGBoost - R² Score: {r2_xgb:.4f}")

XGBoost - Mean Squared Error (MSE): 357.7013
XGBoost - R² Score: 0.1908


In [7]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, r2_score

# Select top 100 features based on correlation with the target variable
selector = SelectKBest(f_regression, k=100)
X_selected = selector.fit_transform(X_train, y_train)

# Train XGBoost using the selected features
xgb_model.fit(X_selected, y_train)
y_pred_selected = xgb_model.predict(selector.transform(X_test))

# Evaluate model performance
mse_selected = mean_squared_error(y_test, y_pred_selected)
r2_selected = r2_score(y_test, y_pred_selected)

print(f"Feature Selection applied XGBoost - MSE: {mse_selected:.4f}")
print(f"Feature Selection applied XGBoost R2 Score {r2_selected:.4f}")

Feature Selection applied XGBoost - MSE: 366.3011
Feature Selection applied XGBoost R2 Score 0.1713
