In [21]:
# Import necessary libraries
import pandas as pd # for data manipulation
import numpy as np # for numerical operations 
from sklearn.model_selection import train_test_split, GridSearchCV # for splitting data and huperparameter tuning
from sklearn.linear_model import LinearRegression, Lasso # for analysing data
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler # for model evaluation
from sklearn.impute import KNNImputer # Import KNNImputer for handling missing values
from xgboost import XGBRegressor
import plotly.express as px # for creating interactive plots
import plotly.graph_objects as go # for customizing plots
from plotly.subplots import make_subplots # for creating subplots
import joblib # for saving models
import warnings # to suppress warnings

# Ignore all warnings
warnings.filterwarnings("ignore")


In [22]:
# Load the dataset
try:
    amazon_df = pd.read_csv(
        "/Users/ifeomaigbokwe/Desktop/NEXFORD MSC/BAN 6800/customer_segmentation-dataset/cleaned_amazon_15000.csv"
    )
    print("File loaded successfully!")
except Exception as e:
    print("❌ Error loading file:", e)


File loaded successfully!


In [None]:
# Handle missing numeric data using KNN Imputer
# Select only the numerical columns from the DataFrame
num_cols = amazon_df.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
    # If there are numerical columns:
    # Create a KNNImputer object with 5 neighbors
    imputer = KNNImputer(n_neighbors=5)
    amazon_df[num_cols] = imputer.fit_transform(amazon_df[num_cols])


In [None]:
# Create additional features to help the model
# Create a new feature 'Price_per_Review' by dividing the price by the number of reviews plus one
amazon_df["Price_per_Review"] = amazon_df["price"] / (amazon_df["reviews"] + 1)
amazon_df["Log_Price"] = np.log1p(amazon_df["price"])


In [25]:
# Define independent features (X) and the target variable (y)
X = amazon_df.drop(columns=["price"])
y = amazon_df["price"]


In [26]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [27]:
# Standardize the feature values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [28]:
# Function to evaluate model performance
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} MAE: {mae:.4f}")
    print(f"{model_name} R2: {r2:.4f}")
    return mae, r2, y_pred


In [29]:
# Train and evaluate a Multiple Linear Regression (MLR) model
mlr_model = LinearRegression()
mlr_model.fit(X_train, y_train)
mlr_mae, mlr_r2, mlr_y_pred = evaluate_model(mlr_model, X_test, y_test, "MLR")


MLR MAE: 0.0139
MLR R2: 0.9982


In [30]:
# Train and evaluate a Lasso Regression model
lasso_model = Lasso(alpha=0.1, max_iter=10000)
lasso_model.fit(X_train, y_train)
lasso_mae, lasso_r2, lasso_y_pred = evaluate_model(lasso_model, X_test, y_test, "Lasso")


Lasso MAE: 0.0334
Lasso R2: 0.9869


In [31]:
# Convert data to float32 for XGBoost
X_train_np = X_train.astype(np.float32)
X_test_np = X_test.astype(np.float32)
y_train_np = y_train.values.astype(np.float32)
y_test_np = y_test.values.astype(np.float32)

# Train and evaluate an XGBoost Regressor
xgb_model = XGBRegressor()
xgb_model.fit(X_train_np, y_train_np)
xgb_mae, xgb_r2, xgb_y_pred = evaluate_model(xgb_model, X_test_np, y_test_np, "XGBoost")


XGBoost MAE: 0.0295
XGBoost R2: 0.6833


In [32]:
# Scatter plot: Predicted vs Actual prices for XGBoost
plot_df = pd.DataFrame({
    'Actual Price': y_test.to_numpy().flatten(),
    'Predicted Price': xgb_y_pred.flatten() if hasattr(xgb_y_pred, 'flatten') else xgb_y_pred
})
fig1 = px.scatter(plot_df, x='Actual Price', y='Predicted Price', title="Predicted vs Actual Prices (XGBoost)")
fig1.add_trace(go.Scatter(x=plot_df['Actual Price'], y=plot_df['Actual Price'], mode="lines", line=dict(color="red"), name="Perfect Prediction"))
fig1.show()


In [33]:
# Bar chart: Feature Importance from XGBoost model
feature_importance = pd.DataFrame({
    "Feature": X.columns.tolist(),
    "Importance": xgb_model.feature_importances_
}).sort_values(by="Importance", ascending=False)
fig2 = px.bar(feature_importance, x="Importance", y="Feature", title="Feature Importance (XGBoost)")
fig2.show()


In [34]:
# Histogram: Distribution of prediction errors
errors = (xgb_y_pred - y_test_np).flatten()
errors_df = pd.DataFrame({'Errors': errors})
fig3 = px.histogram(errors_df, x='Errors', nbins=50, title="Distribution of Prediction Errors (XGBoost)")
fig3.show()


In [35]:
# Line chart: Performance metrics over time
time_points = ["Now"]
mae_values = [xgb_mae]
r2_values = [xgb_r2]

fig4 = make_subplots(rows=2, cols=1, subplot_titles=("MAE Over Time", "R2 Over Time"))
fig4.add_trace(go.Scatter(x=time_points, y=mae_values, mode="lines+markers", name="MAE"), row=1, col=1)
fig4.add_trace(go.Scatter(x=time_points, y=r2_values, mode="lines+markers", name="R2"), row=2, col=1)
fig4.show()


In [36]:
# Bar charts: Compare MAE and R2 across models
fig5 = px.bar(x=["MLR", "Lasso", "XGBoost"], y=[mlr_mae, lasso_mae, xgb_mae], title="MAE Comparison", labels={"x": "Model", "y": "MAE"})
fig5.show()

fig6 = px.bar(x=["MLR", "Lasso", "XGBoost"], y=[mlr_r2, lasso_r2, xgb_r2], title="R2 Comparison", labels={"x": "Model", "y": "R2"})
fig6.show()


In [37]:
# Save model evaluation results to CSV
results_df = pd.DataFrame({
    "Model": ["MLR", "Lasso", "XGBoost"],
    "MAE": [mlr_mae, lasso_mae, xgb_mae],
    "R2": [mlr_r2, lasso_r2, xgb_r2]
})
results_df.to_csv("model_evaluation_results.csv", index=False)
print("Model evaluation results saved to CSV.")


Model evaluation results saved to CSV.


In [38]:
# Save trained models to disk for later use
joblib.dump(mlr_model, "mlr_model.joblib")
joblib.dump(lasso_model, "lasso_model.joblib")
joblib.dump(xgb_model, "xgb_model.joblib")
print("Models saved successfully.")


Models saved successfully.
