In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [28]:
df_model = pd.read_csv("/content/amazon_final_model_dataset.csv")
print(f"‚úÖ Data loaded successfully with shape: {df_model.shape}")

‚úÖ Data loaded successfully with shape: (177008, 25)


In [29]:
if "profit_margin" in df_model.columns:
    df_model_noleak = df_model.drop(columns=["profit_margin"])
else:
    df_model_noleak = df_model.copy()

print("‚úÖ Final clean shape:", df_model_noleak.shape)

‚úÖ Final clean shape: (177008, 24)


In [30]:
target = "amount"

X = df_model_noleak.drop(columns=[target])
y = y = df_model_noleak[target]

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale numeric features (for models sensibles)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
try:
    from xgboost import XGBRegressor
    from lightgbm import LGBMRegressor
    HAS_XGB = True
except ImportError:
    HAS_XGB = False
    print("‚ö†Ô∏è XGBoost / LightGBM not installed, skipping those models.")


In [32]:
# MODELS
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1),
    "AdaBoost": AdaBoostRegressor(n_estimators=150, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=150, random_state=42),
    "SVM (RBF)": SVR(kernel="rbf", C=1.0),
    "KNN Regressor": KNeighborsRegressor(n_neighbors=5)}

if HAS_XGB:
    models["XGBoost"] = XGBRegressor(n_estimators=150, learning_rate=0.1, random_state=42, n_jobs=-1)
    models["LightGBM"] = LGBMRegressor(n_estimators=150, learning_rate=0.1, random_state=42, n_jobs=-1)


In [33]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)


In [34]:
results = []

for name, model in models.items():
    if "Linear" in name or "SVM" in name or "KNN" in name:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({"Model": name, "RMSE": rmse, "MAE": mae, "R2": r2})
    print(f"\nüß† {name}\n   RMSE: {rmse:.3f}\n   MAE : {mae:.3f}\n   R¬≤  : {r2:.3f}")



üß† Linear Regression
   RMSE: 0.863
   MAE : 0.627
   R¬≤  : 0.249

üß† Decision Tree
   RMSE: 0.095
   MAE : 0.039
   R¬≤  : 0.991

üß† Random Forest
   RMSE: 0.016
   MAE : 0.004
   R¬≤  : 1.000

üß† AdaBoost
   RMSE: 0.354
   MAE : 0.238
   R¬≤  : 0.873

üß† Gradient Boosting
   RMSE: 0.090
   MAE : 0.045
   R¬≤  : 0.992

üß† SVM (RBF)
   RMSE: 0.773
   MAE : 0.516
   R¬≤  : 0.398

üß† KNN Regressor
   RMSE: 0.608
   MAE : 0.366
   R¬≤  : 0.627

üß† XGBoost
   RMSE: 0.046
   MAE : 0.018
   R¬≤  : 0.998
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 141606, number of used features: 22
[LightGBM] [Info] Start training from score 0.000638

üß† LightGBM
   RMSE: 0.050
   MAE : 0.024
   R¬≤  : 0.997


In [35]:
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=8,
    min_samples_leaf=50,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Random Forest regularised", rmse, "R¬≤:", r2)


Random Forest regularised 0.22215883101388664 R¬≤: 0.9502433956802272


In [None]:
models = results_df["Model"]
r2_scores = results_df["R2"]

# Create bar chart
plt.figure(figsize=(8,5))
bars = plt.bar(models, r2_scores, color=['#A1C9F4', '#76B7B2', '#59A14F', '#EDC948', '#E15759'])

# Add exact R¬≤ labels on top of each bar
for bar, score in zip(bars, r2_scores):
    plt.text(bar.get_x() + bar.get_width()/2, score + 0.02, f"{score:.3f}",
             ha='center', va='bottom', fontsize=10, fontweight='bold')

# Customize chart
plt.title("Model Performance (R¬≤ Comparison)", fontsize=14, fontweight='bold')
plt.ylabel("R¬≤ Score", fontsize=12)
plt.ylim(0, 1.1)
plt.grid(axis='y', linestyle='--', alpha=0.6)

# Display chart
plt.tight_layout()
plt.show()


In [37]:
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    if "Linear" in name or "SVM" in name or "KNN" in name:
        X_used = X_train_scaled
    else:
        X_used = X_train
    scores = cross_val_score(model, X_used, y_train, cv=5, scoring="r2")
    print(f"{name}: mean R¬≤ = {scores.mean():.3f} ¬± {scores.std():.3f}")


TypeError: argument of type 'int' is not iterable

In [None]:
results_df = pd.DataFrame(results).sort_values(by="R2", ascending=False)
print("\nüìä Model comparison:")
print(results_df)

plt.figure(figsize=(9, 4))
sns.barplot(x="R2", y="Model", data=results_df, palette="viridis")
plt.title("Model Performance (Higher R¬≤ is Better)")
plt.tight_layout()
plt.show()


In [None]:
best_model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
scores = cross_val_score(best_model, X, y, cv=5, scoring="r2")
print(f"‚úÖ RandomForest Cross-Validation mean R¬≤: {scores.mean():.3f} (std: {scores.std():.3f})")

In [None]:
# 6Ô∏èCross-validation (RandomForest)

from sklearn.model_selection import cross_val_score

best_model = RandomForestRegressor(n_estimators=150, random_state=42, n_jobs=-1)
scores = cross_val_score(best_model, X, y, cv=5, scoring="r2")
print(f"\n‚úÖ RandomForest Cross-Validation mean R¬≤: {scores.mean():.3f} (std: {scores.std():.3f})")


In [38]:
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

print("Train R¬≤:", r2_score(y_train, y_pred_train))
print("Test  R¬≤:", r2_score(y_test, y_pred_test))


Train R¬≤: 0.9493014807708381
Test  R¬≤: 0.9502433956802272


In [39]:
results_df.to_csv("model_results.csv", index=False)
