In [None]:
# Required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# `df` must already exist in the kernel (it is built in earlier steps).

# The target is the CLV column; the features are every remaining column
# except the customer identifier.
y = df["CLV"]
X = df.drop(["customer_id", "CLV"], axis=1)

# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features for the linear model. The scaling statistics are
# learned from the training rows only and then re-applied to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear regression is trained and scored on the standardized features.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# The random forest is trained and scored on the unscaled features.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation
def evaluate_model(name, y_true, y_pred):
    """Summarize the regression quality of one model.

    Args:
        name: Label stored under the "Model" key of the result.
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict with the keys "Model", "MAE", "RMSE" and "R2 Score".
    """
    abs_error = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    root_sq_error = np.sqrt(mean_squared_error(y_true, y_pred))
    determination = r2_score(y_true, y_pred)
    return {
        "Model": name,
        "MAE": abs_error,
        "RMSE": root_sq_error,
        "R2 Score": determination,
    }

# Score both fitted models on the held-out split and tabulate the metrics.
results = [
    evaluate_model(label, y_test, preds)
    for label, preds in (
        ("Linear Regression", y_pred_lr),
        ("Random Forest", y_pred_rf),
    )
]

results_df = pd.DataFrame(results)
print(results_df)

# Plot: Actual vs Predicted CLV (Random Forest)
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred_rf, alpha=0.7, ax=ax)
# Dashed y = x reference: points on this line are perfect predictions.
clv_bounds = [y_test.min(), y_test.max()]
ax.plot(clv_bounds, clv_bounds, 'r--')
ax.set_xlabel("Actual CLV")
ax.set_ylabel("Predicted CLV")
ax.set_title("Random Forest: Actual vs Predicted CLV")
fig.tight_layout()
plt.show()


In [None]:
# Data preprocessing
# Hold-out split (a single train/test split, not cross-validation) on the raw
# feature frame `X`. The scaler and the polynomial expansion below are fitted
# on the training rows only, so no test-set statistics leak into the models.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full standardized matrix (scaled with the training statistics); defined
# here so the cell no longer depends on an `X_scaled` left over in the kernel.
X_scaled = scaler.transform(X)

# Model training
model = LinearRegression()
model.fit(X_train, y_train)

# Model prediction
y_pred = model.predict(X_test)

# Model evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Polynomial regression: degree-2 expansion of the standardized features.
# The expansion is fitted on the training rows and re-applied to the test
# rows, so both models are evaluated on exactly the same hold-out rows.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
X_poly = poly.transform(X_scaled)
# The targets are shared with the linear model because the split is the same.
y_train_poly, y_test_poly = y_train, y_test

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train_poly)
y_pred_poly = poly_model.predict(X_test_poly)

# Model evaluation for polynomial regression
mae_poly = mean_absolute_error(y_test_poly, y_pred_poly)
mse_poly = mean_squared_error(y_test_poly, y_pred_poly)
r2_poly = r2_score(y_test_poly, y_pred_poly)
print(f"Polynomial Regression - Mean Absolute Error: {mae_poly}")
print(f"Polynomial Regression - Mean Squared Error: {mse_poly}")
print(f"Polynomial Regression - R^2 Score: {r2_poly}")
# Visualizing the results
def _plot_regression_diagnostics(actual, predicted, coefficients, feature_names,
                                 tag='', title_tag=''):
    """Draw the four diagnostic figures for one fitted regression.

    The figures are, in order: actual vs predicted scatter, residuals vs
    predicted scatter, error histogram, and a bar chart of the coefficients.

    Args:
        actual: Observed target values of the evaluation split.
        predicted: Model predictions aligned with ``actual``.
        coefficients: Fitted coefficients, one per entry of ``feature_names``.
        feature_names: Labels used as the index of the coefficient chart.
        tag: Suffix appended to axis and legend labels.
        title_tag: Suffix appended to figure titles.

    Returns:
        pandas.Series of the coefficients indexed by ``feature_names`` and
        sorted in descending order.
    """
    # One sign convention for every figure: error = actual - predicted.
    errors = actual - predicted

    # Actual vs predicted, with the y = x line spanning the plotted data.
    _, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(actual, predicted, color='blue', label=f'Predicted vs Actual{tag}')
    bounds = [actual.min(), actual.max()]
    ax.plot(bounds, bounds, color='red', lw=2, label='Ideal Prediction')
    ax.set(xlabel='Actual Values', ylabel=f'Predicted Values{tag}',
           title=f'Actual vs Predicted Values{title_tag}')
    ax.legend()
    plt.show()

    # Residuals vs predicted values.
    _, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(predicted, errors, color='blue', label=f'Residuals{tag}')
    ax.axhline(0, color='red', lw=2, label=f'Zero Residual Line{tag}')
    ax.set(xlabel=f'Predicted Values{tag}', ylabel=f'Residuals{tag}',
           title=f'Residuals vs Predicted Values{title_tag}')
    ax.legend()
    plt.show()

    # Distribution of the errors.
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(errors, kde=True, color='blue', label=f'Error Distribution{tag}', ax=ax)
    ax.set(xlabel=f'Error{tag}', ylabel='Frequency',
           title=f'Distribution of Errors{title_tag}')
    ax.legend()
    # Each figure is shown before the next one is created.
    plt.show()

    # Coefficients of the fitted model, largest first.
    importance = pd.Series(coefficients, index=feature_names).sort_values(ascending=False)
    _, ax = plt.subplots(figsize=(10, 6))
    importance.plot(kind='bar', color='blue', ax=ax)
    ax.set(title=f'Feature Importance{title_tag}', xlabel=f'Features{tag}',
           ylabel=f'Coefficient Value{tag}')
    plt.show()
    return importance


# Diagnostics for the linear regression.
feature_importance = _plot_regression_diagnostics(
    y_test, y_pred, model.coef_, X.columns
)

# Diagnostics for the polynomial regression; the coefficient labels come from
# the fitted polynomial expansion.
feature_importance_poly = _plot_regression_diagnostics(
    y_test_poly,
    y_pred_poly,
    poly_model.coef_,
    poly.get_feature_names_out(X.columns),
    tag=' (Polynomial)',
    title_tag=' (Polynomial Regression)',
)
# Persist both fitted regressors with joblib and read them straight back.
# NOTE: joblib files are pickle-based; only load files you trust.
import joblib

joblib.dump(model, 'linear_regression_model.pkl')
joblib.dump(poly_model, 'polynomial_regression_model.pkl')
loaded_model = joblib.load('linear_regression_model.pkl')
loaded_poly_model = joblib.load('polynomial_regression_model.pkl')

# Predict with the reloaded estimators on the same held-out inputs.
loaded_y_pred = loaded_model.predict(X_test)
loaded_y_pred_poly = loaded_poly_model.predict(X_test_poly)

# Metrics of the reloaded linear model.
loaded_mae, loaded_mse, loaded_r2 = (
    metric(y_test, loaded_y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print(f"Loaded Model - Mean Absolute Error: {loaded_mae}")
print(f"Loaded Model - Mean Squared Error: {loaded_mse}")
print(f"Loaded Model - R^2 Score: {loaded_r2}")

# Metrics of the reloaded polynomial model.
loaded_mae_poly, loaded_mse_poly, loaded_r2_poly = (
    metric(y_test_poly, loaded_y_pred_poly)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print(f"Loaded Polynomial Model - Mean Absolute Error: {loaded_mae_poly}")
print(f"Loaded Polynomial Model - Mean Squared Error: {loaded_mse_poly}")
print(f"Loaded Polynomial Model - R^2 Score: {loaded_r2_poly}")
# Write the hold-out metrics of both regressions to JSON, then read them back.
import json

# One nested mapping: model label -> metric name -> value.
evaluation_metrics = {
    label: {'MAE': abs_err, 'MSE': sq_err, 'R^2': determination}
    for label, (abs_err, sq_err, determination) in (
        ('Linear Regression', (mae, mse, r2)),
        ('Polynomial Regression', (mae_poly, mse_poly, r2_poly)),
    )
}

with open('evaluation_metrics.json', 'w') as f:
    f.write(json.dumps(evaluation_metrics, indent=4))

# Reload the file that was just written and show its content.
with open('evaluation_metrics.json', 'r') as f:
    loaded_evaluation_metrics = json.loads(f.read())
print("Loaded Evaluation Metrics:", loaded_evaluation_metrics)
# Saving the preprocessed data
# X_train / X_test are the already-standardized feature matrices; they are
# labelled with the original feature names before being written out.
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)
X_train_df.to_csv('X_train_preprocessed.csv', index=False)
X_test_df.to_csv('X_test_preprocessed.csv', index=False)
# `to_frame(name=...)` renames the target column while keeping its values.
# Building the frame with pd.DataFrame(series, columns=[new_name]) selects a
# column that does not exist in the Series and loses the target values.
y_train_df = y_train.to_frame(name='Target (Total orders)')
y_test_df = y_test.to_frame(name='Target (Total orders)')
y_train_df.to_csv('y_train_preprocessed.csv', index=False)
y_test_df.to_csv('y_test_preprocessed.csv', index=False)

# Loading the preprocessed data
X_train_loaded = pd.read_csv('X_train_preprocessed.csv')
X_test_loaded = pd.read_csv('X_test_preprocessed.csv')
y_train_loaded = pd.read_csv('y_train_preprocessed.csv')
y_test_loaded = pd.read_csv('y_test_preprocessed.csv')
print("Loaded Preprocessed Data:")
print("X_train shape:", X_train_loaded.shape)
print("X_test shape:", X_test_loaded.shape)
print("y_train shape:", y_train_loaded.shape)
print("y_test shape:", y_test_loaded.shape)

# Saving the scaler
joblib.dump(scaler, 'scaler.pkl')
# Loading the scaler (joblib files are pickle-based; only load trusted files).
loaded_scaler = joblib.load('scaler.pkl')
# Check the round trip by comparing the learned statistics of the reloaded
# scaler with the in-memory one.
assert np.allclose(loaded_scaler.mean_, scaler.mean_)
assert np.allclose(loaded_scaler.scale_, scaler.scale_)

# Making predictions from the reloaded data
# The features on disk are already standardized, so they are used as they
# are; running the scaler over them again would scale the data twice.
X_test_scaled = X_test_loaded.to_numpy()
loaded_y_pred_scaled = loaded_model.predict(X_test_scaled)
loaded_y_pred_poly_scaled = loaded_poly_model.predict(poly.transform(X_test_scaled))

# Evaluating the predictions made from the reloaded data
loaded_mae_scaled = mean_absolute_error(y_test_loaded, loaded_y_pred_scaled)
loaded_mse_scaled = mean_squared_error(y_test_loaded, loaded_y_pred_scaled)
loaded_r2_scaled = r2_score(y_test_loaded, loaded_y_pred_scaled)
print(f"Loaded Scaler - Mean Absolute Error: {loaded_mae_scaled}")
print(f"Loaded Scaler - Mean Squared Error: {loaded_mse_scaled}")
print(f"Loaded Scaler - R^2 Score: {loaded_r2_scaled}")
loaded_mae_poly_scaled = mean_absolute_error(y_test_loaded, loaded_y_pred_poly_scaled)
loaded_mse_poly_scaled = mean_squared_error(y_test_loaded, loaded_y_pred_poly_scaled)
loaded_r2_poly_scaled = r2_score(y_test_loaded, loaded_y_pred_poly_scaled)
print(f"Loaded Polynomial Scaler - Mean Absolute Error: {loaded_mae_poly_scaled}")
print(f"Loaded Polynomial Scaler - Mean Squared Error: {loaded_mse_poly_scaled}")
print(f"Loaded Polynomial Scaler - R^2 Score: {loaded_r2_poly_scaled}")
# Store the reloaded targets next to both models' predictions, write the
# table to CSV and read it back.
predictions = pd.DataFrame({
    'Actual': y_test_loaded['Target (Total orders)'],
    'Predicted (Linear)': loaded_y_pred_scaled,
    'Predicted (Polynomial)': loaded_y_pred_poly_scaled
})
predictions.to_csv('predictions.csv', index=False)
predictions_loaded = pd.read_csv('predictions.csv')
print("Loaded Predictions:")
print(predictions_loaded.head())

# One actual-vs-predicted figure per model, drawn from the reloaded table.
actual_loaded = predictions_loaded['Actual']
# End points of the y = x reference line.
diagonal = [actual_loaded.min(), actual_loaded.max()]
for kind, title_kind in (
    ('Linear', 'Linear Regression'),
    ('Polynomial', 'Polynomial Regression'),
):
    plt.figure(figsize=(10, 6))
    plt.scatter(
        actual_loaded,
        predictions_loaded[f'Predicted ({kind})'],
        color='blue',
        label=f'Predicted vs Actual ({kind})',
    )
    plt.plot(diagonal, diagonal, color='red', lw=2, label='Ideal Prediction')
    plt.xlabel('Actual Values')
    plt.ylabel(f'Predicted Values ({kind})')
    plt.title(f'Actual vs Predicted Values ({title_kind})')
    plt.legend()
    plt.show()
# Saving the final model, scaler and polynomial transformer
# The fitted polynomial expansion is stored with the models: the polynomial
# regressor cannot be applied to new data without it, so the bundle is only
# self-contained when it is included.
final_model = {
    'model': model,
    'poly_model': poly_model,
    'scaler': scaler,
    'poly': poly,
}
joblib.dump(final_model, 'final_model.pkl')

# Loading the final bundle (joblib files are pickle-based; only load files
# you trust).
final_model_loaded = joblib.load('final_model.pkl')

# Making predictions using only the objects restored from the bundle, so the
# evaluation exercises the saved artifact instead of in-memory state.
final_y_pred = final_model_loaded['model'].predict(X_test_scaled)
final_y_pred_poly = final_model_loaded['poly_model'].predict(
    final_model_loaded['poly'].transform(X_test_scaled)
)

# Evaluating the final loaded model
final_mae = mean_absolute_error(y_test_loaded, final_y_pred)
final_mse = mean_squared_error(y_test_loaded, final_y_pred)
final_r2 = r2_score(y_test_loaded, final_y_pred)
print(f"Final Loaded Model - Mean Absolute Error: {final_mae}")
print(f"Final Loaded Model - Mean Squared Error: {final_mse}")
print(f"Final Loaded Model - R^2 Score: {final_r2}")
final_mae_poly = mean_absolute_error(y_test_loaded, final_y_pred_poly)
final_mse_poly = mean_squared_error(y_test_loaded, final_y_pred_poly)
final_r2_poly = r2_score(y_test_loaded, final_y_pred_poly)
print(f"Final Loaded Polynomial Model - Mean Absolute Error: {final_mae_poly}")
print(f"Final Loaded Polynomial Model - Mean Squared Error: {final_mse_poly}")
print(f"Final Loaded Polynomial Model - R^2 Score: {final_r2_poly}")
