# Deliverable 2: Regression Modeling and Performance Evaluation

In [None]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [None]:
# Read the raw transaction data, then clean it in a single chain
# (same cleaning rules as Deliverable 1).
# NOTE(review): de-duplicating on (InvoiceNo, StockCode) keeps only the first
# row per invoice/product pair, so a product listed twice on one invoice loses
# its later rows — confirm this is intended.
df = (
    pd.read_excel('Online Retail.xlsx')
    # Rows without a CustomerID cannot be attributed to a customer.
    .dropna(subset=['CustomerID'])
    .drop_duplicates(subset=['InvoiceNo', 'StockCode'])
    # Keep only rows with a strictly positive quantity and unit price.
    .loc[lambda d: (d['Quantity'] > 0) & (d['UnitPrice'] > 0)]
    # Line-level revenue, summed per customer later on.
    .assign(TotalPrice=lambda d: d['Quantity'] * d['UnitPrice'])
)


## Feature Engineering
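Aggregate the transaction-level data to one row per customer and derive the features (total quantity, average unit price, number of purchases) used by the regression models.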

In [None]:
# Collapse the transaction rows into one row per customer. Named aggregation
# gives every output column its final name directly, so no rename is needed.
customer_df = (
    df.groupby('CustomerID')
    .agg(
        TotalQuantity=('Quantity', 'sum'),      # units bought across all rows
        AvgUnitPrice=('UnitPrice', 'mean'),     # mean unit price over the customer's rows
        TotalSpent=('TotalPrice', 'sum'),       # summed line revenue
        NumPurchases=('InvoiceNo', 'nunique'),  # number of distinct invoices
    )
    .reset_index()
)

# Predictors, and the regression target (TotalSpent)
y = customer_df['TotalSpent']
X = customer_df[['TotalQuantity', 'AvgUnitPrice', 'NumPurchases']]

# Hold out 20% of customers for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Linear Regression Model

In [None]:
# Ordinary least squares baseline: fit on the training split and predict the
# held-out test split.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Test-set metrics; RMSE is the square root of the MSE.
r2_lr = r2_score(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)

print(
    "Linear Regression Results:",
    f"R-squared: {r2_lr:.4f}",
    f"MSE: {mse_lr:.4f}",
    f"RMSE: {rmse_lr:.4f}",
    sep="\n",
)


## Ridge Regression Model

In [None]:
# Ridge Regression
# Ridge's L2 penalty acts on the coefficient magnitudes, so its effect depends
# on the scale of each feature. The features here (a summed quantity, a mean
# unit price, an invoice count) are measured in different units, so they are
# standardized first. Putting the scaler in a pipeline means it is fit on the
# training data only, and re-fit per fold if `ridge` is passed to
# cross_val_score.
# `ridge` is now a Pipeline: fit/predict work as before; the underlying Ridge
# estimator (e.g. for its coefficients) is available as `ridge[-1]`.
ridge = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

# Evaluation metrics on the held-out test split
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Ridge Regression Results:")
print(f"R-squared: {r2_ridge:.4f}")
print(f"MSE: {mse_ridge:.4f}")
print(f"RMSE: {rmse_ridge:.4f}")


## Model Comparison and Cross-Validation

In [None]:
# 5-fold cross-validation of both estimators on the full customer table.
# The 'neg_mean_squared_error' scorer returns negated MSE values, so the
# sign is flipped back when reporting.
cv_scores_lr = cross_val_score(
    lr, X, y, cv=5, scoring='neg_mean_squared_error'
)
cv_scores_ridge = cross_val_score(
    ridge, X, y, cv=5, scoring='neg_mean_squared_error'
)

print(
    "Cross-Validation Results (MSE):",
    f"Linear Regression Mean CV MSE: {-cv_scores_lr.mean():.4f}",
    f"Ridge Regression Mean CV MSE: {-cv_scores_ridge.mean():.4f}",
    sep="\n",
)

# Actual vs. predicted spend on the test split for both models.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_test, y_pred_lr, color='blue', label='Linear Regression')
ax.scatter(y_test, y_pred_ridge, color='red', label='Ridge Regression', alpha=0.7)

# Dashed y = x reference line: points on it are predicted exactly.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'k--', lw=2)

ax.set_xlabel('Actual TotalSpent')
ax.set_ylabel('Predicted TotalSpent')
ax.set_title('Actual vs Predicted: Linear vs Ridge Regression')
ax.legend()
plt.show()


## Insights and Observations
- Both the Linear and Ridge Regression models performed well on the test set; Ridge achieved a similar RMSE while its L2 penalty slightly reduced overfitting.
- Feature engineering (total quantity, average unit price, and number of purchases per customer) helped capture key aspects of customer spending.
- Five-fold cross-validation (mean MSE) indicated that Ridge Regression generalizes slightly better to unseen data.
