In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from collections import Counter
import plotly.express as px
from sklearn.cluster import DBSCAN
import xgboost as xgb
import lightgbm as lgb


In [2]:
online_retail_cleaned_no_cancellations=pd.read_csv("C:\Desktop\Sales, Pricing, and Customer Behaviour Analysis\online_retail_cleaned_no_cancellations.csv")

In [3]:
online_retail_cleaned_no_cancellations=pd.read_csv("cleaned_online_retail_data_no_cancellations_no_outliers_for_clustering.csv")


In [7]:
online_retail_cleaned_no_cancellations.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice(£),CustomerID,Country,Sales(£),InvoiceMonth,InvoiceDay,InvoiceHour,InvoiceWeekday,is_canceled
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3,12,1,8,2,False
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,12,1,8,2,False
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0,12,1,8,2,False
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,12,1,8,2,False
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,12,1,8,2,False


## Step 1: Feature Engineering

### 1.3 Lagged Averages for Price and Quantity Trends


In [None]:
""""Detecting price and quantity trends, to compute the lagged averages for UnitPrice(£) and Quantity per product."""

In [4]:
# Lagged average for Quantity and UnitPrice for each product (StockCode)
online_retail_cleaned_no_cancellations['Quantity_Lagged_Avg'] = online_retail_cleaned_no_cancellations.groupby('StockCode')['Quantity'].transform(lambda x: x.shift(1).rolling(window=3).mean())
online_retail_cleaned_no_cancellations['UnitPrice_Lagged_Avg'] = online_retail_cleaned_no_cancellations.groupby('StockCode')['UnitPrice(£)'].transform(lambda x: x.shift(1).rolling(window=3).mean())

print(online_retail_cleaned_no_cancellations[['StockCode', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']].head())


  StockCode  Quantity_Lagged_Avg  UnitPrice_Lagged_Avg
0    85123A                  NaN                   NaN
1     71053                  NaN                   NaN
2    84406B                  NaN                   NaN
3    84029G                  NaN                   NaN
4    84029E                  NaN                   NaN


In [5]:
print(online_retail_cleaned_no_cancellations[['StockCode', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']])


       StockCode  Quantity_Lagged_Avg  UnitPrice_Lagged_Avg
0         85123A                  NaN                   NaN
1          71053                  NaN                   NaN
2         84406B                  NaN                   NaN
3         84029G                  NaN                   NaN
4         84029E                  NaN                   NaN
...          ...                  ...                   ...
324742     22613             5.666667                  0.85
324743     22899             3.666667                  2.10
324744     23254             2.666667                  4.15
324745     23255             3.333333                  4.15
324746     22138             2.333333                  4.95

[324747 rows x 3 columns]


In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


# Step 1: Convert 'InvoiceDate' to datetime
online_retail_cleaned_no_cancellations['InvoiceDate'] = pd.to_datetime(online_retail_cleaned_no_cancellations['InvoiceDate'])

# Step 2: Extract time-related features
online_retail_cleaned_no_cancellations['Hour'] = online_retail_cleaned_no_cancellations['InvoiceDate'].dt.hour
online_retail_cleaned_no_cancellations['Day'] = online_retail_cleaned_no_cancellations['InvoiceDate'].dt.dayofweek  # 0 = Monday, 6 = Sunday
online_retail_cleaned_no_cancellations['Month'] = online_retail_cleaned_no_cancellations['InvoiceDate'].dt.month

# Step 3: Encode 'StockCode'(Label Encoding or One-Hot Encoding)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Encode 'StockCode'
online_retail_cleaned_no_cancellations['StockCode_Encoded'] = label_encoder.fit_transform(online_retail_cleaned_no_cancellations['StockCode'])



In [7]:
from sklearn.preprocessing import LabelEncoder
import joblib

#label_encoder_stockcode = LabelEncoder()
#online_retail_cleaned_no_cancellations['StockCode_Encoded'] = label_encoder_stockcode.fit_transform(online_retail_cleaned_no_cancellations['StockCode'])


## Trail


In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from scipy.stats import zscore

# Step 1: Handle Outliers
# Detect outliers using Z-score method
z_scores = np.abs(zscore(online_retail_cleaned_no_cancellations[['Quantity', 'UnitPrice(£)', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']]))
outliers = (z_scores > 3).any(axis=1)
cleaned_data = online_retail_cleaned_no_cancellations[~outliers]

# Step 2: Handle Missing Values
# Impute missing values using SimpleImputer with median strategy
imputer = SimpleImputer(strategy='median')
cleaned_data_imputed = cleaned_data.copy()

# Apply imputation to the relevant features
cleaned_data_imputed[['Quantity', 'UnitPrice(£)', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']] = imputer.fit_transform(
    cleaned_data[['Quantity', 'UnitPrice(£)', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']])

# Step 3: Feature Scaling
# Use StandardScaler to scale features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(cleaned_data_imputed[['Quantity', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']])

# Replace the original features with scaled ones
cleaned_data_imputed[['Quantity', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']] = scaled_features

# Step 4: Create Interaction Features
cleaned_data_imputed['Quantity_UnitPrice_Lagged_Avg'] = cleaned_data_imputed['Quantity'] * cleaned_data_imputed['UnitPrice_Lagged_Avg']

# Step 5: Encode 'StockCode'
label_encoder = LabelEncoder()
cleaned_data_imputed['StockCode_Encoded'] = label_encoder.fit_transform(cleaned_data_imputed['StockCode'])

# Prepare the input features (X) and target variable (y)
X = cleaned_data_imputed[['StockCode_Encoded', 'Quantity', 'Hour', 'Day', 'Month', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg', 'Quantity_UnitPrice_Lagged_Avg']]
y = cleaned_data_imputed['UnitPrice(£)']

# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate the Random Forest model
y_pred_rf = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest with feature engineering:")
print(f"Mean Absolute Error (MAE): {mae_rf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.4f}")
print(f"R-squared (R²): {r2_rf:.4f}")

# Step 9: Train the Linear Regression model (for comparison)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Step 10: Evaluate the Linear Regression model
y_pred_lr = lr_model.predict(X_test)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

print(f"\nLinear Regression with feature engineering:")
print(f"Mean Absolute Error (MAE): {mae_lr:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_lr:.4f}")
print(f"R-squared (R²): {r2_lr:.4f}")



Random Forest with feature engineering:
Mean Absolute Error (MAE): 0.0489
Root Mean Squared Error (RMSE): 0.2313
R-squared (R²): 0.9736

Linear Regression with feature engineering:
Mean Absolute Error (MAE): 0.1244
Root Mean Squared Error (RMSE): 0.3602
R-squared (R²): 0.9359


In [9]:
# Step 11: Predict Unit Price Function
def predict_unit_price(stock_code, invoice_weekday, invoice_hour, invoice_month, historical_data, rf_model, scaler):
    """
    Predicts the unit price of a stock dynamically based on customer inputs.

    Args:
        stock_code (int): The encoded stock code.
        invoice_weekday (int): Day of the week (0 = Monday, ..., 6 = Sunday).
        invoice_hour (int): Hour of the day in 24-hour format.
        invoice_month (int): Month of the year (1 = January, ..., 12 = December).
        historical_data (DataFrame): Historical data for lagged averages and scaling.
        rf_model (RandomForestRegressor): Trained model.
        scaler (StandardScaler): Fitted scaler for scaling numeric features.
        
    Returns:
        float: Predicted unit price (£).
    """
    # Filter data for the given stock code
    stock_data = historical_data[historical_data['StockCode_Encoded'] == stock_code]
    if stock_data.empty:
        raise ValueError(f"No historical data found for StockCode {stock_code}")

    # Calculate lagged averages
    quantity_lagged_avg = stock_data['Quantity_Lagged_Avg'].mean()
    unitprice_lagged_avg = stock_data['UnitPrice_Lagged_Avg'].mean()
    quantity_unitprice_interaction = quantity_lagged_avg * unitprice_lagged_avg

    # Create input feature
    input_features = pd.DataFrame({
        'StockCode_Encoded': [stock_code],
        'Quantity': [quantity_lagged_avg],
        'Hour': [invoice_hour],
        'Day': [invoice_weekday],
        'Month': [invoice_month],
        'Quantity_Lagged_Avg': [quantity_lagged_avg],
        'UnitPrice_Lagged_Avg': [unitprice_lagged_avg],
        'Quantity_UnitPrice_Lagged_Avg': [quantity_unitprice_interaction]
    })

    # Scale numeric features
    scaled_features = scaler.transform(input_features[['Quantity', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']])
    input_features[['Quantity', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']] = scaled_features

    # Predict using the model
    predicted_price = rf_model.predict(input_features)
    return predicted_price[0]



In [10]:
# Example Prediction
stock_code_encoded = label_encoder.transform(['85131D'])[0]  #'85123A' with your desired stock code
predicted_price = predict_unit_price(
    stock_code=stock_code_encoded,
    invoice_weekday=3,  # Wednesday
    invoice_hour=17,    # 5 PM
    invoice_month=12,   # December
    historical_data=cleaned_data_imputed,
    rf_model=rf_model,
    scaler=scaler
)
print(f"\nPredicted Unit Price for StockCode '85131D': £{predicted_price:.2f}")



Predicted Unit Price for StockCode '85131D': £0.14


## Step 2: Price Prediction Model


### With Feature Engineering

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import zscore
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Step 1: Handle Outliers
# Detect outliers using Z-score method

z_scores = np.abs(zscore(online_retail_cleaned_no_cancellations[['Quantity', 'UnitPrice(£)', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']]))
outliers = (z_scores > 3).any(axis=1)
cleaned_data = online_retail_cleaned_no_cancellations[~outliers]

# Step 2: Handle Missing Values
# Impute missing values using SimpleImputer with median strategy
imputer = SimpleImputer(strategy='median')  # You can also try 'mean' or 'most_frequent'
cleaned_data_imputed = cleaned_data.copy()

# Apply imputation to the relevant features
cleaned_data_imputed[['Quantity', 'UnitPrice(£)', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']] = imputer.fit_transform(
    cleaned_data[['Quantity', 'UnitPrice(£)', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']])

# Step 3: Feature Scaling
# Use StandardScaler to scale features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(cleaned_data_imputed[['Quantity', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']])

# Replace the original features with scaled ones
cleaned_data_imputed[['Quantity', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']] = scaled_features

# Step 4: Create Interaction Features
# Create a new feature as the interaction between Quantity and UnitPrice_Lagged_Avg
cleaned_data_imputed['Quantity_UnitPrice_Lagged_Avg'] = cleaned_data_imputed['Quantity'] * cleaned_data_imputed['UnitPrice_Lagged_Avg']

# Prepare the input features (X) and target variable (y)
X = cleaned_data_imputed[['StockCode_Encoded', 'Quantity', 'InvoiceHour', 'InvoiceWeekday', 'InvoiceMonth', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg', 'Quantity_UnitPrice_Lagged_Avg']]
y = cleaned_data_imputed['UnitPrice(£)']

# Step 5: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 7: Evaluate the Random Forest model
y_pred_rf = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest with feature engineering:")
print(f"Mean Absolute Error (MAE): {mae_rf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.4f}")
print(f"R-squared (R²): {r2_rf:.4f}")

# Step 8: Train the Linear Regression model (for comparison)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Step 9: Evaluate the Linear Regression model
y_pred_lr = lr_model.predict(X_test)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

print(f"\nLinear Regression with feature engineering:")
print(f"Mean Absolute Error (MAE): {mae_lr:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_lr:.4f}")
print(f"R-squared (R²): {r2_lr:.4f}")


Random Forest with feature engineering:
Mean Absolute Error (MAE): 0.0489
Root Mean Squared Error (RMSE): 0.2312
R-squared (R²): 0.9736

Linear Regression with feature engineering:
Mean Absolute Error (MAE): 0.1244
Root Mean Squared Error (RMSE): 0.3602
R-squared (R²): 0.9359


In [12]:
def predict_unit_price(stock_code, invoice_weekday, invoice_hour, invoice_month, historical_data, rf_model, scaler):
    """
    Predicts the unit price of a stock dynamically based on customer inputs.

    Args:
        stock_code (str): The stock code (e.g., '85123A').
        invoice_weekday (int): Day of the week (0 = Monday, ..., 6 = Sunday).
        invoice_hour (int): Hour of the day in 24-hour format.
        invoice_month (int): Month of the year (1 = January, ..., 12 = December).
        historical_data (DataFrame): Historical data for lagged averages and scaling.
        rf_model (RandomForestRegressor): Trained model.
        scaler (StandardScaler): Fitted scaler for scaling numeric features.
        
    Returns:
        float: Predicted unit price (£).
    """
    # Filter data for the given stock code
    stock_data = historical_data[historical_data['StockCode_Encoded'] == stock_code]
    if stock_data.empty:
        raise ValueError(f"No historical data found for StockCode {stock_code}")

    # Calculate lagged averages
    quantity_lagged_avg = stock_data['Quantity_Lagged_Avg'].mean()
    unitprice_lagged_avg = stock_data['UnitPrice_Lagged_Avg'].mean()
    quantity_unitprice_interaction = quantity_lagged_avg * unitprice_lagged_avg

    # Create input feature
    input_features = pd.DataFrame({
        'StockCode_Encoded': [stock_code],
        'Quantity': [quantity_lagged_avg],
        'InvoiceHour': [invoice_hour],
        'InvoiceWeekday': [invoice_weekday],
        'InvoiceMonth': [invoice_month],
        'Quantity_Lagged_Avg': [quantity_lagged_avg],
        'UnitPrice_Lagged_Avg': [unitprice_lagged_avg],
        'Quantity_UnitPrice_Lagged_Avg': [quantity_unitprice_interaction]
    })

    # Scale numeric features
    scaled_features = scaler.transform(input_features[['Quantity', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']])
    input_features[['Quantity', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']] = scaled_features

    # Predict using the model
    predicted_price = rf_model.predict(input_features)
    return predicted_price[0]


#### Saving the trained Model

In [None]:
#import joblib

# Save the trained RandomForest model, scaler, and label encoder
#joblib.dump(rf_model, 'random_forest_model.pkl')
#joblib.dump(scaler, 'scaler.pkl')
#joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder_stockcode.pkl']

## 3. Build the Streamlit App:


In [53]:
rf_model = joblib.load('C:\Desktop\Sales, Pricing, and Customer Behaviour Analysis\scaler.pkl')


In [None]:
import streamlit as st
import pandas as pd
import joblib
import numpy as np
# Load the trained model, scaler, and label encoder
#rf_model = joblib.load('random_forest_model.pkl')
#scaler = joblib.load('scaler.pkl')
#label_encoder = joblib.load('label_encoder.pkl')

# Title of the web app
st.title('Unit Price Prediction for Stock')

# Create input fields for the user
stock_code = st.text_input('Enter Stock Code (e.g., "85123A")', '85123A')
invoice_weekday = st.selectbox('Select Invoice Weekday', ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
invoice_hour = st.slider('Select Invoice Hour', 0, 23, 15)  # Default to 3 PM (15:00)
invoice_month = st.slider('Select Invoice Month', 1, 12, 11)  # Default to November

# Mapping weekdays to integers (0=Monday, 6=Sunday)
weekday_mapping = {
    'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
    'Friday': 4, 'Saturday': 5, 'Sunday': 6
}
invoice_weekday_int = weekday_mapping[invoice_weekday]

# When the user presses the button to predict
if st.button('Predict Unit Price'):
    # Encode the stock code
    stock_code_encoded = label_encoder.transform([stock_code])[0]

    # Create the input features (replace lagged values with actual user input if available)
    input_features = pd.DataFrame({
        'StockCode_Encoded': [stock_code_encoded],
        'Quantity': [0],  # You can replace this with actual input from the user
        'InvoiceHour': [invoice_hour],
        'InvoiceWeekday': [invoice_weekday_int],
        'InvoiceMonth': [invoice_month],
        'Quantity_Lagged_Avg': [0],  # Replace with actual input if available
        'UnitPrice_Lagged_Avg': [0],  # Replace with actual input if available
        'Quantity_UnitPrice_Lagged_Avg': [0]  # Replace with actual input if available
    })

    # Scale the features
    scaled_features = scaler.transform(input_features[['Quantity', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']])
    input_features[['Quantity', 'Quantity_Lagged_Avg', 'UnitPrice_Lagged_Avg']] = scaled_features

    # Make prediction
    predicted_price = rf_model.predict(input_features)

    # Display the predicted price to the user
    st.write(f"Predicted Unit Price for StockCode {stock_code} on {invoice_weekday} at {invoice_hour}:00 is: £{predicted_price[0]:.2f}")

