In [1]:
# Importing libraries
# All notebook-wide imports live in this cell, grouped by purpose.

# Standard library
import re

# Data handling and numerics
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
from scipy.stats import ttest_ind, boxcox, pearsonr

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine learning
import sklearn
from sklearn import datasets, model_selection, metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from catboost import CatBoostRegressor

# Deep learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Prophet is published as `prophet` (v1.0+); older installs expose the same
# class from `fbprophet`. Importing both unconditionally fails unless both
# distributions are installed, so fall back to the legacy name only if needed.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

print("Libraries successfully imported!")


### Understanding the Data

* Invoice id: Unique identifier for each transaction
* Branch: Identifies the branch (A, B, C)
* City: Location of the branches
* Customer type: Member or Normal
* Gender: Gender of the customer
* Product line: Category of the product
* Unit price: Price per unit of the product
* Quantity: Number of units purchased
* Tax: 5% tax on the total price
* Total: Total price including tax
* Date: Date of purchase
* Time: Time of purchase
* Payment: Payment method used
* COGS: Cost of goods sold
* Gross margin percentage: Gross margin percentage
* Gross income: Gross income
* Rating: Customer rating

#### Loading and preparing the data


In [None]:
# Read the pre-cleaned sales CSV (one level above the notebook) and preview it
csv_path = "../df_cleaned.csv"
df = pd.read_csv(csv_path)
df.head()

: 

### Clean the data

* Convert the date column to datetime format.
* Extract year, month, and day from the date column to use these as features in our regression models.
* Aggregate gross income by date: group the data by date and sum the gross income for each day to get a daily series.

In [None]:
# Calendar components that are derived from the date and later used as features
date_parts = ('year', 'month', 'day')

# Parse the raw date strings into proper datetimes
df['date'] = pd.to_datetime(df['date'])

# Add year / month / day columns to the transaction-level frame
for part in date_parts:
    df[part] = getattr(df['date'].dt, part)

# Collapse transactions to one row per calendar day (sum of gross income)
daily_totals = df.groupby('date')['gross_income'].sum()
gross_income_by_date = daily_totals.reset_index()

# Add the same calendar components to the daily frame
for part in date_parts:
    gross_income_by_date[part] = getattr(gross_income_by_date['date'].dt, part)

# Preview the aggregated daily data
gross_income_by_date.head()

: 

In [None]:
# Summarise column dtypes and non-null counts of the transaction-level frame
# (at this point it also contains the derived year / month / day columns).
df.info()

: 

## Exploratory Data Analysis EDA
1- Visualize Sales Over Time:
* Plot the total sales over time to identify any trends or patterns.

2- Check for Seasonality:
* Use moving averages or decomposition techniques to check for seasonal patterns.

In [None]:
# Every numeric column except the target itself becomes an x-axis variable
target_column = 'gross_income'
numeric_columns = [
    column
    for column in df.select_dtypes(include=['number']).columns.tolist()
    if column != target_column
]

# One row of scatter plots: gross_income (y) against each remaining numeric column (x)
sns.pairplot(
    df,
    y_vars=[target_column],
    x_vars=numeric_columns,
    kind='scatter',
    height=3,
)
plt.show()

: 

****
1. **Unit Price vs. Gross Income**:

There is a clear positive relationship, suggesting that as unit price increases, gross income also tends to increase.
This variable could be a strong predictor for a regression model focusing on revenue or profitability.

2. **Quantity vs. Gross Income**:

The relationship shows distinct levels of gross income increasing with quantity, though the steps are discrete (likely due to distinct quantity values in sales transactions).
It suggests quantity is a good predictor for gross income, with more items sold typically increasing total gross income.

3. **Tax 5% vs. Gross Income**:

There is a perfect linear relationship, as expected, because tax is a percentage of the total or gross sales, indicating that this is not an independent predictor but rather a direct calculation from the gross income.

4. **Total vs. Gross Income**:

Another perfect linear relationship, indicating that total sales directly influence gross income (gross income is likely derived directly from total sales values).

5. **COGS (Cost of Goods Sold) vs. Gross Income**:

This shows a linear relationship as well, highlighting that as COGS increases, gross income also increases. This could imply that higher-cost items or more items (higher COGS) are driving higher income, which could be a critical factor in profitability analysis.

6. **Gross Margin Percentage vs. Gross Income**:

There is a high concentration of data points at a specific gross margin percentage value, showing very little variation in gross margin percentage across different levels of gross income. This suggests that the gross margin percentage is relatively stable or set by policy/pricing strategy and might not be a dynamic predictor in the model.

7. **Rating vs. Gross Income**:

The relationship is scattered with no clear trend between customer ratings and gross income, indicating that customer satisfaction as measured by ratings does not directly correlate to the gross income generated.
It might not be a significant predictor of gross income.

In [None]:
import matplotlib.pyplot as plt

# Daily gross income as a line chart, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(
    gross_income_by_date['date'],
    gross_income_by_date['gross_income'],
    marker='o',
)
ax.set_title('Gross Income Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Gross Income')
ax.grid(True)
plt.show()

: 

### Next Steps in EDA
1- Seasonal Decomposition:
* Perform a seasonal decomposition to understand the trend, seasonality, and residual components.

2- Moving Average:
* Plot a moving average to smooth out short-term fluctuations and highlight longer-term trends.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Length of one assumed seasonal cycle, in daily observations (roughly a month).
SEASONAL_PERIOD_DAYS = 30

# Index the series by its date so the decomposition components carry real dates.
# Without this the series has a plain integer index and the x-axis labelled
# "Date" below would actually show row positions.
gross_income_series = gross_income_by_date.set_index('date')['gross_income']

# Additive decomposition: observed = trend + seasonal + residual.
# `period` is passed explicitly because the date index has no declared frequency.
decomposition = seasonal_decompose(
    gross_income_series,
    model='additive',
    period=SEASONAL_PERIOD_DAYS,
)

# One panel per component, sharing the date axis
fig, axes = plt.subplots(4, 1, figsize=(8, 6), sharex=True)
components = [
    ('Observed', decomposition.observed),
    ('Trend', decomposition.trend),
    ('Seasonal', decomposition.seasonal),
    ('Residual', decomposition.resid),
]
for ax, (label, component) in zip(axes, components):
    ax.plot(component)
    ax.set_ylabel(label)

axes[0].set_title('Seasonal Decomposition')
axes[-1].set_xlabel('Date')
fig.autofmt_xdate()  # rotate date tick labels so they do not overlap

plt.show()

: 

##### The seasonal decomposition plot shows the observed data, trend, seasonal component, and residuals. Here's a summary of the components:
- **Observed:** The original Gross income data.
- **Trend:** The overall direction of the data over time.
- **Seasonal:** Repeating patterns at a fixed period (monthly in this case).
- **Residual:** The remaining variations after removing the trend and seasonal components.

### Feature engineering
1. Gender and city
2. Productline type gender
3. Customer type

### Model Development 
#### Model 1: Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Features are the calendar components of each day; the target is that day's gross income
X = gross_income_by_date[['year', 'month', 'day']]
y = gross_income_by_date['gross_income']

# Chronological 80/20 split: the earliest 80% of days train the model and the
# most recent 20% are held out. Rows are already ordered by date (groupby output),
# so the split is positional and deliberately not shuffled.
train_size = int(len(X) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Fix the seed: the tree permutes features at each split, so without it tied
# splits can resolve differently and the RMSE changes from run to run.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Predict the held-out days
dt_predictions = dt_model.predict(X_test)

# Root mean squared error on the held-out days
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_predictions))
dt_rmse

: 

**Decision Tree Regressor:**

* RMSE: 87.78
* Explanation: The Decision Tree Regressor overfits the data, capturing noise and fluctuations, leading to higher RMSE.

### Model 2: Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forests bootstrap rows and randomise splits; fix the seed so the
# reported RMSE can be reproduced on a fresh run.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the held-out days
rf_predictions = rf_model.predict(X_test)

# Root mean squared error on the held-out days
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))
rf_rmse

: 

**Random Forest Regressor:**

* RMSE: 83.1
* Explanation: The Random Forest Regressor improves performance by averaging multiple trees, reducing overfitting compared to a single decision tree.

### Model 3: Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Seed the boosted trees so the reported RMSE can be reproduced on a fresh run.
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# Predict the held-out days
gb_predictions = gb_model.predict(X_test)

# Root mean squared error on the held-out days
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_predictions))
gb_rmse

: 

**Gradient Boosting Regressor:**

* RMSE: 94.92
* Explanation: Gradient Boosting can sometimes overfit if not properly tuned, resulting in higher RMSE in this case.

### Model 4: K-Nearest Neighbors Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a 5-nearest-neighbours regressor on the training days
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Score it on the held-out days: predict, then take the root of the mean squared error
knn_predictions = knn_model.predict(X_test)
knn_mse = mean_squared_error(y_test, knn_predictions)
knn_rmse = np.sqrt(knn_mse)
knn_rmse

: 

**K-Nearest Neighbors Regressor:** 

* RMSE: 79.58
* Explanation: The K-Nearest Neighbors Regressor provides the best performance by averaging the nearest neighbors, leading to the lowest RMSE.

### Model 5: Prophet

In [None]:

from sklearn.metrics import mean_squared_error
from numpy import sqrt  # kept as a notebook-level name; later cells may rely on it

# Prophet expects exactly two columns: `ds` (timestamp) and `y` (value).
# Build them in a separate frame so gross_income_by_date is left untouched and
# this cell can be re-run safely.
df_prophet = pd.DataFrame({
    'ds': pd.to_datetime(gross_income_by_date[['year', 'month', 'day']]),
    'y': gross_income_by_date['gross_income'],
})

# Chronological 80/20 split, matching the split used for the other models
train_size = int(len(df_prophet) * 0.8)
train_data = df_prophet.iloc[:train_size]
test_data = df_prophet.iloc[train_size:]

# Initialize and fit the Prophet model
# NOTE(review): daily_seasonality models within-day cycles; with one observation
# per day it presumably adds nothing — confirm whether it should stay enabled.
model = Prophet(daily_seasonality=True)
model.fit(train_data)

# Predict on the actual held-out dates. Generating a consecutive daily calendar
# with make_future_dataframe only lines up with the test rows when no calendar
# day is missing from the data; predicting on the real `ds` values always does.
forecast = model.predict(test_data[['ds']])

# Predicted values, one per held-out day, in the same order as test_data
predictions = forecast['yhat']

# Root mean squared error on the held-out days (compared by position)
test_y = test_data['y'].reset_index(drop=True)
prophet_rmse = sqrt(mean_squared_error(test_y, predictions))
prophet_rmse

: 

### Model 6: CatBoost

In [None]:
# CatBoost on the same calendar features and chronological split as the other
# tree models. Model-specific names are used so this cell does not overwrite
# the Prophet cell's `model` / `predictions`.
catboost_model = CatBoostRegressor(
    iterations=1000,
    depth=3,
    learning_rate=0.1,
    loss_function='RMSE',
    verbose=False,
)

# Train model
catboost_model.fit(X_train, y_train)

# Predict the held-out days
catboost_predictions = catboost_model.predict(X_test)

# Root mean squared error on the held-out days. np.sqrt comes from the imports
# cell, so this cell no longer depends on a `sqrt` name imported in another cell.
catboost_rmse = np.sqrt(mean_squared_error(y_test, catboost_predictions))
catboost_rmse

: 

### Model 7: LightGBM

In [None]:
from lightgbm import LGBMRegressor

# Relies on X_train, X_test, y_train, y_test produced by the Decision Tree cell

# LightGBM settings gathered in one place
lgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'num_leaves': 31,
}

# Build and train the regressor
lgb_model = LGBMRegressor(**lgb_params)
lgb_model.fit(X_train, y_train)

# Score on the held-out days: predict, then root of the mean squared error
lgb_predictions = lgb_model.predict(X_test)
lgb_mse = mean_squared_error(y_test, lgb_predictions)
lgb_rmse = np.sqrt(lgb_mse)
lgb_rmse

: 

### Model 8: LSTM (Long Short-Term Memory) Networks

: 

### Model 9: XGBoost

: 

### All results

In [None]:
# Collect every model's held-out RMSE in one table.
# The table gets its own name: rebinding `df` here would overwrite the raw
# sales data, so re-running any earlier cell afterwards would fail.
rmse_results = pd.DataFrame({
    "Model": ["Decision Tree", "Random Forest", "Gradient Boosting", "KNN", "Prophet", "CatBoost", "LightGBM"],
    "RMSE": [dt_rmse, rf_rmse, gb_rmse, knn_rmse, prophet_rmse, catboost_rmse, lgb_rmse],
})

# Lowest error first, so the best model is at the top of the table
rmse_results = rmse_results.sort_values("RMSE").reset_index(drop=True)

# Last expression of the cell: rendered as a rich table
rmse_results

: 

### Model Evaluation
* Decision Tree Regressor RMSE: 87.78
* Random Forest Regressor RMSE: 83.1
* Gradient Boosting Regressor RMSE: 94.92
* K-Nearest Neighbors Regressor RMSE: 79.58

### Hyperparameter Tuning
#### Grid Search


In [None]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# The training rows are in chronological order, so each fold must train on
# earlier days and validate on later ones. Plain k-fold (cv=5) would let the
# model see future days while being scored on past ones.
time_series_cv = TimeSeriesSplit(n_splits=5)

# Set up Grid Search; the forest is seeded so the selected parameters and the
# score can be reproduced on a fresh run.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=time_series_cv,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Best parameters and model. best_score_ is the mean negative MSE across folds,
# so negate it before taking the square root.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_rmse = np.sqrt(-grid_search.best_score_)
best_params, best_rmse

: 

#### Random Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

# Define the parameter distribution for Random Forest
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Chronological folds: always train on earlier days, validate on later ones
time_series_cv = TimeSeriesSplit(n_splits=5)

# Set up Random Search. Both the sampler and the forest are seeded; otherwise
# the 10 sampled candidates (out of 12 combinations) and their scores change
# on every run.
random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_dist,
    n_iter=10,
    cv=time_series_cv,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best parameters and model. best_score_ is the mean negative MSE across folds,
# so negate it before taking the square root.
best_random_params = random_search.best_params_
best_random_model = random_search.best_estimator_
best_random_rmse = np.sqrt(-random_search.best_score_)
best_random_params, best_random_rmse

: 

### Model Evaluation
* Decision Tree Regressor RMSE: 87.78
* Random Forest Regressor RMSE: 83.1
* Gradient Boosting Regressor RMSE: 94.92
* K-Nearest Neighbors Regressor RMSE: 79.58

Among the four RMSE values listed above, the K-Nearest Neighbors (KNN) Regressor has the lowest (79.58), indicating the best performance among those models. Prophet, CatBoost, and LightGBM are not listed here; their RMSEs appear in the results table and should be checked against this figure. On that basis, the KNN Regressor is the model used for forecasting in this project.

### Forecasting with Best Model KNN

#### Forecast the next 7 days using the K-Nearest Neighbors Regressor


In [None]:
from sklearn.base import clone

# Number of days to forecast beyond the last observed date
FORECAST_HORIZON_DAYS = 7

# knn_model was fitted on the first 80% of days only (for evaluation).
# For the real forecast, refit an identically configured copy on the full
# history so the most recent days also inform the prediction. knn_model
# itself is left unchanged.
final_knn_model = clone(knn_model).fit(X, y)

# The forecast window starts the day after the last observed date
last_date = gross_income_by_date['date'].max()
forecast_dates = pd.date_range(
    start=last_date + pd.Timedelta(days=1),
    periods=FORECAST_HORIZON_DAYS,
    freq='D',
)

# Same calendar features the model was trained on
forecast_features = pd.DataFrame({
    'date': forecast_dates,
    'year': forecast_dates.year,
    'month': forecast_dates.month,
    'day': forecast_dates.day
})
forecast_features['gross_income'] = final_knn_model.predict(
    forecast_features[['year', 'month', 'day']]
)
forecast_features[['date', 'gross_income']]

: 

### **CONCLUSION**

* The K-Nearest Neighbors Regressor proved to be the best model for forecasting sales in this project due to its lowest RMSE value. 
* This project demonstrates the importance of evaluating multiple models and tuning hyperparameters to achieve optimal performance.
* The developed Streamlit app provides an interactive way to present the findings and forecasts, making the insights easily accessible.