In [1]:
# Importing basic libraries for data manipulation and analysis
import numpy as np                # For numerical operations
import pandas as pd # For data manipulation and analysis

# Importing libraries for data visualization
import matplotlib.pyplot as plt   # For basic plotting
import seaborn as sns             # For advanced and aesthetically pleasing visualizations

# Importing libraries for statistical analysis
import scipy.stats as stats       # For statistical tests and distributions
from statsmodels.tsa.stattools import adfuller  # For time series analysis (stationarity test)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf  # For autocorrelation and partial autocorrelation plots

# Importing libraries for machine learning (optional, for advanced EDA)
from sklearn.preprocessing import LabelEncoder, StandardScaler  # For preprocessing
from sklearn.model_selection import train_test_split  # For splitting data into train/test sets
from sklearn.decomposition import PCA  # For dimensionality reduction

# Configure how pandas renders DataFrames in cell output
pd.options.display.max_columns = None  # no cap on columns: wide frames are shown in full
pd.options.display.max_rows = 100  # show at most 100 rows before pandas truncates the view
# Apply the "whitegrid" seaborn theme to all subsequent plots.
# set_theme is the preferred name; sns.set is only a legacy alias that forwards to it
# (requires seaborn >= 0.11).
sns.set_theme(style="whitegrid")

# Warnings
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings to keep the output clean


# Training Models (Regression)

In [167]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [168]:
# Step 3: Sort by transaction_date and use it as the index.
# The guard keeps this cell re-runnable: after the first run transaction_date is the
# index and no longer a column, so an unconditional set_index would raise KeyError.
# Chaining (instead of inplace=True) builds the result in a single expression.
if sales_df.index.name != 'transaction_date':
    sales_df = (
        sales_df
        .sort_values(by=['transaction_date'])
        .set_index('transaction_date')
    )

In [169]:
# Preview the first five rows to confirm the new transaction_date index
sales_df.head(n=5)

Unnamed: 0_level_0,transaction_id,product_id,product_category,quantity,unit_price,discount_applied,sales,total_sales,store_location,total_transactions,avg_transaction_value,avg_purchase_value,purchase_frequency,avg_items_per_transaction,total_items_purchased,total_returned_items,total_returned_value,total_discounts_received,avg_discount_used,max_single_purchase_value,min_single_purchase_value
transaction_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,368952,4276,Furniture,8,507.35,0.01,4058.8,9462.01,Location C,38,342.34,96.66,Daily,3.62,405,3,307.43,303.01,0.19,644.79,8.04
2020-01-01,293896,9720,Groceries,8,451.1,0.22,3608.8,1756.51,Location D,56,153.7,62.59,Daily,5.17,1,6,183.13,754.02,0.38,313.47,6.76
2020-01-01,548574,5719,Groceries,2,631.91,0.1,1263.82,6235.58,Location D,81,23.07,294.21,Weekly,8.71,421,6,1.14,149.86,0.41,787.22,4.22
2020-01-01,454599,1282,Furniture,8,897.81,0.37,7182.48,4417.82,Location C,14,252.99,27.2,Daily,1.23,82,3,867.04,702.53,0.0,100.41,9.14
2020-01-01,539839,8117,Clothing,4,307.83,0.17,1231.32,3019.91,Location B,11,477.41,476.41,Yearly,1.92,228,4,752.21,565.08,0.12,762.55,9.76


In [175]:
# Display the frame without the two identifier columns.
# NOTE: drop returns a new DataFrame; the result is only displayed here and is not
# assigned, so sales_df itself still contains transaction_id and product_id.
sales_df.drop(columns=['transaction_id', 'product_id'])

Unnamed: 0_level_0,product_category,quantity,unit_price,discount_applied,sales,total_sales,store_location,total_transactions,avg_transaction_value,avg_purchase_value,purchase_frequency,avg_items_per_transaction,total_items_purchased,total_returned_items,total_returned_value,total_discounts_received,avg_discount_used,max_single_purchase_value,min_single_purchase_value
transaction_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2020-01-01,Furniture,8,507.35,0.01,4058.80,9462.01,Location C,38,342.34,96.66,Daily,3.62,405,3,307.43,303.01,0.19,644.79,8.04
2020-01-01,Groceries,8,451.10,0.22,3608.80,1756.51,Location D,56,153.70,62.59,Daily,5.17,1,6,183.13,754.02,0.38,313.47,6.76
2020-01-01,Groceries,2,631.91,0.10,1263.82,6235.58,Location D,81,23.07,294.21,Weekly,8.71,421,6,1.14,149.86,0.41,787.22,4.22
2020-01-01,Furniture,8,897.81,0.37,7182.48,4417.82,Location C,14,252.99,27.20,Daily,1.23,82,3,867.04,702.53,0.00,100.41,9.14
2020-01-01,Clothing,4,307.83,0.17,1231.32,3019.91,Location B,11,477.41,476.41,Yearly,1.92,228,4,752.21,565.08,0.12,762.55,9.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31,Furniture,9,263.35,0.13,2370.15,7071.70,Location C,27,287.42,216.81,Monthly,6.10,128,5,2.89,366.69,0.16,714.51,9.89
2021-12-31,Furniture,2,913.00,0.06,1826.00,1292.10,Location A,58,400.63,363.90,Monthly,6.48,425,4,144.08,761.24,0.03,936.53,5.54
2021-12-31,Groceries,9,437.98,0.17,3941.82,2844.66,Location D,79,155.00,362.59,Weekly,2.55,463,7,656.45,358.32,0.49,438.21,4.17
2021-12-31,Groceries,1,32.81,0.00,32.81,4733.69,Location B,72,12.35,80.84,Daily,6.12,344,7,592.19,333.72,0.24,690.71,3.97
