# Supermarket Sales Analysis

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

### Load the data

In [None]:
# Read the raw supermarket sales export into a DataFrame.
csv_path = 'supermarket_sales - Sheet1.csv'
df = pd.read_csv(csv_path)

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Join the date and time text columns and parse the result into a single
# timestamp, stored back in 'Date'.
timestamp_text = df['Date'] + ' ' + df['Time']
df['Date'] = pd.to_datetime(timestamp_text)

In [None]:
# 'Time' has already been folded into 'Date', so drop it and use the
# timestamp as the row index.
df = df.drop(columns="Time").set_index('Date')

In [None]:
# Show descriptive statistics (count, mean, std, quartiles, ...) for the columns.
df.describe()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

### Exploratory Data Analysis

In [None]:
# Mean invoice total for each customer type
totals_by_customer_type = df.groupby('Customer type')['Total']
df_customer_type = totals_by_customer_type.mean()
print(df_customer_type)

# Bar chart of those per-type means
df_customer_type.plot.bar(title='Average Total Amount Spent By Customer type ');

In [None]:
# Group the data by gender and calculate the average total amount spent
# by each gender.
df_genderr = df.groupby('Gender')['Total'].mean()
print(df_genderr)

# Visualize the per-gender averages as a bar chart.
df_genderr.plot(kind='bar', title='Average Total Amount Spent By Gender');

In [None]:
# Number of purchases in each product line, split by customer gender
ax = sns.countplot(data=df, y='Product line', hue='Gender')
ax.set_xlabel('Count')
ax.set_ylabel('Product Line');

In [None]:
# Unit prices within each product line, coloured by customer gender
grid = sns.catplot(
    data=df,
    x='Product line',
    y='Unit price',
    hue='Gender',
    aspect=3,
)
grid.set_axis_labels('Product Line', 'Unit Price');

In [None]:
# Mean unit price within each product line; the bare name on the last line
# makes the notebook display the resulting Series.
unit_price_by_line = df.groupby('Product line')['Unit price']
df_product_line_price = unit_price_by_line.mean()
df_product_line_price

In [None]:
# Bar chart of the mean unit price per product line
df_product_line_price.plot.bar(title='Average Unit Price For Each Product Line');

In [None]:
# Per-row gross margin as a fraction of the invoice total,
# (Total - cogs) / Total, stored in a new 'gross_margin' column.
gross_profit = df['Total'] - df['cogs']
df['gross_margin'] = gross_profit / df['Total']

# Overall figure: the unweighted mean of the per-row margins.
overall_gross_margin = df['gross_margin'].mean()
print('the overall gross margin is', overall_gross_margin)

In [None]:
# Sum of invoice totals per city
df_city_sales = df.groupby('City')['Total'].sum()

# Bar chart of the per-city sums, labelled through the returned Axes
city_ax = df_city_sales.plot.bar()
city_ax.set_title('Total Sales by City')
city_ax.set_xlabel('City')
city_ax.set_ylabel('Total Sales');

In [None]:
# Unit prices within each product line, coloured by city
grid = sns.catplot(
    data=df,
    x='Product line',
    y='Unit price',
    hue='City',
    aspect=3,
)
grid.set_axis_labels('Product Line', 'Unit Price');

In [None]:
# Create a bar chart of the total sales summed within each product line.
# (The chart drawn here is a bar chart, not a pie chart.)
df.groupby('Product line')['Total'].sum().plot(kind='bar')
plt.title('Sales by Product Line');

In [None]:
# Scatter plot of quantity purchased against unit price
unit_price, quantity = df['Unit price'], df['Quantity']
plt.scatter(unit_price, quantity)
plt.xlabel('Unit Price')
plt.ylabel('Quantity')
plt.title('Unit Price vs Quantity')

# Correlation matrix of the two columns, shown as the cell's output
price_quantity = df[["Unit price", "Quantity"]]
price_quantity.corr()

A correlation coefficient of **0.010778** indicates essentially no linear relationship between unit price and quantity. Although the sign is positive, the value is so close to zero that an increase in one variable is associated with almost no change in the other.

In [None]:
# Distribution of customer ratings: histogram with a KDE curve overlaid
sns.displot(data=df, x='Rating', kde=True);

In [None]:
# Mean of the 'gross margin percentage' column within each branch,
# displayed as the cell's output
branch_groups = df.groupby('Branch')
branch_groups['gross margin percentage'].mean()

In [None]:
# Purchases per product line, split by payment method
ax = sns.countplot(data=df, y='Product line', hue='Payment')
ax.set_xlabel('Count')
ax.set_ylabel('Product Line');

In [None]:
# Purchases per branch, split by payment method
ax = sns.countplot(data=df, y='Branch', hue='Payment')
ax.set_xlabel('Count')
ax.set_ylabel('Branch');

In [None]:
# Finding the most used payment method for each City
sns.countplot(y='City', hue="Payment", data=df)
plt.xlabel('Count')
# The y-axis shows cities, so label it accordingly.
plt.ylabel('City');

**Finding Which Branch has better sale for a particular Product Line**

In [None]:
# Purchases per product line, split by branch, to compare branches
# within each line
ax = sns.countplot(data=df, y='Product line', hue='Branch')
ax.set_xlabel('Count')
ax.set_ylabel('Product Line');

In [None]:
# Word cloud built from every row's product-line label joined into one
# text, drawn on a 200-dpi figure and saved to disk.
product_line_text = " ".join(df['Product line'])
wordcloud = WordCloud(background_color='White').generate(product_line_text)

plt.figure(dpi=200)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('cast.png');

### Data Modeling

We use K-Means clustering and PCA to segment customers based on their purchasing behavior and visualize the results in two-dimensional space. This type of analysis can be useful for businesses to understand the behavior of their customers and develop targeted marketing strategies.

In [None]:
# import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Features used for segmentation: two categorical attributes plus the
# invoice total.
df_clustering = df[['Customer type', 'Gender', 'Total']]

# One-hot encode the categorical columns so every feature is numeric.
df_clustering = pd.get_dummies(df_clustering)

# Project the encoded features onto two principal components.
# NOTE(review): the features are not scaled, so 'Total' presumably dominates
# the components over the 0/1 dummy columns -- consider standardizing first.
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_clustering)

# K-Means with 3 clusters. A fixed random_state keeps the cluster numbering
# and the per-segment means below reproducible between runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit the model and assign each row to a cluster in one step.
predicted_clusters = kmeans.fit_predict(df_pca)

# Add the cluster assignment as a new column.
df_clustering['Segment'] = predicted_clusters

# Average total amount spent within each segment.
df_segment_sales = df_clustering.groupby('Segment')['Total'].mean()
print(df_segment_sales)

# Plot the rows in PCA space, coloured by cluster.
plt.scatter(df_pca[:, 0], df_pca[:, 1], c=predicted_clusters)
plt.show()

We then use a Random Forest classification algorithm to predict customer ratings based on other attributes in the dataset. By predicting customer ratings, we can gain insights into the factors that influence customer satisfaction and potentially make improvements to our services to increase satisfaction.

In [None]:
# import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# create a new DataFrame with only the relevant columns
df_classification = df[['Customer type', 'Gender', 'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Payment', 'cogs', 'gross margin percentage', 'gross income', 'Rating']]

# apply one-hot encoding to the categorical columns
df_classification = pd.get_dummies(df_classification)

# convert the continuous target into 3 equal-width bins labelled 1, 2, 3
df_classification['Rating'] = pd.cut(df_classification['Rating'], bins=3, labels=[1, 2, 3])

# split the data into training and testing sets; the fixed random_state
# makes the split, and so the reported metrics, reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df_classification.drop('Rating', axis=1),
    df_classification['Rating'],
    test_size=0.2,
    random_state=42,
)

# initialize the Random Forest model (seeded for reproducible results)
rf = RandomForestClassifier(random_state=42)

# fit the model to the training data
rf.fit(X_train, y_train)

# predict the rating bins for the test data
predicted_ratings = rf.predict(X_test)

# calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, predicted_ratings)
print('Accuracy:', accuracy)

# generate a classification report to see precision, recall, f1-score for each class
class_report = classification_report(y_test, predicted_ratings)
print('Classification Report:\n', class_report)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Error metrics computed on the rating bin labels (1-3) held in y_test
# and predicted_ratings.
mse = mean_squared_error(y_test, predicted_ratings)
# RMSE is taken as the square root of MSE rather than via the `squared=False`
# argument, which is deprecated and removed in recent scikit-learn releases.
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, predicted_ratings)

print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

The Random Forest classification algorithm achieved an accuracy of 0.325, which is close to the one-in-three rate expected from guessing uniformly among the three rating classes. The model was therefore not able to predict customer ratings from the other attributes in the dataset. The classification report shows that precision, recall, and f1-score are low for every class, with the highest value being a precision of 0.41 for class 1. This suggests either that the attributes carry little information about the rating or that the model has not captured the relationship between them, and that further data preprocessing and feature engineering may be necessary to improve the model's performance. It may also be worth trying other classification algorithms or hyperparameter tuning to see whether they achieve better results.

Results vary from run to run when the train/test split and the forest are not seeded: in another run the classification report showed an overall accuracy of only 0.26. The precision, recall, and f1-score for each class were also low in that run, with the highest being 0.39 for class 1. In both runs the conclusion is the same: the model is not able to accurately predict the customer ratings from the other attributes in the dataset.

To improve the performance of the model, we could try using a different classification algorithm, such as SVM or logistic regression, or we could try optimizing the parameters of the Random Forest algorithm using techniques like grid search or random search. Additionally, we could consider adding more features to the dataset or performing feature engineering to better capture the factors that influence customer ratings.

In [None]:
# import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# create a new DataFrame with only the relevant columns
df_classification = df[['Customer type', 'Gender', 'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Payment', 'cogs', 'gross margin percentage', 'gross income', 'Rating']]

# apply one-hot encoding to the categorical columns
df_classification = pd.get_dummies(df_classification)

# convert the continuous target into 3 equal-width bins labelled 1, 2, 3
df_classification['Rating'] = pd.cut(df_classification['Rating'], bins=3, labels=[1, 2, 3])

# split the data into training and testing sets; the fixed random_state
# makes the split reproducible, so tuned and untuned results can be
# compared on the same rows
X_train, X_test, y_train, y_test = train_test_split(
    df_classification.drop('Rating', axis=1),
    df_classification['Rating'],
    test_size=0.2,
    random_state=42,
)

# initialize the Random Forest model (seeded so the search is repeatable)
rf = RandomForestClassifier(random_state=42)

# define the hyperparameters to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# exhaustive grid search with 5-fold cross-validation on the training set
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# print the best hyperparameters found
print('Best hyperparameters:', grid_search.best_params_)

# predict the rating bins for the test data using the best estimator
best_rf = grid_search.best_estimator_
predicted_ratings = best_rf.predict(X_test)

# calculate the accuracy of the predictions
accuracy = best_rf.score(X_test, y_test)
print('Accuracy:', accuracy)

# print a classification report
print(classification_report(y_test, predicted_ratings))

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Error metrics computed on the rating bin labels (1-3) held in y_test
# and predicted_ratings.
mse = mean_squared_error(y_test, predicted_ratings)
# RMSE is taken as the square root of MSE rather than via the `squared=False`
# argument, which is deprecated and removed in recent scikit-learn releases.
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, predicted_ratings)

print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)

The model's performance has improved slightly, with an increase in accuracy. However, the precision, recall, and F1-scores for each class are still relatively low, indicating that the model is not performing very well for any individual class. This suggests that more work may be needed to improve the model's performance, such as further feature engineering or trying different algorithms.

The MSE, RMSE, and MAE values should be read with care. They are computed on the three rating bin labels (1, 2, 3), so an error can never exceed 2 and the values look small regardless of model quality. A low RMSE here does not show that the model has low variance or that it predicts accurately; it only reflects the narrow label range. Combined with the low accuracy, precision, and recall above, these metrics do not indicate that the model is performing well in predicting customer ratings.

In [None]:
# import necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Predictor columns plus the regression target 'Total'.
# NOTE(review): 'Tax 5%', 'cogs' and 'gross income' look like quantities
# derived directly from the sale amount; if so they leak the target and
# inflate the score -- confirm and consider dropping them.
df_regression = df[['Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Payment', 'cogs', 'gross margin percentage', 'gross income', 'Total']]

# apply one-hot encoding to the categorical columns
df_regression = pd.get_dummies(df_regression)

# split the data into training and testing sets (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df_regression.drop('Total', axis=1),
    df_regression['Total'],
    test_size=0.2,
    random_state=42,
)

# initialize the Random Forest model (seeded for reproducibility)
rf = RandomForestRegressor(random_state=42)

# fit the model to the training data
rf.fit(X_train, y_train)

# predict the total sales for the test data
predicted_totals = rf.predict(X_test)

# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, so it is reported under that name.
r2 = rf.score(X_test, y_test)
accuracy = r2  # original variable name kept in case other cells read it
print('R^2 score:', r2)

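The score of 0.9999 is the coefficient of determination (R²) returned by `rf.score`, not a classification accuracy. It means the model explains almost all of the variance in Total sales on the test data. A score this close to 1 should be treated with caution, however: the input features include the tax, cost-of-goods-sold, and gross income columns, which appear to be computed directly from the sale amount, so the model may simply be reading the target back from its inputs rather than learning a relationship that would help predict total sales for new data or improve marketing and sales strategies.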

It is essential to bear in mind, though, that a single score is not the sole criterion to consider. Precision, recall, and F1 score apply to classification and are not meaningful for a regression model; for regression, metrics such as mean absolute error (MAE), root mean squared error (RMSE), and R² should be used, ideally together with cross-validation. These metrics can assist in evaluating the model's performance and in making adjustments to improve it.

In [None]:
# Build the feature matrix for every row of the full dataset. This is not
# unseen data: it is the same DataFrame the regressor's train/test split was
# drawn from, so the predictions below include rows the model was trained on.
df_new = df[['Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Payment', 'cogs', 'gross margin percentage', 'gross income']]

# apply one-hot encoding to the categorical columns
df_new = pd.get_dummies(df_new)

# predict the total sales for every row of the dataset
predicted_totals = rf.predict(df_new)

# print the predicted total sales
print(predicted_totals)

In [None]:
# Wrap the predictions in a one-column DataFrame that shares the original
# dataset's index, then write it to disk.
predicted_df = pd.DataFrame(
    {'predicted_total_sales': predicted_totals},
    index=df.index,
)
predicted_df.to_csv('predicted_totals.csv')

In [None]:
# Histogram (10 bins) of the predicted totals
predicted_sales = predicted_df['predicted_total_sales']
plt.hist(predicted_sales, bins=10)
plt.title('Histogram of Predicted Sales');

In [None]:
# Predicted totals (x) against the actual 'Total' values (y)
predicted_sales = predicted_df['predicted_total_sales']
actual_sales = df['Total']
plt.scatter(predicted_sales, actual_sales, linewidth=0.1)
plt.title('Predicted vs Actual Sales')
plt.xlabel('Predicted Sales')
plt.ylabel('Actual Sales');

In [None]:
# Bar chart of predicted sales summed per product line.
# Passing the raw rows to plt.bar would draw one bar per row, all stacked on
# top of each other at the few product-line positions, so no aggregate would
# be shown. Sum the predictions within each product line first.
# Grouping by the raw label array avoids aligning two Series on the shared
# index, which may contain duplicate timestamps.
predicted_by_line = (
    predicted_df['predicted_total_sales']
    .groupby(df['Product line'].to_numpy())
    .sum()
)
plt.bar(predicted_by_line.index, predicted_by_line.to_numpy())
plt.xticks(rotation=90)
plt.xlabel('Product Line')
plt.ylabel('Predicted Sales')
plt.title('Predicted Sales by Product Line');

### Conclusion

Based on the analysis of the supermarket dataset, the following recommendations can be made:

To maximize profits, efforts should be focused on increasing the average unit price and gross margin percentage. Additionally, promoting product lines that have high quantity sold can also boost revenue.

Targeting specific cities and customer types can increase the number of transactions. Improving the supermarket rating by gender can also lead to increased customer satisfaction and loyalty.

Implementing strategies to minimize the amount of tax paid can improve overall profitability.

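The Random Forest regression algorithm scored very highly when predicting the total of each transaction (the model predicts per-transaction totals, not totals per city). Before it is used for sales forecasting, the feature set should be reviewed: several inputs, such as tax, cost of goods sold, and gross income, appear to be derived directly from the total, which would inflate the score. Incorporating other relevant attributes that are known before a sale is made can further improve prediction accuracy and aid in making informed business decisions.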

However, the Random Forest classification algorithm performed relatively poorly in predicting customer ratings. It is recommended to explore alternative algorithms or incorporate additional relevant data to improve prediction accuracy and gain insights into customer preferences for enhancing customer satisfaction and loyalty.

Overall, these recommendations can help the supermarket make data-driven decisions and allocate resources effectively, leading to improved profitability and customer satisfaction.