# Description of the Notebook: 
## Purpose:
#### This Jupyter Notebook analyzes sales data from a retail store using Python libraries such as Pandas, NumPy, Matplotlib, and Seaborn. The key objectives include:
##### >Data Cleaning: Identifying and removing duplicates, handling missing values, and ensuring data consistency.
##### >Exploratory Data Analysis (EDA): Visualizing trends and patterns related to sales, profits, and customer behavior.
##### >Insights Extraction: Gaining actionable insights to understand product performance and optimize sales strategies.



## Importing required libraries

In [2]:
from numpy import *
from pandas import *
import seaborn as sns
import matplotlib.pyplot as plt



## Reading data 

In [3]:
# Load the superstore sales CSV into a DataFrame.
# The location can be overridden with the SUPERSTORE_CSV environment variable so the
# notebook can run on machines where the original author's path does not exist;
# when the variable is unset, the original path is used unchanged.
import os

Sales_data_path = os.environ.get(
    "SUPERSTORE_CSV",
    r"C:\Users\DELL\Documents\Sales\superstore_dataset.csv",
)
Sales_data = read_csv(Sales_data_path)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\DELL\\Documents\\Sales\\superstore_dataset.csv'

In [None]:
Sales_data.head()

In [None]:
Sales_data.shape

# Data Cleaning and Preprocessing

In [None]:
Sales_data.columns

In [None]:
# Checking the data for null values
Sales_data.isnull().sum()

In [None]:
#Checking the data for duplicate values
Sales_data.duplicated().sum()

In [None]:
# Dropping duplicate values 
Sales_data=Sales_data.drop_duplicates()

In [None]:
Sales_data.shape

In [None]:
# Box plots of every numeric column, drawn side by side in one row,
# as a visual check for outliers.
sns.set(style="whitegrid")
numerical_cols = Sales_data.select_dtypes(include=['number'])
panel_count = len(numerical_cols.columns)

plt.figure(figsize=(15, 8))  # wide canvas: one panel per numeric column
for position, column in enumerate(numerical_cols.columns, start=1):
    # Subplot positions are 1-based, hence start=1 above.
    plt.subplot(1, panel_count, position)
    sns.boxplot(y=Sales_data[column], color="skyblue")
    plt.title(column)
    plt.xlabel("Value")
    plt.ylabel(column)

In [None]:
# Pairwise scatter plots of the numeric columns, as a second visual check for outliers.
numeric_features = Sales_data.select_dtypes(include='number')
sns.pairplot(numeric_features)
plt.show()

In [None]:
Sales_data.info()

In [None]:
# Convert the two date columns to datetime and the discount column to text.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
# NOTE(review): coerced NaT values would appear after the earlier null check — confirm none are produced.
for date_column in ("order_date", "ship_date"):
    Sales_data[date_column] = to_datetime(Sales_data[date_column], errors='coerce')

# Stored as strings, so discount is no longer a numeric column from here on.
# NOTE(review): presumably done so discount acts as a category when grouping/plotting — confirm.
Sales_data["discount"] = Sales_data["discount"].astype(str)

In [None]:
# Derive calendar features (Year, Month, Day) from the order date.
order_date_parts = Sales_data['order_date'].dt
Sales_data['Year'] = order_date_parts.year
Sales_data['Month'] = order_date_parts.month
Sales_data['Day'] = order_date_parts.day

In [None]:
Sales_data.info()

# The previous section included cleaning the data to be ready for analysis and visualisation.
### >The data included 0 null values.
### >There are no outliers in the data.
### >1 row dropped due to duplicated values.
### >Date columns were converted to datetime, and new features (Year, Month, Day) were created from the order date to support time series analysis. 

# 

# Exploratory data analysis (EDA)

In [None]:
# Some statistical values for numerical features
Sales_data.describe().T

### The effect of Discount on both sales and profits

In [None]:
#Defining the number of unique values in discount feature
Sales_data["discount"].unique()

In [None]:
# Average sales and profit at each discount level.
# The two columns are selected with a list (double brackets): selecting several
# columns from a GroupBy with a bare tuple was deprecated and later removed from pandas.
Discount_Sales_Profit_relation = Sales_data.groupby(["discount"])[["sales", "profit"]].mean()

In [None]:
Discount_Sales_Profit_relation

In [None]:
# Line chart of average sales and average profit against the discount level.
discount_levels = Discount_Sales_Profit_relation.index
average_sales = Discount_Sales_Profit_relation["sales"]
average_profit = Discount_Sales_Profit_relation["profit"]

plt.plot(discount_levels, average_sales, color='skyblue', alpha=0.7, label='Sales')
plt.plot(discount_levels, average_profit, color='red', marker='o', label='Profits')
plt.xlabel("Discount")
plt.ylabel("Sales-Profits")
plt.title("Average sales and Profits over Discount")
plt.legend(loc='best')
plt.show()

### From the previous relation: as the discount increases, sales increase but profits decrease.
### As a result, we shouldn't increase the discount above 20%, to avoid negative profits.

# 

### Sum sales and profits per each Segment 

In [None]:
# number of unique values per each segment
Sales_data["segment"].unique()

In [None]:
# Total sales and total profit for each customer segment.
# The two columns are selected with a list (double brackets): selecting several
# columns from a GroupBy with a bare tuple was deprecated and later removed from pandas.
Sales_Profits_Segment = Sales_data.groupby(["segment"])[["sales", "profit"]].sum()

In [None]:
Sales_Profits_Segment

In [None]:
# Total sales (bars) and total profit (line) for each segment on a shared axis.
# NOTE(review): "_classic_test_patch" is an underscore-prefixed matplotlib style;
# kept as in the original — confirm it is the intended look.
plt.style.use("_classic_test_patch")
plt.bar(Sales_Profits_Segment.index, Sales_Profits_Segment["sales"], color='skyblue', alpha=0.7, label='Sales')
plt.plot(Sales_Profits_Segment.index, Sales_Profits_Segment["profit"], color='red', marker='o', label='Profit')
plt.title("Sales and Profits over Segment")
plt.ylabel("Sales-Profits")
# Axis label previously read "Segement " (misspelled, with a trailing space).
plt.xlabel("Segment")
plt.legend(loc='best')
plt.show()

## From previous relation: 
### >Consumer has the highest sales and profits.
### >Home office has the lowest sales and profit.

# 

### Sales, Profits and N of orders per each category 

In [None]:
# N of unique categories
Sales_data["category"].unique()

In [None]:
# Number of distinct orders in each category.
# nunique() counts every order_id once per category; count() would count rows, which
# overstates the number of orders whenever one order spans several rows.
# If order_id never repeats, both give the same result.
Orders_per_each_category = (
    Sales_data.groupby(["category"])["order_id"]
    .nunique()
    .to_frame(name="N of orders")
)

In [None]:
Orders_per_each_category

In [None]:
# Pie chart of each category's share of orders.
plt.pie(Orders_per_each_category["N of orders"], labels=Orders_per_each_category.index)
# Title previously misspelled "Caategory".
plt.title("N of orders per each Category")
# Render explicitly, as the other plotting cells do.
plt.show()

In [None]:
# Total sales and total profit for each category.
# The two columns are selected with a list (double brackets): selecting several
# columns from a GroupBy with a bare tuple was deprecated and later removed from pandas.
Profits_per_each_category = Sales_data.groupby(["category"])[["sales", "profit"]].sum()

In [None]:
# Total sales (bars) and total profit (line) for each category on a shared axis.
plt.style.use("_classic_test_patch")
category_names = Profits_per_each_category.index
plt.bar(category_names, Profits_per_each_category["sales"], color='skyblue', alpha=0.7, label='Sales')
plt.plot(category_names, Profits_per_each_category["profit"], color='red', marker='o', label='Profit')
plt.xlabel("Category")
plt.ylabel("Sales-Profits")
plt.title("Sales and Profits over Category")
plt.legend(loc='best')
plt.show()

### From the previous relations:
#### >Office supplies have the highest number of orders and moderate sales and profits.
#### >Technology has the highest sales and profits.

# 

### Subcategories with highest sales 

In [None]:
# The five subcategories with the highest total sales (total profit is kept alongside).
# The two columns are selected with a list (double brackets): selecting several
# columns from a GroupBy with a bare tuple was deprecated and later removed from pandas.
Top_5_subcategories = (
    Sales_data.groupby(["subcategory"])[["sales", "profit"]]
    .sum()
    .sort_values(by="sales", ascending=False)
    .head(5)
)

In [None]:
Top_5_subcategories

In [None]:
# Total sales (bars) and total profit (line) for the five best-selling subcategories.
plt.style.use("_classic_test_patch")
top_names = Top_5_subcategories.index
plt.bar(top_names, Top_5_subcategories["sales"], color='skyblue', alpha=0.7, label='Sales')
plt.plot(top_names, Top_5_subcategories["profit"], color='red', marker='o', label='Profit')
plt.xlabel("Product ")
plt.ylabel("Sales-Profits")
plt.title("Top_5_subcategories")
plt.legend(loc='best')
plt.show()

In [None]:
# The five subcategories with the lowest total sales (total profit is kept alongside).
# The two columns are selected with a list (double brackets): selecting several
# columns from a GroupBy with a bare tuple was deprecated and later removed from pandas.
Subcategories_with_Lowest_sales = (
    Sales_data.groupby(["subcategory"])[["sales", "profit"]]
    .sum()
    .sort_values(by="sales", ascending=True)
    .head(5)
)

In [None]:
Subcategories_with_Lowest_sales

In [None]:
# Total sales (bars) and total profit (line) for the five lowest-selling subcategories.
plt.style.use("_classic_test_patch")
lowest_names = Subcategories_with_Lowest_sales.index
plt.bar(lowest_names, Subcategories_with_Lowest_sales["sales"], color='skyblue', alpha=0.7, label='Sales')
plt.plot(lowest_names, Subcategories_with_Lowest_sales["profit"], color='red', marker='o', label='Profit')
plt.xlabel("Product")
plt.ylabel("Sales-Profits")
plt.title("Subcategories with Lowest Sales")
plt.legend(loc='best')
plt.show()

### From previous relations :
#### >Phones and Chairs have the highest sales and profits.
#### >Fasteners and labels have the lowest sales and profits.

# 

### Customers purchasing behavior and customers with highest sales  

In [None]:
# N of customers
Sales_data["customer"].nunique()

In [None]:
# Identifying the five customers with the largest total sales, highest first.
sales_per_customer = Sales_data.groupby(["customer"])["sales"].sum()
Customers_with_highest_sales = (
    sales_per_customer.to_frame(name="Total Sales")
    .sort_values(by="Total Sales", ascending=False)
    .head(5)
)

In [None]:
# Bar chart of total sales for the top five customers.
Customers_with_highest_sales.plot.bar(color="g", alpha=0.7)
plt.xlabel("Customer", fontsize=12)
plt.title("Customers with highest sales")
plt.ylabel("Sales", fontsize=12)

In [None]:
# Customers' purchasing behavior: the five customers who placed the most distinct orders.
# nunique() counts every order_id once per customer; count() would count rows, which
# overstates the number of orders whenever one order spans several rows.
# If order_id never repeats, both give the same result.
Customers_with_highest_N_orders = (
    Sales_data.groupby(["customer"])["order_id"]
    .nunique()
    .to_frame(name="Total N of Orders")
    .sort_values(by="Total N of Orders", ascending=False)
    .head(5)
)

In [None]:
# Bar chart of the order counts for the five most frequent customers.
Customers_with_highest_N_orders.plot.bar(color="skyblue", alpha=0.7)
plt.ylabel("N of orders", fontsize=12)
plt.xlabel("Customer", fontsize=12)
plt.title("Customers with highest N of orders")

### From the previous charts:
#### >We have customers with high sales volume and customers with a high number of orders

# 

### States with highest sales

In [None]:
# N of states
Sales_data["state"].nunique()

In [None]:
# The five states with the largest total sales, highest first.
sales_per_state = Sales_data.groupby(["state"])["sales"].sum()
States_with_highest_sales = (
    sales_per_state.to_frame(name="Total Sales")
    .sort_values(by="Total Sales", ascending=False)
    .head(5)
)

In [None]:
# Bar chart of total sales for the top five states.
States_with_highest_sales.plot.bar(color="g")
plt.ylabel("Sales", fontsize=12)
plt.xlabel("State", fontsize=12)
plt.title("states with the highest sales")

### California and New York have the highest sales, with total sales higher than $300,000 

# 

## Time series analysis 

In [None]:
# Total sales and total profit for every (Year, Month) pair.
# The two columns are selected with a list (double brackets): selecting several
# columns from a GroupBy with a bare tuple was deprecated and later removed from pandas.
Monthly_Sales_Profit = Sales_data.groupby(["Year", "Month"])[["sales", "profit"]].sum()

In [None]:
Monthly_Sales_Profit

In [None]:
# Heat map of the monthly totals, to tell the strongest months from the weakest.
monthly_totals_axes = sns.heatmap(Monthly_Sales_Profit)
monthly_totals_axes.set_title("Months with highest sales and profits")

In [None]:
# Number of distinct orders placed in every (Year, Month) pair.
# nunique() counts every order_id once per month; count() would count rows, which
# overstates the number of orders whenever one order spans several rows.
# If order_id never repeats, both give the same result.
Months_Highest_sales = (
    Sales_data.groupby(["Year", "Month"])["order_id"]
    .nunique()
    .to_frame(name="N of orders")
)

In [None]:
# Heat map of the monthly order counts.
monthly_orders_axes = sns.heatmap(Months_Highest_sales)
monthly_orders_axes.set_title("Months with highest N of orders")

### From the previous relations:
#### >Sales and profits are very high in the period between September and December. 
#### >Sales and profits are very low in the period between January and March.
#### >Sales and profits are moderate in the rest of the year. 

# 

### Sales and profits trend over months 

In [None]:
# Total sales and profit for every (Year, Month) pair, with the group keys
# turned back into ordinary columns so rows are numbered 0..n-1 in time order.
monthly_totals = Sales_data.groupby(["Year", "Month"])[["sales", "profit"]].sum()
Sales_profits_over_months = monthly_totals.reset_index()

In [None]:
# Line chart of monthly sales and profit in chronological order.
# Sales_profits_over_months has one row per (Year, Month), so its row index is the
# time axis. The ticks are relabelled with the actual "YYYY-MM" values so the
# "Year-Month" axis label matches what is shown, instead of bare row numbers.
month_positions = Sales_profits_over_months.index
month_labels = (
    Sales_profits_over_months["Year"].astype(int).astype(str)
    + "-"
    + Sales_profits_over_months["Month"].astype(int).astype(str).str.zfill(2)
)

plt.plot(month_positions, Sales_profits_over_months["sales"], label="Sales", marker='o', color='blue')
plt.plot(month_positions, Sales_profits_over_months["profit"], label="Profit", marker='o', color='green')
# Label every third month only, to keep the axis readable.
plt.xticks(list(month_positions[::3]), month_labels.iloc[::3].tolist(), rotation=45)
plt.title("Monthly Sales and Profit", fontsize=16)
plt.xlabel("Year-Month", fontsize=12)
plt.ylabel("Amount", fontsize=12)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of profit against sales, one point per row of the data.
plt.style.use("classic")
plt.scatter(x=Sales_data["sales"], y=Sales_data["profit"])
plt.xlabel("Sales", fontsize=12)
plt.title("Sales and Profits relationship", fontsize=16)
plt.ylabel("Profits", fontsize=12)

### As the sales increase, profits increase. 

# 

### Discovering other data correlations

In [None]:
# Pairwise correlations between the numeric columns only.
# Non-numeric columns (text, dates) are excluded explicitly: recent pandas versions
# raise on them instead of silently dropping them inside DataFrame.corr().
correlatios = Sales_data.select_dtypes(include='number').corr()

In [None]:
correlatios

In [None]:
sns.heatmap(correlatios)

# 

# Conclusion
### This project focused on analyzing sales data from a retail store to uncover trends and actionable insights. Here are the key takeaways:

#### Sales and Profit Trends:
###### >Identified the top-performing products and categories that contributed significantly to overall sales and profits.
###### >Analyzed seasonal trends and observed peak sales periods.

#### Customer Behavior:
###### >Gained insights into customer preferences and buying patterns.
###### >Highlighted key customer demographics contributing to revenue.

#### Operational Insights:
###### >Found inefficiencies, such as products with high sales but low profitability.
###### >Provided recommendations for inventory optimization and sales strategy refinement.

#### Visualization Impact:
###### >Utilized scatter plots, bar charts, and heatmaps to clearly illustrate patterns and correlations within the data.

### By cleaning and analyzing the data, this project has demonstrated the importance of data-driven decision-making in retail sales. These insights can be used to improve overall profitability.