In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
data = pd.read_csv('/content/shopping_trends_updated(in) (1).csv')


In [6]:
print("Dataset Information:")
data.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   int64  
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3900 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used    

In [7]:
data.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [32]:
print(data.isnull().sum())


Customer ID               0
Age                       0
Gender                    0
Item Purchased            0
Category                  0
Purchase Amount (USD)     0
Location                  0
Size                      0
Color                     0
Season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
Age Group                 0
dtype: int64


In [None]:
data['Purchase Amount (USD)'] = data['Purchase Amount (USD)'].fillna(data['Purchase Amount (USD)'].median())

data = data.dropna(subset=['Age', 'Gender'])

print("Dataset after Handling Missing Values:")
data.info()


In [10]:
data['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [11]:
data['Category'].unique()


array(['Clothing', 'Footwear', 'Outerwear', 'Accessories'], dtype=object)

In [34]:
data['Location'].unique()


array(['Kentucky', 'Maine', 'Massachusetts', 'Rhode Island', 'Oregon',
       'Wyoming', 'Montana', 'Louisiana', 'West Virginia', 'Missouri',
       'Arkansas', 'Hawaii', 'Delaware', 'New Hampshire', 'New York',
       'Alabama', 'Mississippi', 'North Carolina', 'California',
       'Oklahoma', 'Florida', 'Texas', 'Nevada', 'Kansas', 'Colorado',
       'North Dakota', 'Illinois', 'Indiana', 'Arizona', 'Alaska',
       'Tennessee', 'Ohio', 'New Jersey', 'Maryland', 'Vermont',
       'New Mexico', 'South Carolina', 'Idaho', 'Pennsylvania',
       'Connecticut', 'Utah', 'Virginia', 'Georgia', 'Nebraska', 'Iowa',
       'South Dakota', 'Minnesota', 'Washington', 'Wisconsin', 'Michigan'],
      dtype=object)

In [13]:
duplicates = data.duplicated().sum()
print(f"\nNumber of Duplicate Rows: {duplicates}")

data = data.drop_duplicates()



Number of Duplicate Rows: 0


In [None]:
data['Shipping Type'].value_counts()


In [None]:
data['Payment Method'].value_counts()


In [None]:
data['Review Rating'].describe()


In [None]:
data['Size'].describe()

In [None]:
data['Season'].value_counts()


In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(data['Purchase Amount (USD)'], kde=True, color='blue', bins=20)
plt.title("Distribution of Purchase Amounts", fontsize=14)
plt.xlabel("Purchase Amount (USD)")
plt.ylabel("Frequency")
plt.show()


In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(data=data, x='Category', y='Purchase Amount (USD)', ci=None, palette="coolwarm")
plt.title("Average Purchase Amount by Category", fontsize=14)
plt.xlabel("Category")
plt.ylabel("Average Purchase Amount (USD)")
plt.xticks(rotation=45)
plt.show()


In [None]:
category_revenue = data.groupby('Category')['Purchase Amount (USD)'].sum().sort_values(ascending=False).reset_index()

print("Category with Highest Revenue:")
print(category_revenue.head(1))

plt.figure(figsize=(10, 6))
sns.barplot(x='Category', y='Purchase Amount (USD)', data=category_revenue, palette="muted")
plt.title("Revenue by Product Category")
plt.xlabel("Category")
plt.ylabel("Total Revenue (USD)")
plt.xticks(rotation=45)
plt.show()


In [None]:
seasonal_trends = data.groupby('Season')['Purchase Amount (USD)'].mean().reset_index()

print("Average Purchase Amount by Season:")
print(seasonal_trends)

plt.figure(figsize=(8, 6))
sns.lineplot(x='Season', y='Purchase Amount (USD)', data=seasonal_trends, marker='o', color='teal')
plt.title("Seasonal Trends in Purchases")
plt.xlabel("Season")
plt.ylabel("Average Purchase Amount (USD)")
plt.grid(True)
plt.show()


In [None]:
bins = [0, 18, 30, 45, 60, 100]
labels = ['<18', '18-30', '31-45', '46-60', '60+']
data['Age Group'] = pd.cut(data['Age'], bins=bins, labels=labels)

age_group_sales = data.groupby('Age Group')['Purchase Amount (USD)'].sum().reset_index()

print("Sales by Age Group:")
print(age_group_sales)

plt.figure(figsize=(8, 6))
sns.barplot(x='Age Group', y='Purchase Amount (USD)', data=age_group_sales, palette="grey")
plt.title("Total Sales by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Total Sales (USD)")
plt.show()


In [None]:
payment_method_counts = data['Payment Method'].value_counts().reset_index()
payment_method_counts.columns = ['Payment Method', 'Count']

print("Most Preferred Payment Method:")
print(payment_method_counts.head(1))

plt.figure(figsize=(8, 6))
sns.barplot(x='Payment Method', y='Count', data=payment_method_counts, palette="cubehelix")
plt.title("Preferred Payment Methods")
plt.xlabel("Payment Method")
plt.ylabel("Number of Purchases")
plt.xticks(rotation=45)
plt.show()


**Who made the highest number of purchases, and how much did they spend in total?**


In [35]:
top_customer = data.groupby('Customer ID')['Purchase Amount (USD)'].agg(['count', 'sum']).sort_values(by='sum', ascending=False).reset_index()

print(f"The customer with ID {top_customer.iloc[0]['Customer ID']} made the highest purchases, with a total spend of ${top_customer.iloc[0]['sum']:.2f} across {top_customer.iloc[0]['count']} purchases.")


The customer with ID 2843 made the highest purchases, with a total spend of $100.00 across 1 purchases.


**Which product category has the lowest sales, and why?**



In [36]:
category_sales = data.groupby('Category')['Purchase Amount (USD)'].sum().sort_values()

lowest_category = category_sales.index[0]
lowest_sales = category_sales.iloc[0]
print(f"The lowest-performing category is '{lowest_category}' with total sales of ${lowest_sales:.2f}.")


The lowest-performing category is 'Outerwear' with total sales of $18524.00.


 **What is the average purchase amount by gender?**

In [37]:
avg_purchase_by_gender = data.groupby('Gender')['Purchase Amount (USD)'].mean()

for gender, avg_purchase in avg_purchase_by_gender.items():
    print(f"The average purchase amount for {gender} customers is ${avg_purchase:.2f}.")


The average purchase amount for Female customers is $60.25.
The average purchase amount for Male customers is $59.54.


**Are there any outliers in purchase amounts?**

In [38]:
q1 = data['Purchase Amount (USD)'].quantile(0.25)
q3 = data['Purchase Amount (USD)'].quantile(0.75)
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = data[(data['Purchase Amount (USD)'] < lower_bound) | (data['Purchase Amount (USD)'] > upper_bound)]

print(f"There are {len(outliers)} outliers in the dataset. Most extreme outlier: ${outliers['Purchase Amount (USD)'].max():.2f}.")


There are 0 outliers in the dataset. Most extreme outlier: $nan.


**How does the average purchase amount vary across different regions?**

In [None]:
avg_purchase_by_region = data.groupby('Location')['Purchase Amount (USD)'].mean()

for region, avg in avg_purchase_by_region.items():
    print(f"Location: {region}, Average Purchase: ${avg:.2f}")
