In [None]:
import pandas as pd

In [None]:
# Load the customer segmentation dataset from the notebook's working directory.
df = pd.read_csv('Customer_Segmentation_py.csv')

# Preview the first rows only; a bare `df` asks the notebook to render the
# whole frame, which is noisy and bloats the saved output.
df.head()

In [None]:
# Q1: How are customers spread across the Age_Group categories?

# Count the rows carrying each Age_Group value, then order by group label
# (value_counts alone would order by frequency).
age_group_counts = df['Age_Group'].value_counts()
age_group_counts = age_group_counts.sort_index()

print("Q1: Distribution of customers across Age Groups", age_group_counts, sep="\n")

In [None]:
# Q2: How does Customer_Gender break down by country and by state?

# Both tables use the same column variable, so pull it out once.
gender = df['Customer_Gender']

# Rows are locations, columns are gender values, cells are row counts.
gender_country = pd.crosstab(index=df['Country'], columns=gender)
gender_state = pd.crosstab(index=df['State'], columns=gender)

print("Q2: Customer Gender Variation by Country", gender_country, sep="\n")
print("\nQ2: Customer Gender Variation by State", gender_state, sep="\n")

In [None]:
# Q3: Which Product_Category and Sub_Category move the most units (Order_Quantity)?

# Total units ordered per category, largest total first.
category_popularity = (
    df.groupby('Product_Category')['Order_Quantity']
    .sum()
    .sort_values(ascending=False)
)

# Same ranking at sub-category level.
subcategory_popularity = (
    df.groupby('Sub_Category')['Order_Quantity']
    .sum()
    .sort_values(ascending=False)
)

# The first entry of each descending ranking is the best seller.
print("Q3: Most Popular Product_Category")
print(category_popularity.iloc[:1])
print("\nQ3: Most Popular Sub_Category")
print(subcategory_popularity.iloc[:1])

In [None]:
# Q4: Segment customers using K-Means clustering

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# The frame loaded at the top of the notebook is reused here; re-reading the
# CSV only repeated the disk I/O.

# Select features for clustering
cluster_features = ['Order_Quantity', 'Profit', 'Revenue']
X = df[cluster_features]

# Standardise the features so that columns with larger magnitudes do not
# dominate the distance computation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-Means clustering (3 clusters as example, not a tuned value).
# n_init is pinned explicitly: its default differs between scikit-learn
# releases (10 vs. 'auto'), so leaving it unset makes the segmentation depend
# on the installed version and triggers a FutureWarning on some of them.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Store the label of each row back on the frame; the Q5 cell groups on it.
df['Cluster'] = kmeans.fit_predict(X_scaled)

print("Q4: Cluster value counts")
print(df['Cluster'].value_counts())

In [None]:
# Q5: What does an average row look like in each cluster?

# Per-cluster mean of the three columns that were used as clustering features.
cluster_metrics = ['Order_Quantity', 'Profit', 'Revenue']
cluster_characteristics = df.groupby('Cluster')[cluster_metrics].agg('mean')

print("Q5: Average metrics per cluster", cluster_characteristics, sep="\n")

In [None]:
# Q6: Which Product_Category or Sub_Category generates the highest Profit and Revenue

# Total Profit and Revenue per group, ordered by Profit (largest first).
category_profit = df.groupby('Product_Category')[['Profit', 'Revenue']].sum().sort_values(by='Profit', ascending=False)
subcategory_profit = df.groupby('Sub_Category')[['Profit', 'Revenue']].sum().sort_values(by='Profit', ascending=False)

# The tables are ordered by Profit only, so their first row answers the Profit
# question but is not guaranteed to be the Revenue leader as well. Look up the
# top row for each metric separately instead of relying on head(1).
print("Q6: Product_Category with Highest Profit and Revenue")
for metric in ('Profit', 'Revenue'):
    print(f"Top by {metric}:")
    print(category_profit.nlargest(1, metric))

print("\nQ6: Sub_Category with Highest Profit and Revenue")
for metric in ('Profit', 'Revenue'):
    print(f"Top by {metric}:")
    print(subcategory_profit.nlargest(1, metric))

In [None]:
# Q7: How do Unit_Cost, Unit_Price and Profit relate to one another?

# Pairwise correlation between the three columns (pandas' default method).
pricing_columns = ['Unit_Cost', 'Unit_Price', 'Profit']
correlation_matrix = df[pricing_columns].corr()

print("Q7: Correlation between Unit_Cost, Unit_Price, and Profit", correlation_matrix, sep="\n")

In [None]:
# Q8: Which Age_Group contributes most to Revenue and Profit

# Total Revenue and Profit per age group, ordered by Revenue (largest first).
age_group_contrib = df.groupby('Age_Group')[['Revenue', 'Profit']].sum().sort_values(by='Revenue', ascending=False)

# The table is ordered by Revenue only, so its first row is not guaranteed to
# be the Profit leader too. Report the top group for each metric separately.
print("Q8: Age Group Contribution to Revenue and Profit")
for metric in ('Revenue', 'Profit'):
    print(f"Top by {metric}:")
    print(age_group_contrib.nlargest(1, metric))

In [None]:
# Q9: Does spending (Revenue) differ between Customer_Gender groups?

# Mean Revenue per row within each gender group.
revenue_by_gender = df.groupby('Customer_Gender')['Revenue']
gender_revenue = revenue_by_gender.agg('mean')

print("Q9: Average Revenue by Customer Gender", gender_revenue, sep="\n")

In [None]:
# Q10: Analyze the monthly or yearly trends in Revenue and Profit

import calendar

# Total Revenue and Profit per calendar year.
yearly_trends = df.groupby('Year')[['Revenue', 'Profit']].sum()

# Total Revenue and Profit per month label. Grouping on Month alone pools the
# same month from every year, so this is a seasonal profile, not a timeline.
monthly_trends = df.groupby('Month')[['Revenue', 'Profit']].sum()

# groupby sorts its keys, so text month labels come back alphabetically
# (April, August, ...), which scrambles the trend. When every label is an
# English month name (full or abbreviated), put the rows in calendar order.
# Numeric month labels do not match and are left in their sorted order.
for month_labels in (list(calendar.month_name)[1:], list(calendar.month_abbr)[1:]):
    if set(monthly_trends.index) <= set(month_labels):
        monthly_trends = monthly_trends.reindex(
            [label for label in month_labels if label in monthly_trends.index]
        )
        break

print("Q10: Yearly Trends in Revenue and Profit")
print(yearly_trends)
print("\nQ10: Monthly Trends in Revenue and Profit")
print(monthly_trends)

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Features and target
X = df[['Customer_Age', 'Unit_Cost', 'Unit_Price', 'Order_Quantity', 'Revenue']]
y = df['Profit']

# Train-test split: 20% of the rows are held out, with a fixed seed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on test set
y_pred = lr.predict(X_test)

# Report hold-out quality before the model is used for ad-hoc predictions;
# previously y_pred was computed and then discarded, so nothing indicated how
# far the predictions below can be trusted.
print(f"Test R^2: {r2_score(y_test, y_pred):.4f}")
print(f"Test MAE: {mean_absolute_error(y_test, y_pred):.2f}")


# NOTE: input() waits for keyboard entry, so this part of the cell blocks an
# unattended "Run All"; a non-numeric entry raises ValueError from float().
print("\n--- Predict Profit Based on Your Input ---")
cust_age = float(input("Enter Customer Age: "))
unit_cost = float(input("Enter Unit Cost: "))
unit_price = float(input("Enter Unit Price: "))
order_qty = float(input("Enter Order Quantity: "))
revenue = float(input("Enter Revenue: "))

# prediction: build a one-row frame with the training column names and order
# so the model receives the features exactly as it was fitted.
user_input = pd.DataFrame([[cust_age, unit_cost, unit_price, order_qty, revenue]],
                          columns=X.columns)
predicted_profit = lr.predict(user_input)[0]

print(f"\nPredicted Profit: {predicted_profit:.2f}")


In [None]:
# Q13: Visualize the customer distribution by Country and State

import matplotlib.pyplot as plt

# The frame already loaded by the notebook is reused. Re-reading the CSV here
# rebound `df` to a fresh frame and silently discarded the 'Cluster' column
# added by the Q4 cell.

# Country distribution: number of rows per Country value.
country_counts = df['Country'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
country_counts.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
ax.set_title("Customer Distribution by Country")
ax.set_xlabel("Country")
ax.set_ylabel("Number of Customers")
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


# State distribution: number of rows per State value. The figure is wider and
# the labels steeper than in the country chart.
state_counts = df['State'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
state_counts.plot(kind='bar', ax=ax, color='orange', edgecolor='black')
ax.set_title(" Customer distribution by state")
ax.set_xlabel("State")
ax.set_ylabel("Number of Customers")
ax.tick_params(axis='x', labelrotation=75)
fig.tight_layout()
plt.show()

In [None]:
# Q14: Bar charts of total Revenue and total Profit per Product_Category

# Totals per category, ordered by Revenue (largest first); both charts share
# this ordering.
category_revenue_profit = (
    df.groupby('Product_Category')[['Revenue', 'Profit']]
    .sum()
    .sort_values(by='Revenue', ascending=False)
)

# One figure per metric: Revenue in green first, then Profit in purple.
for metric, bar_color in (('Revenue', 'green'), ('Profit', 'purple')):
    fig, ax = plt.subplots(figsize=(8, 5))
    category_revenue_profit[metric].plot(kind='bar', ax=ax, color=bar_color, edgecolor='black')
    ax.set_title(f"Q14: {metric} by Product Category")
    ax.set_xlabel("Product Category")
    ax.set_ylabel(metric)
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()