# Data Science Assignment: eCommerce Transactions Dataset
## Overview
This notebook contains the complete solution for the tasks mentioned in the assignment, including:
1. Exploratory Data Analysis (EDA) and Business Insights.
2. Lookalike Model development to find similar customers.
3. Customer Segmentation using clustering techniques.

---
### Files Description:
- **Customers.csv**: Contains customer profile information.
- **Products.csv**: Contains product details.
- **Transactions.csv**: Contains transaction history.

---
### Deliverables:
- EDA report and insights.
- Lookalike recommendations for the first 20 customers.
- Customer Segmentation with clustering evaluation metrics and visualization.

---


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score

# Notebook-wide presentation settings: seaborn's white-grid theme for every
# figure, and no cap on how many DataFrame columns pandas will render.
sns.set(style='whitegrid')
pd.set_option('display.max_columns', None)

# Read the three source tables. The paths are relative, so the CSV files must
# sit in the notebook's working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


In [None]:
# Task 1: Exploratory Data Analysis (EDA)

# Display basic information about the datasets.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None".
for label, frame in (
    ("Customers", customers),
    ("Products", products),
    ("Transactions", transactions),
):
    print(f"{label} Dataset:")
    frame.info()
    print()

# Check for missing values (per-column null counts for each table)
print("Missing Values:")
print(customers.isnull().sum())
print(products.isnull().sum())
print(transactions.isnull().sum())

# Visualizations and insights
# 1. Distribution of Customers by Region
# `palette` without `hue` is deprecated in seaborn 0.13, so the x variable is
# also assigned to `hue`. dodge=False keeps full-width bars on older seaborn
# versions, and the redundant legend (if one was drawn) is removed.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=customers, x='Region', hue='Region', palette='viridis', dodge=False, ax=ax)
region_legend = ax.get_legend()
if region_legend is not None:
    region_legend.remove()
ax.set_title("Distribution of Customers by Region")
plt.show()

# 2. Top Products by Sales (ranked by total units sold)
top_product_quantities = (
    transactions.groupby('ProductID')['Quantity']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
# The inner merge attaches the product attributes while keeping the ranking
# order of the left-hand side.
top_products = top_product_quantities.reset_index().merge(products, on='ProductID')

# NOTE(review): bars are keyed by ProductName. If several ProductIDs share a
# name, seaborn averages them into a single bar - confirm names are unique.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=top_products, x='ProductName', y='Quantity', hue='ProductName', palette='mako', dodge=False, ax=ax)
product_legend = ax.get_legend()
if product_legend is not None:
    product_legend.remove()
ax.set_title("Top 10 Products by Sales")
plt.xticks(rotation=45, ha='right')
plt.show()

# Additional insights can be derived similarly.


In [None]:
# Task 2: Lookalike Model

# Combine transaction data with customer profiles. The inner join keeps only
# transactions whose CustomerID is present in the customers table.
customer_transactions = transactions.merge(customers, on='CustomerID')

# Create a customer x product matrix of TOTAL quantity purchased.
# pivot_table defaults to aggfunc='mean', which would store the average
# quantity per transaction instead of the customer's overall purchase volume.
customer_product_matrix = customer_transactions.pivot_table(
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    aggfunc='sum',
    fill_value=0,
)

# Calculate cosine similarity between customers (rows of the matrix)
similarity_matrix = pd.DataFrame(
    cosine_similarity(customer_product_matrix),
    index=customer_product_matrix.index,
    columns=customer_product_matrix.index,
)

# Get the top 3 lookalike customers for the first 20 customers
lookalike_results = {}
for customer in customers['CustomerID'].head(20):
    if customer not in similarity_matrix.columns:
        # The customer has no transactions, so there is no purchase vector to
        # compare; record an empty result instead of raising KeyError.
        lookalike_results[customer] = {}
        continue
    # Remove the customer's own entry by label rather than by position: with
    # ties at 1.0 (identical baskets) or an all-zero vector, the self-score is
    # not guaranteed to be sorted first.
    similar_customers = (
        similarity_matrix[customer]
        .drop(labels=customer)
        .sort_values(ascending=False)
        .head(3)
    )
    lookalike_results[customer] = similar_customers.to_dict()

# Convert results to a DataFrame: one row per customer, with a dict that maps
# each lookalike CustomerID to its similarity score.
lookalike_df = pd.DataFrame([{'CustomerID': cust, 'Lookalikes': lookalikes} for cust, lookalikes in lookalike_results.items()])

# Save results
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)


In [None]:
# Task 3: Customer Segmentation

# Preprocess data for clustering: total spend and total units per customer,
# joined onto the customer profiles. The inner join drops customers that have
# no transactions.
customer_totals = (
    transactions.groupby('CustomerID')
    .agg({'TotalValue': 'sum', 'Quantity': 'sum'})
    .reset_index()
)
customer_features = customers.merge(customer_totals, on='CustomerID')

# Select the clustering inputs by name. A positional slice such as
# iloc[:, 2:] depends on the column order of Customers.csv and can pull in
# string columns (e.g. Region), which StandardScaler cannot convert to float.
feature_columns = ['TotalValue', 'Quantity']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[feature_columns])

# Perform KMeans clustering.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the segmentation reproducible across versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)  # Choosing 4 clusters as an example
clusters = kmeans.fit_predict(scaled_features)

# Add cluster labels to the dataset
customer_features['Cluster'] = clusters

# Evaluate clustering using Davies-Bouldin Index (lower means better-separated clusters)
db_index = davies_bouldin_score(scaled_features, clusters)
print(f"Davies-Bouldin Index: {db_index}")

# Visualize clusters in the original (unscaled) feature space
plt.figure(figsize=(10, 6))
sns.scatterplot(data=customer_features, x='TotalValue', y='Quantity', hue='Cluster', palette='viridis', s=100)
plt.title("Customer Segments")
plt.show()
