In [None]:
'''
Project Overview [Walmart: E-commerce Returns Customer Segmentation Model]:

Goals: To develop a predictive approach to understanding customer return behaviors across different time periods,
and to leverage transaction data to create a comprehensive view of customer return likelihood.

'''

In [2]:
# Importing necessary libraries for analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the customer returns dataset from CSV.
# NOTE(review): this is an absolute local path; the notebook will only run on a
# machine with the same directory layout. Consider a configurable data directory.
returns_csv_path = r'D:\Data Journey\Python-Summer-Party\DataSets\customer_returns.csv'
customer_returns = pd.read_csv(returns_csv_path)

# Quick inspection: first 10 rows, the column names, and the (rows, columns) shape.
preview_rows = customer_returns.head(10)
column_names = customer_returns.columns.tolist()
print(preview_rows)
print(column_names)
print('Number of rows and columns is:', customer_returns.shape)

  order_id  order_date customer_id  return_flag  order_amount
0  ORD0001    7/5/2024     CUST001         True         120.5
1  ORD0002   7/10/2024     CUST002        False          75.0
2  ORD0003   8/15/2024     CUST001         True          90.0
3  ORD0004    9/1/2024     CUST003        False          45.0
4  ORD0005  10/20/2024     CUST004         True         200.0
5  ORD0006  11/11/2024     CUST002         True           NaN
6  ORD0007  11/15/2024     CUST005        False          60.0
7  ORD0008   12/5/2024     CUST006         True         150.0
8  ORD0009  12/25/2024     CUST007        False          85.0
9  ORD0010   1/10/2025     CUST001         True         130.0
['order_id', 'order_date', 'customer_id', 'return_flag', 'order_amount']
Number of rows and columns is: (58, 5)


In [13]:
# Question One
# Unique customer IDs who have made returns between July 1st 2024 and June 30th 2025

# 0. Parse order_date into datetimes. Values that cannot be parsed become NaT
#    (errors='coerce'); those rows are then dropped so the date filter below
#    only ever compares valid dates.
customer_returns['order_date'] = pd.to_datetime(customer_returns['order_date'], errors='coerce')
customer_returns = customer_returns.dropna(subset=['order_date'])

# 1. Keep returned orders dated July 1st 2024 through June 30th 2025.
#    The window is half-open [start, end): a bound of `<= '2025-06-30'` is
#    compared against midnight, so an order stamped later on June 30th would be
#    left out. Using `< '2025-07-01'` covers the whole final day; for date-only
#    values the selection is the same as before.
period_start = '2024-07-01'          # inclusive
period_end_exclusive = '2025-07-01'  # first day after the reporting window
in_period = (
    (customer_returns['order_date'] >= period_start)
    & (customer_returns['order_date'] < period_end_exclusive)
)
is_return = customer_returns['return_flag'] == True
July2024_June2025_trxn = customer_returns[in_period & is_return]

# 2. Get unique customer IDs (in order of first appearance)
unique_customer_ids = July2024_June2025_trxn['customer_id'].unique()

print("Unique customer IDs who made returns:")
print(unique_customer_ids)

Unique customer IDs who made returns:
['CUST001' 'CUST004' 'CUST002' 'CUST006' 'CUST009' 'CUST005' 'CUST003'
 'CUST007']


In [None]:
# Question Two
# Count returned orders per customer per calendar month, using a
# (customer_id, order_date) MultiIndex.

# Ensure order_date is datetime so this cell also works when run on its own;
# unparseable values become NaT.
customer_returns['order_date'] = pd.to_datetime(customer_returns['order_date'], errors='coerce')

# Index the orders by customer and order date.
customer_returns_multi = customer_returns.set_index(['customer_id', 'order_date'])

# Derive the month number from the 'order_date' level of the index.
# NOTE(review): .month is the calendar month (1-12) only, so the same month in
# different years is counted together - confirm this is the intended grouping.
order_dates = customer_returns_multi.index.get_level_values('order_date')
customer_returns_multi['month'] = order_dates.month

# Restrict to orders that were returned.
returned_mask = customer_returns_multi['return_flag'] == True
returns_only = customer_returns_multi.loc[returned_mask]

# Number of returns for each (customer_id, month) pair, as a flat table.
return_counts = returns_only.groupby(['customer_id', 'month']).size()
grouped = return_counts.reset_index(name='return_count')

print(grouped.head())


  customer_id  month  return_count
0     CUST001      1             2
1     CUST001      3             1
2     CUST001      6             1
3     CUST001      7             2
4     CUST001      8             1
