In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
# 1. Load and Preprocess Data
df = pd.read_csv('Online Retail.csv', encoding='ISO-8859-1')

In [3]:
# Step 3: Basic Dataset Overview
print("Shape of dataset:", df.shape)
print("\nFirst 5 rows of data:")
display(df.head())

Shape of dataset: (541909, 8)

First 5 rows of data:


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 08:26,3.39,17850.0,United Kingdom


In [4]:
# Step 4: Dataset Information
print("\nDataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [5]:
# Step 5: Summary Statistics
print("\nSummary Statistics:")
display(df.describe(include='all'))


Summary Statistics:


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
count,541909.0,541909,540455,541909.0,541909,541909.0,406829.0,541909
unique,25900.0,4070,4223,,23260,,,38
top,573585.0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,,31-10-2011 14:41,,,United Kingdom
freq,1114.0,2313,2369,,1114,,,495478
mean,,,,9.55225,,4.611114,15287.69057,
std,,,,218.081158,,96.759853,1713.600303,
min,,,,-80995.0,,-11062.06,12346.0,
25%,,,,1.0,,1.25,13953.0,
50%,,,,3.0,,2.08,15152.0,
75%,,,,10.0,,4.13,16791.0,


In [6]:
# Step 6: Check for Duplicates
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")


Number of duplicate rows: 5268


In [7]:
df.drop_duplicates(inplace=True)

In [8]:
# Step 7: Check for Missing Values
print("\nMissing Values:")
display(df.isnull().sum())


Missing Values:


Unnamed: 0,0
InvoiceNo,0
StockCode,0
Description,1454
Quantity,0
InvoiceDate,0
UnitPrice,0
CustomerID,135037
Country,0


In [9]:
df.fillna(0, inplace=True)

In [10]:
# Verify that missing values have been handled
print("\nMissing Values after fillna:")
display(df.isnull().sum())


Missing Values after fillna:


Unnamed: 0,0
InvoiceNo,0
StockCode,0
Description,0
Quantity,0
InvoiceDate,0
UnitPrice,0
CustomerID,0
Country,0


In [11]:
# Drop missing customer IDs and remove negative quantities
df = df.dropna(subset=['CustomerID'])
df = df[df['Quantity'] > 0]

In [12]:
#Use a subset for speed during testing
# df = df[df['Country'] == 'United Kingdom']

# Create the user-item matrix
basket = df.pivot_table(index='CustomerID', columns='Description', values='Quantity', aggfunc='sum').fillna(0)

In [13]:
# Convert to sparse matrix for performance
user_sparse = csr_matrix(basket.values)
item_sparse = csr_matrix(basket.T.values)

In [14]:
# 2. User-User Collaborative Filtering
# ----------------------------------------------
user_similarity = cosine_similarity(user_sparse)
user_sim_df = pd.DataFrame(user_similarity, index=basket.index, columns=basket.index)

def recommend_user_based(user_id, n=5):
    if user_id not in user_sim_df:
        return "User not found."
    similar_users = user_sim_df[user_id].sort_values(ascending=False)[1:n+1]
    recommended_items = basket.loc[similar_users.index].mean().sort_values(ascending=False).head(n)
    return recommended_items

In [15]:
# 3. Item-Item Collaborative Filtering
# ----------------------------------------------
item_similarity = cosine_similarity(item_sparse)
item_sim_df = pd.DataFrame(item_similarity, index=basket.columns, columns=basket.columns)

def recommend_item_based(item_name, n=5):
    if item_name not in item_sim_df:
        return "Item not found."
    similar_items = item_sim_df[item_name].sort_values(ascending=False)[1:n+1]
    return similar_items

In [16]:
def recommend_user_based(user_id, n=5):
    if user_id not in user_sim_df.index: # Check if user_id is in the DataFrame index
        return "User not found."
    # Use iloc for position-based slicing to get the top n similar users (excluding self)
    similar_users = user_sim_df[user_id].sort_values(ascending=False).iloc[1:n+1]
    # The rest of the logic remains the same
    recommended_items = basket.loc[similar_users.index].mean().sort_values(ascending=False).head(n)
    return recommended_items

print("\n--- User-Based Recommendations for Customer 17850 ---")
# The customer ID in the original data is an integer, but after dropna,
# the index of the basket DataFrame becomes a float.
# So, pass the customer ID as a float to match the index type.
print(recommend_user_based(17850.0))

print("\n--- Item-Based Recommendations for 'HAND WARMER RED POLKA DOT' ---")
print(recommend_item_based('HAND WARMER RED POLKA DOT'))


--- User-Based Recommendations for Customer 17850 ---
Description
WHITE HANGING HEART T-LIGHT HOLDER    114.8
WOODEN PICTURE FRAME WHITE FINISH      31.2
WOODEN FRAME ANTIQUE WHITE             31.2
RED HANGING HEART T-LIGHT HOLDER       20.0
T-LIGHT HOLDER SWEETHEART HANGING      11.2
dtype: float64

--- Item-Based Recommendations for 'HAND WARMER RED POLKA DOT' ---
Description
IVORY EMBROIDERED QUILT              0.814799
GLASS STAR FROSTED T-LIGHT HOLDER    0.797256
VINTAGE BILLBOARD DRINK ME MUG       0.442072
WHITE METAL LANTERN                  0.275877
WOOD S/3 CABINET ANT WHITE FINISH    0.253403
Name: HAND WARMER RED POLKA DOT, dtype: float64
