In [1]:
import pandas as pd

In [2]:
# Load the raw transaction export. The file is decoded as Latin-1 instead of
# pandas' default UTF-8.
df = pd.read_csv("data/OnlineRetail.csv", encoding='latin1')

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [4]:
df.shape

(541909, 8)

In [5]:
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [6]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [7]:
df.iloc[0]

InvoiceNo                                  536365
StockCode                                  85123A
Description    WHITE HANGING HEART T-LIGHT HOLDER
Quantity                                        6
InvoiceDate                        12/1/2010 8:26
UnitPrice                                    2.55
CustomerID                                17850.0
Country                            United Kingdom
Name: 0, dtype: object

In [8]:
# Keep only rows that carry a CustomerID; rows without one cannot be attributed
# to any customer.
df = df[df['CustomerID'].notna()]

In [9]:
# Discard rows whose invoice number begins with 'C'
# (presumably cancellations -- confirm against the dataset description).
df = df[df['InvoiceNo'].astype(str).str[:1] != 'C']

In [10]:
# Keep only rows with a strictly positive quantity.
df = df[df['Quantity'].gt(0)]

In [11]:
# CustomerID was loaded as float (e.g. 17850.0); cast it to an integer id.
# `assign` returns a new frame, so this avoids the SettingWithCopyWarning that
# an in-place column assignment raises on a frame produced by boolean filtering.
df = df.assign(CustomerID=df['CustomerID'].astype(int))

In [12]:
df.shape

(397924, 8)

In [13]:
# Build the customer x product matrix: one row per CustomerID, one column per
# StockCode, each cell holding the total Quantity that customer bought.
# Customer/product pairs with no transactions come out as NaN.
user_item_matrix = pd.pivot_table(
    df,
    values='Quantity',
    index='CustomerID',
    columns='StockCode',
    aggfunc='sum',
)

In [14]:
# A missing cell means no purchases were recorded for that customer/product
# pair, so store it as quantity 0.
user_item_matrix = user_item_matrix.fillna(value=0)

In [15]:
user_item_matrix.shape

(4339, 3665)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the customer rows of the user-item matrix;
# entry (i, j) compares customer i's purchase vector with customer j's.
user_similarity = cosine_similarity(X=user_item_matrix)

In [17]:
# Label both axes of the similarity array with CustomerID so similarities can
# be looked up by customer id instead of by position.
# (pandas is already imported at the top of the notebook.)
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

In [18]:
def recommend_products(customer_id, num_recommendations=5, num_neighbors=10,
                       exclude_purchased=False):
    """Recommend products based on what the most similar customers bought.

    Parameters
    ----------
    customer_id
        Label of the customer in ``user_similarity_df`` / ``user_item_matrix``.
    num_recommendations : int, default 5
        Number of products to return.
    num_neighbors : int, default 10
        Number of most-similar customers whose purchases are averaged.
    exclude_purchased : bool, default False
        When True, products the customer has already bought (quantity > 0 in
        ``user_item_matrix``) are removed before ranking.

    Returns
    -------
    pandas.Series
        Indexed by product (the columns of ``user_item_matrix``); values are
        the mean quantity bought across the selected neighbours, sorted in
        descending order.

    Raises
    ------
    KeyError
        If ``customer_id`` is not present in ``user_similarity_df``.
    """
    # Remove the customer by label rather than slicing off position 0 after
    # sorting: a tie or floating-point rounding can rank another customer ahead
    # of the self-similarity entry, in which case a positional slice would
    # discard a real neighbour and keep the customer as their own neighbour.
    neighbour_similarity = (
        user_similarity_df[customer_id]
        .drop(labels=customer_id)
        .sort_values(ascending=False)
        .head(num_neighbors)
    )

    product_scores = user_item_matrix.loc[neighbour_similarity.index].mean()

    if exclude_purchased:
        already_bought = user_item_matrix.loc[customer_id] > 0
        product_scores = product_scores[~already_bought]

    return product_scores.sort_values(ascending=False).head(num_recommendations)

In [19]:
user_item_matrix.index[:5]

Index([12346, 12347, 12348, 12349, 12350], dtype='int32', name='CustomerID')

In [20]:
recommend_products(customer_id=12346)

StockCode
23166    33.2
23167    27.4
23165     8.8
22962     8.4
47566     6.2
dtype: float64

In [21]:
# StockCode -> Description lookup built from the distinct (code, description)
# pairs. If a code appears with several descriptions, the one encountered last
# wins, because later pairs overwrite earlier keys in the dict.
product_map = dict(
    df[['StockCode', 'Description']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)

In [29]:
def recommend_products_with_names(customer_id, num_recommendations=5):
    """Recommend products for a customer and attach their descriptions.

    Parameters
    ----------
    customer_id
        Label of the customer to recommend for.
    num_recommendations : int, default 5
        Number of products to return.

    Returns
    -------
    pandas.DataFrame
        One row per recommended product with columns ``StockCode``, ``Score``
        (the value returned by ``recommend_products``) and ``ProductName``
        (looked up in ``product_map``; NaN when the code has no entry).
    """
    # Delegate the ranking to recommend_products so the two entry points cannot
    # drift apart; this function only reshapes the result and adds names.
    product_scores = recommend_products(customer_id, num_recommendations)

    recommendations = (
        product_scores
        .rename_axis('StockCode')
        .reset_index(name='Score')
    )
    recommendations['ProductName'] = recommendations['StockCode'].map(product_map)
    return recommendations

In [31]:
recommend_products_with_names(customer_id=12346)

Unnamed: 0,StockCode,Score,ProductName
0,23166,33.2,MEDIUM CERAMIC TOP STORAGE JAR
1,23167,27.4,SMALL CERAMIC TOP STORAGE JAR
2,23165,8.8,LARGE CERAMIC TOP STORAGE JAR
3,22962,8.4,JAM JAR WITH PINK LID
4,47566,6.2,PARTY BUNTING
