# Retail Connect 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the retail dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv('retailDatasetFin.csv')

Loading Dataset

data.head(2)

In [4]:
# Preview the first rows of the raw data as plain text.
print(df.head())

  Invoice StockCode                          Description  Quantity  \
0  536365     71053                  WHITE METAL LANTERN         6   
1  536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
2  536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
3  536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   
4  536365     22752         SET 7 BABUSHKA NESTING BOXES         2   

        InvoiceDate  Price  Customer ID         Country  
0  12-01-2010 08:26   3.39      17850.0  United Kingdom  
1  12-01-2010 08:26   2.75      17850.0  United Kingdom  
2  12-01-2010 08:26   3.39      17850.0  United Kingdom  
3  12-01-2010 08:26   3.39      17850.0  United Kingdom  
4  12-01-2010 08:26   7.65      17850.0  United Kingdom  


### Data Processing

In [5]:
# Keep only rows with a strictly positive Quantity; rows with zero or negative quantity are dropped.
df = df.loc[df['Quantity']>0]

In [6]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26829 entries, 0 to 27688
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Invoice      26829 non-null  object 
 1   StockCode    26829 non-null  object 
 2   Description  26783 non-null  object 
 3   Quantity     26829 non-null  int64  
 4   InvoiceDate  26829 non-null  object 
 5   Price        26829 non-null  float64
 6   Customer ID  21545 non-null  float64
 7   Country      26829 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.8+ MB


In [7]:
# Map the raw column names to the names used in the rest of the notebook.
# 'UnitPrice' is written without trailing whitespace so the column can be
# addressed by its plain name.
new_column_names = {
    'Invoice': 'InvoiceNo',
    'Price': 'UnitPrice',
    'Customer ID': 'CustomerID',
}
# rename() returns a new frame; reassigning avoids mutating the frame in place.
df = df.rename(columns=new_column_names)

##### Handling NaN Customer IDs


In [8]:
# Show how many rows lack a CustomerID before removing them: a row without a
# customer cannot be attributed to any row of the customer-item matrix.
missing_customer_rows = df['CustomerID'].isna().sum()
print(f"Rows without CustomerID: {missing_customer_rows}")
df = df.dropna(subset=['CustomerID'])

### Creating Customer-Item matrix

In [9]:
# Build a customers x items table: one row per CustomerID, one column per
# StockCode, each cell holding the summed Quantity for that pair
# (NaN where the customer has no rows for the item).
customer_item_matrix = pd.pivot_table(
    df,
    values='Quantity',
    index='CustomerID',
    columns='StockCode',
    aggfunc='sum',
)
# Peek at the first rows from customer label 12481 onwards.
customer_item_matrix.loc[12481:].head()

StockCode,10002,10080,10120,10123C,10124G,10125,10133,10135,11001,15034,...,90210B,90214J,90214M,90214S,90214V,BANK CHARGES,C2,DOT,M,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12481.0,,,,,,,,,,,...,,,,,,,,,,8.0
12492.0,,,,,,,,,,,...,,,,,,,,,,1.0
12498.0,,,,,,,,,,,...,,,,,,,,,,2.0
12500.0,,,,,,,,,,,...,,,,,,,,,,2.0
12518.0,,,,,,,,,,,...,,,,,,,,,,2.0


In [10]:
print(customer_item_matrix.shape)
# Binarize purchases: 1 where the summed quantity is positive, 0 otherwise.
# `NaN > 0` evaluates to False, so never-purchased cells become 0 as well.
# A vectorized comparison replaces the per-element lambda and yields the same
# int64 matrix.
customer_item_matrix = (customer_item_matrix > 0).astype('int64')

(742, 2517)


### Collaborative Filtering

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# temp = df.groupby('CustomerID').count()['Quantity'] > 20 ## Selecting only Those has   
# temp[temp]

### User Based CF

In [13]:
# Pairwise cosine similarity between the customer rows of the binary purchase
# matrix. The result is wrapped without labels, so both axes carry default
# positional labels (0..n-1) at this point.
user_similarity_values = cosine_similarity(customer_item_matrix)
user_user_sim_matrix = pd.DataFrame(user_similarity_values)
user_user_sim_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,732,733,734,735,736,737,738,739,740,741
0,1.0,0.0,0.0381,0.0,0.040825,0.028868,0.244444,0.074278,0.020412,0.095258,...,0.0,0.060302,0.176505,0.031623,0.16641,0.031623,0.0,0.089443,0.0,0.0
1,0.0,1.0,0.065583,0.258199,0.105409,0.074536,0.086066,0.0,0.052705,0.105409,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0381,0.065583,1.0,0.0,0.0,0.0,0.028222,0.023583,0.0,0.034565,...,0.0,0.0,0.049814,0.040161,0.0,0.040161,0.0,0.0,0.0,0.061806
3,0.0,0.258199,0.0,1.0,0.408248,0.288675,0.0,0.0,0.204124,0.136083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.040825,0.105409,0.0,0.408248,1.0,0.117851,0.045361,0.07581,0.083333,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Label both axes with the customer IDs, in the same order as the rows of
# customer_item_matrix from which the similarities were computed.
# The index is assigned directly: inserting a temporary 'CustomerID' column
# would mix a string label into the numeric column labels.
user_user_sim_matrix.columns = customer_item_matrix.index
user_user_sim_matrix.index = customer_item_matrix.index
user_user_sim_matrix.head()

CustomerID,12359.0,12362.0,12371.0,12381.0,12394.0,12395.0,12415.0,12420.0,12427.0,12431.0,...,18180.0,18221.0,18223.0,18228.0,18229.0,18237.0,18239.0,18241.0,18277.0,18287.0
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12359.0,1.0,0.0,0.0381,0.0,0.040825,0.028868,0.244444,0.074278,0.020412,0.095258,...,0.0,0.060302,0.176505,0.031623,0.16641,0.031623,0.0,0.089443,0.0,0.0
12362.0,0.0,1.0,0.065583,0.258199,0.105409,0.074536,0.086066,0.0,0.052705,0.105409,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12371.0,0.0381,0.065583,1.0,0.0,0.0,0.0,0.028222,0.023583,0.0,0.034565,...,0.0,0.0,0.049814,0.040161,0.0,0.040161,0.0,0.0,0.0,0.061806
12381.0,0.0,0.258199,0.0,1.0,0.408248,0.288675,0.0,0.0,0.204124,0.136083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12394.0,0.040825,0.105409,0.0,0.408248,1.0,0.117851,0.045361,0.07581,0.083333,0.111111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Ten highest similarity scores for customer 12362.0, i.e. the customers with the most
# similar purchase behaviour/preferences. The customer itself ranks first with similarity 1.0.
user_user_sim_matrix.loc[12362.0].sort_values(ascending=False).head(10)

CustomerID
12362.0    1.000000
12948.0    0.322749
13816.0    0.313112
14451.0    0.301297
17295.0    0.287678
17920.0    0.281091
12700.0    0.273690
12633.0    0.266667
12583.0    0.258199
12381.0    0.258199
Name: 12362.0, dtype: float64

### Making Recommendations (User-Based)

In [16]:
# Print the binary customer-item matrix as text (pandas abbreviates large frames with "...").
print(customer_item_matrix)

StockCode   10002  10080  10120  10123C  10124G  10125  10133  10135  11001  \
CustomerID                                                                    
12359.0         0      0      0       0       0      0      0      0      0   
12362.0         0      0      0       0       0      0      0      0      0   
12371.0         0      0      0       0       0      0      0      0      0   
12381.0         0      0      0       0       0      0      0      0      0   
12394.0         0      0      0       0       0      0      0      0      0   
...           ...    ...    ...     ...     ...    ...    ...    ...    ...   
18237.0         0      0      0       0       0      0      0      0      0   
18239.0         0      0      0       0       0      0      0      0      0   
18241.0         0      0      0       0       0      0      0      0      0   
18277.0         0      0      0       0       0      0      0      0      0   
18287.0         0      0      0       0       0     

In [17]:
# Customer A (12415.0): keep only the StockCodes whose entry in A's row of the
# binary customer-item matrix is positive, i.e. the items A has bought.
purchases_of_A = customer_item_matrix.loc[12415.0]
items_bought_by_A = purchases_of_A[purchases_of_A > 0]
print("Items Bought by A: ")
print(items_bought_by_A)

Items Bought by A: 
StockCode
20711    1
20727    1
20978    1
20979    1
21115    1
        ..
23534    1
23535    1
23536    1
23541    1
23542    1
Name: 12415.0, Length: 81, dtype: int64


In [18]:
# Customer B (12362.0): the items B has already bought.
purchases_of_B = customer_item_matrix.loc[12362.0]
items_bought_by_B = purchases_of_B[purchases_of_B > 0]
print("Items bought by B:")
print(items_bought_by_B)

print()

# Candidate recommendations for B: stock codes present in items_bought_by_A
# (customer A, ID 12415.0) that B has not bought yet.
codes_bought_by_A = set(items_bought_by_A.index)
codes_bought_by_B = set(items_bought_by_B.index)
items_to_recommend_to_B = codes_bought_by_A - codes_bought_by_B
print("Items to Recommend to B ")
print(items_to_recommend_to_B)

# Look up the distinct descriptions recorded for the recommended stock codes.
is_recommended = df['StockCode'].isin(items_to_recommend_to_B)
df.loc[is_recommended, ['StockCode', 'Description']].drop_duplicates().set_index('StockCode')

Items bought by B:
StockCode
20725    1
21561    1
22326    1
22328    1
22352    1
22382    1
22383    1
22629    1
22630    1
22631    1
22659    1
22662    1
23208    1
23209    1
POST     1
Name: 12362.0, dtype: int64

Items to Recommend to B 
{'22969', '22991', '23518', '23014', '22620', '23533', '22907', '23013', '22698', '22546', '22423', '20978', '23175', '22720', '23342', '23541', '22726', '23236', '22549', '23513', '22722', '23245', '22539', '23535', '23308', '23370', '22386', '22544', '23524', '23534', '23206', '21115', '23505', '23372', '23507', '23113', '23510', '23526', '23108', '22727', '22730', '21539', '23519', '22992', '23542', '22699', '21770', '23523', '22191', '23525', '23389', '23527', '22192', '20979', '23012', '22193', '23515', '23390', '23516', '21791', '23371', '23341', '23536', '22725', '22029', '22978', '22892', '23388', '23114', '22090', '20727', '21843', '22327', '22728', '23392', '23512', '23502', '20711'}


Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
22728,ALARM CLOCK BAKELIKE PINK
22727,ALARM CLOCK BAKELIKE RED
22726,ALARM CLOCK BAKELIKE GREEN
21791,VINTAGE HEADS AND TAILS CARD GAME
22544,MINI JIGSAW SPACEBOY
...,...
23524,WALL ART HORSE & PONY
23527,WALL ART ANIMALS AND NATURE
23526,WALL ART DOG LICENCE
23525,WALL ART BUFFALO BILL


# 

## Item-Based Collaborative Filtering

In [19]:
# Item-item model: cosine similarity between the item columns of the binary
# customer-item matrix, labelled with StockCode on both axes.
item_customer_matrix = customer_item_matrix.T
item_item_sim_matrix = pd.DataFrame(
    cosine_similarity(item_customer_matrix),
    index=item_customer_matrix.index,
    columns=item_customer_matrix.index,
)

In [20]:
# Print the item-item similarity matrix as text (pandas abbreviates large frames with "...").
print(item_item_sim_matrix)

StockCode        10002  10080     10120    10123C  10124G  10125  10133  \
StockCode                                                                 
10002         1.000000    0.0  0.353553  0.500000     0.0    0.0    0.0   
10080         0.000000    1.0  0.000000  0.000000     0.0    0.0    0.0   
10120         0.353553    0.0  1.000000  0.707107     0.0    0.0    0.0   
10123C        0.500000    0.0  0.707107  1.000000     0.0    0.0    0.0   
10124G        0.000000    0.0  0.000000  0.000000     1.0    0.0    0.0   
...                ...    ...       ...       ...     ...    ...    ...   
BANK CHARGES  0.000000    0.0  0.000000  0.000000     0.0    0.0    0.0   
C2            0.000000    0.0  0.000000  0.000000     0.0    0.0    0.0   
DOT           0.000000    0.0  0.000000  0.000000     0.0    0.0    0.0   
M             0.000000    0.0  0.000000  0.000000     0.0    0.0    0.0   
POST          0.065094    0.0  0.000000  0.000000     0.0    0.0    0.0   

StockCode       10135   

### Making Recommendations

In [36]:
def recommend(stock_Code, top_n=10, sim_matrix=None, products=None):
    """Return the items most similar to ``stock_Code`` with their descriptions.

    Parameters
    ----------
    stock_Code : str
        Stock code to look up; must be a row label of the similarity matrix.
    top_n : int, default 10
        Number of highest-similarity stock codes to consider.
    sim_matrix : pandas.DataFrame, optional
        Square item-item similarity matrix labelled by stock code on both
        axes. Defaults to the notebook-level ``item_item_sim_matrix``.
    products : pandas.DataFrame, optional
        Frame with 'StockCode' and 'Description' columns used to look up the
        descriptions. Defaults to the notebook-level ``df``.

    Returns
    -------
    list of dict
        One ``{'StockCode': ..., 'Description': ...}`` entry per similar item,
        ordered by decreasing similarity. Codes with no row in ``products``
        are skipped. The queried item is not filtered out of the ranking.

    Raises
    ------
    KeyError
        If ``stock_Code`` is not a row label of the similarity matrix.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if sim_matrix is None:
        sim_matrix = item_item_sim_matrix
    if products is None:
        products = df

    ranked = sim_matrix.loc[stock_Code].sort_values(ascending=False)
    similar_codes = list(ranked.iloc[:top_n].index)

    data = []
    for code in similar_codes:
        descriptions = products.loc[products['StockCode'] == code, 'Description']
        if descriptions.empty:
            continue
        # A stock code can appear on several rows, some with a missing
        # description: prefer the first non-missing one and only fall back to
        # the first row when every description is missing.
        named = descriptions.dropna()
        description = named.iloc[0] if not named.empty else descriptions.iloc[0]
        data.append({'StockCode': code, 'Description': description})

    return data


In [41]:
# Ask for the items most similar to stock code '22756' and show the top-ranked one.
tempData = recommend('22756')

first_product = tempData[0]
stock_code, description = first_product['StockCode'], first_product['Description']
print(stock_code, description, sep='\n')

10123C
HEARTS WRAPPING TAPE 


In [23]:
# Stock codes ranked most similar to '10123C'. Despite the variable's name,
# only the first four codes of the ranking are kept.
similarity_to_item = item_item_sim_matrix.loc['10123C']
top_10_similar_items = list(similarity_to_item.sort_values(ascending=False).index[:4])

print(top_10_similar_items)
print()

# Description table for those codes, re-ordered to follow the similarity ranking.
is_similar = df['StockCode'].isin(top_10_similar_items)
similar_descriptions = df.loc[is_similar, ['StockCode', 'Description']].drop_duplicates()
print(similar_descriptions.set_index('StockCode').loc[top_10_similar_items])

['10123C', '90094', '22756', '22757']

                                  Description
StockCode                                    
10123C                  HEARTS WRAPPING TAPE 
90094      NECKLACE+BRACELET SET FRUIT SALAD 
22756         LARGE YELLOW BABUSHKA NOTEBOOK 
22757            LARGE RED BABUSHKA NOTEBOOK 


In [24]:
# First (highest-ranked) entry of the similar-items list.
top_10_similar_items[0]

'10123C'

In [25]:
# Similarity of item '10123C' to every stock code (one row of the item-item matrix).
temp = item_item_sim_matrix.loc['10123C']
print(temp)

StockCode
10002           0.500000
10080           0.000000
10120           0.707107
10123C          1.000000
10124G          0.000000
                  ...   
BANK CHARGES    0.000000
C2              0.000000
DOT             0.000000
M               0.000000
POST            0.000000
Name: 10123C, Length: 2517, dtype: float64


In [26]:
# Description column of the cleaned transactions: one entry per transaction row,
# so product names repeat (this is not a de-duplicated product list).
productNames = df['Description']
print(productNames)

0                        WHITE METAL LANTERN
1             CREAM CUPID HEARTS COAT HANGER
2        KNITTED UNION FLAG HOT WATER BOTTLE
3             RED WOOLLY HOTTIE WHITE HEART.
4               SET 7 BABUSHKA NESTING BOXES
                        ...                 
27684         SKULLS PARTY BAG + STICKER SET
27685       DINOSAUR PARTY BAG + STICKER SET
27686                PARTY INVITES DINOSAURS
27687      SET OF 3 WOODEN HEART DECORATIONS
27688      HEART WOODEN CHRISTMAS DECORATION
Name: Description, Length: 21545, dtype: object


In [27]:
import os
import pickle

# Generate the artifacts so this information can be used on the web.
os.makedirs('artifacts', exist_ok=True)  # the output folder may not exist on a fresh run
# The context manager flushes and closes the file once the dump has finished.
with open('artifacts/model.pkl', 'wb') as model_file:
    pickle.dump(item_item_sim_matrix, model_file)


In [31]:
# One row per distinct (StockCode, Description) pair found in the transactions.
unique_stock_products = df.loc[:, ['StockCode', 'Description']].drop_duplicates()
unique_stock_products

Unnamed: 0,StockCode,Description
0,71053,WHITE METAL LANTERN
1,84406B,CREAM CUPID HEARTS COAT HANGER
2,84029G,KNITTED UNION FLAG HOT WATER BOTTLE
3,84029E,RED WOOLLY HOTTIE WHITE HEART.
4,22752,SET 7 BABUSHKA NESTING BOXES
...,...,...
26621,23564,EGG CUP MILKMAID INGRID
27013,22804,PINK HANGING HEART T-LIGHT HOLDER
27399,22538,MINI JIGSAW GO TO THE FAIR
27521,23070,EDWARDIAN HEART PHOTO FRAME


In [31]:
# Save the de-duplicated StockCode/Description table; the context manager
# flushes and closes the file once the dump has finished.
with open('artifacts/productNames.pkl', 'wb') as product_names_file:
    pickle.dump(unique_stock_products, product_names_file)