In [220]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [221]:
# load the data
data = pd.read_csv('../data/merged_data.csv')

In [222]:
data.head()

Unnamed: 0,user_id,first_name,last_name,email,age,gender,postcode,country,item_id,action,brand,product_name,eye_size,lens_color,price_range,polarized_glasses,prescribed_glasses
0,8356259,James,Salas,James.Salas.DDS@ross.net,42,Male,3720,Australia,43796976,added_to_cart,Versace,1739/F/SK,56,Clear,medium,No,Yes
1,6612383,Rebecca,Patterson,Rebecca.Patterson@wade.net,61,Male,3059,Australia,96224516,viewed,Prada,PRA08V,50,Clear,medium,No,Yes
2,3844909,Meagan,Kirk,Meagan.Kirk@carey-manning.org,69,Female,2582,Australia,46937700,purchased,Ray-Ban,Kat RX7327,56,Clear,medium,No,Yes
3,3094472,Leah,Mitchell,Leah.Mitchell@fisher.com.au,33,Female,4892,Australia,68796712,added_to_cart,Dolce & Gabbana,PR15WS,57,Clear,medium,No,No
4,6234966,Katie,Richards,Katie.Richards@wilson.biz,20,Female,6323,Australia,63577055,added_to_cart,Valley,VE3338,61,Dark Green Polarised,high,No,Yes


In [223]:
data.shape

(97814, 17)

In [224]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97814 entries, 0 to 97813
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             97814 non-null  int64 
 1   first_name          97814 non-null  object
 2   last_name           97814 non-null  object
 3   email               97814 non-null  object
 4   age                 97814 non-null  int64 
 5   gender              97814 non-null  object
 6   postcode            97814 non-null  int64 
 7   country             97814 non-null  object
 8   item_id             97814 non-null  int64 
 9   action              97814 non-null  object
 10  brand               97814 non-null  object
 11  product_name        97814 non-null  object
 12  eye_size            97814 non-null  int64 
 13  lens_color          97814 non-null  object
 14  price_range         97814 non-null  object
 15  polarized_glasses   97814 non-null  object
 16  prescribed_glasses  97

In [225]:
data.eye_size.unique()

array([56, 50, 57, 61, 51, 55, 54, 58, 62, 52, 53, 59, 60, 48])

In [226]:
new_data = data.sample(10000).reset_index(drop=True)

In [227]:
new_data['tags'] = new_data.apply(lambda row: f"{row['age']} {row['gender']} {row['item_id']} {row['brand']} {row['product_name']} {row['eye_size']} {row['lens_color']} {row['price_range']} {row['polarized_glasses']} {row['prescribed_glasses']} {row['action']}", axis=1)


In [228]:
new_df = new_data[['user_id', 'tags']]

In [229]:
new_df.head()

Unnamed: 0,user_id,tags
0,4304335,52 Female 35245781 Persol RB2206D 52 Clear med...
1,1770875,60 Female 19737001 Oakley PR02ZV 53 Dark Brown...
2,1594668,63 Male 76512214 Oakley VE3339U 53 Prizm Black...
3,4758511,70 Male 45682106 Ray-Ban Ritual 61 Black Gradi...
4,2093188,29 Female 94587352 Isabel Marant BE1386 57 Cle...


In [230]:
# preprocess the tags column
def preprocess_text(text):
    try:
        # Convert to lowercase
        text = text.lower()

        # lemmatizer
        lemmatizer = WordNetLemmatizer()

        # Define stopwords
        stop_words = set(stopwords.words('english'))
        
        # Remove special characters
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        
        # Tokenize the text
        tokens = word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        
        # Join tokens back into a single string
        processed_text = ' '.join(tokens)
        
        return processed_text
    
    except Exception as e:
        print(e)
        return None

In [231]:
# Apply the preprocessing function to the 'tags' column
new_df['tags'] = new_df['tags'].apply(preprocess_text)

In [232]:
new_df.shape

(10000, 2)

In [233]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  10000 non-null  int64 
 1   tags     10000 non-null  object
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [234]:
new_df.head()

Unnamed: 0,user_id,tags
0,4304335,52 female 35245781 persol rb2206d 52 clear med...
1,1770875,60 female 19737001 oakley pr02zv 53 dark brown...
2,1594668,63 male 76512214 oakley ve3339u 53 prizm black...
3,4758511,70 male 45682106 rayban ritual 61 black gradie...
4,2093188,29 female 94587352 isabel marant be1386 57 cle...


In [235]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=50)
tfidf_matrix = vectorizer.fit_transform(new_df['tags'])

In [236]:
similarity = cosine_similarity(tfidf_matrix)

In [237]:
similarity[10]

array([0.35848546, 0.25041313, 0.30656055, ..., 0.37047706, 0.21690194,
       0.06178817])

In [254]:
# extract the top 5 similar glasses for provided user_id
def get_top_similar_glasses(user_id: int, similarity, top_n=5):
    # Check if user_id exists in the new_data
    if user_id not in new_data['user_id'].values:
        return f"User ID {user_id} not found in the data."
    
    # Get the index of the user_id
    idx = new_data[new_data['user_id'] == user_id].index[0]
    
    # Get the similarity scores of the user_id with the products they have interacted with
    similar_glasses = list(enumerate(similarity[idx]))

    # Sort the similar glasses based on the similarity scores
    similar_glasses = sorted(similar_glasses, key=lambda x: x[1], reverse=True)

    # Get the top 5 similar glasses
    top_similar_glasses = similar_glasses[1:top_n+1]

    # Get the product details like brand name, product_name, frame_color, lens_color, price_range
    product_details = []
    for i in top_similar_glasses:
        idx = i[0]
        processed_tags = new_data.loc[idx, ['item_id', 'brand', 'product_name', 'lens_color', 'price_range', 'eye_size', 'polarized_glasses', 'prescribed_glasses']]
        product_details.append(processed_tags)
        
    return product_details



In [257]:
recommendations= get_top_similar_glasses(1243730, similarity, top_n=10)

recommendations = pd.DataFrame(recommendations, columns=['item_id', 'brand', 'product_name', 'lens_color', 'price_range', 'eye_size', 'polarized_glasses', 'prescribed_glasses'])

In [258]:
recommendations

Unnamed: 0,item_id,brand,product_name,lens_color,price_range,eye_size,polarized_glasses,prescribed_glasses
8654,49452138,Ray-Ban,Mr Burbank RX5383,Green Polarised Glass,high,50,No,No
5464,72385789,Ray-Ban,New Clubmaster RX7216,Green Glass Polarised,high,59,No,No
9907,35813480,Ray-Ban,RB4435,Green Polarised Glass,medium,59,Yes,No
4330,41905800,Ray-Ban,PRA09S,Green Vintage Gradient Glass,high,59,No,Yes
8418,32301501,Ray-Ban,Feedback,Green Glass,medium,59,Yes,No
633,12367608,Ray-Ban,PS07PV,Green Glass,medium,59,No,No
8193,97914656,Ray-Ban,Phoxer AN4338,Green Polarised,medium,59,No,No
5770,15548150,Ray-Ban,RX7228,Dark Green Polarised,medium,59,No,Yes
5120,59612289,Ray-Ban,Bain Bridge RB3735,Green Glass,medium,54,No,No
4985,48376464,Ray-Ban,RB4435,Green Polarised Glass,medium,59,No,No


In [267]:
data[(data['user_id'] == 1243730)].sort_values(by=['action', 'brand'], ascending=False).head(50)

Unnamed: 0,user_id,first_name,last_name,email,age,gender,postcode,country,item_id,action,brand,product_name,eye_size,lens_color,price_range,polarized_glasses,prescribed_glasses
38594,1243730,Rhonda,James,Rhonda.James@tapia.com,42,Female,6927,Australia,13619038,viewed,Ray-Ban,PR26ZS,56,Green Polarised Glass,high,Yes,No
48012,1243730,Rhonda,James,Rhonda.James@tapia.com,42,Female,6927,Australia,32301501,viewed,Ray-Ban,PRA03V,56,Green Glass,high,Yes,No
32270,1243730,Rhonda,James,Rhonda.James@tapia.com,42,Female,6927,Australia,90032079,viewed,Prada,Shelton,56,555nm Photochromic Polarised Glass,high,No,No
94253,1243730,Rhonda,James,Rhonda.James@tapia.com,42,Female,6927,Australia,19415501,viewed,Prada,BE1386,56,Clear,medium,No,No
54733,1243730,Rhonda,James,Rhonda.James@tapia.com,42,Female,6927,Australia,67528194,viewed,Oakley,PRA59S,55,Brown,medium,No,No
32223,1243730,Rhonda,James,Rhonda.James@tapia.com,42,Female,6927,Australia,62519067,viewed,Michael Kors,Bel Air MK2209U,54,Brown Mirror,high,Yes,No
45972,1243730,Rhonda,James,Rhonda.James@tapia.com,42,Female,6927,Australia,93797832,viewed,Michael Kors,Dubai MK2211U,55,Brown Gradient Polarised,medium,No,No
28649,1243730,Rhonda,James,Rhonda.James@tapia.com,42,Female,6927,Australia,48376464,purchased,Ray-Ban,PR26ZS,56,Clear,high,No,No
56434,1243730,Rhonda,James,Rhonda.James@tapia.com,42,Female,6927,Australia,77471253,purchased,Ray-Ban,PRA03V,56,Green Glass,high,Yes,No
78122,1243730,Rhonda,James,Rhonda.James@tapia.com,42,Female,6927,Australia,57909652,purchased,Ray-Ban,GG1680S,54,Clear,high,Yes,No
