# Recommendation System Primary File

In [29]:
# Importing necessary libraries

# pandas is used for data manipulation and analysis, particularly for working with tabular data (like dataframes).
import pandas as pd

# numpy is a powerful library for numerical computations, especially for handling arrays and matrices of data.
import numpy as np

# matplotlib.pyplot is a plotting library used for creating static, animated, and interactive visualizations in Python.
import matplotlib.pyplot as plt

# seaborn is a data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.
import seaborn as sns

# sklearn's cosine_similarity function calculates the cosine similarity between vectors, often used in text analysis to measure how similar two documents are.
from sklearn.metrics.pairwise import cosine_similarity

# TfidfVectorizer is used to convert a collection of raw documents into a matrix of TF-IDF features, commonly used in text mining.
from sklearn.feature_extraction.text import TfidfVectorizer

# os is a standard Python library used for interacting with the operating system, such as handling file paths.
import os

# scipy's sparse module provides a way to work with sparse matrices, which are matrices that are mostly empty. coo_matrix is a type of sparse matrix in COOrdinate format.
from scipy.sparse import coo_matrix

# Import spacy module, used for generating tags from the data
import spacy

# Ignore all warnings to keep the output clean and focused
import warnings
warnings.filterwarnings('ignore')

In [30]:
# Load the dataset into a pandas DataFrame. This dataset contains product information.
train_data = pd.read_csv('../data/walmart_data.tsv', sep='\t')

# Filter the dataset to only include specific columns that are relevant for the recommendation system.
train_data = train_data[['Uniq Id','Product Id', 'Product Rating', 'Product Reviews Count', 'Product Category', 'Product Brand', 'Product Name', 'Product Image Url', 'Product Description', 'Product Tags']]

# Fill missing values in the dataset with default values (0 for numerical fields and empty strings for text fields) to avoid issues during analysis.
train_data['Product Rating'].fillna(0, inplace=True)
train_data['Product Reviews Count'].fillna(0, inplace=True)
train_data['Product Category'].fillna('', inplace=True)
train_data['Product Brand'].fillna('', inplace=True)
train_data['Product Description'].fillna('', inplace=True)

# Define a mapping of original column names to new, simplified names for easier reference.
column_name_mapping = {
    'Uniq Id': 'ID',
    'Product Id': 'ProdID',
    'Product Rating': 'Rating',
    'Product Reviews Count': 'ReviewCount',
    'Product Category': 'Category',
    'Product Brand': 'Brand',
    'Product Name': 'Name',
    'Product Image Url': 'ImageURL',
    'Product Description': 'Description',
    'Product Tags': 'Tags',
    'Product Contents': 'Contents'  # Note: 'Product Contents' is included in the mapping, but not present in the data filtering step above.
}

# Apply the column name mapping to the DataFrame to rename the columns.
train_data.rename(columns=column_name_mapping, inplace=True)

# Convert the 'ID' and 'ProdID' columns to numerical values for consistency and to facilitate operations like similarity calculations.
train_data['ID'] = pd.Categorical(train_data['ID']).codes.astype(float)
train_data['ProdID'] = pd.Categorical(train_data['ProdID']).codes.astype(float)

# Remove any duplicate rows from the DataFrame to ensure clean and accurate analysis.
train_data.drop_duplicates(inplace=True)

In [31]:
# Save the data to a csv file 
train_data.to_csv('../data/clean_data.csv',index=False)

### Rating-Based Recommendation System

In [24]:
# Calculate the average rating for each product, grouped by Name, ReviewCount, Brand, and ImageURL.
average_ratings = train_data.groupby(['Name','ReviewCount','Brand','ImageURL'])['Rating'].mean().reset_index()

# Sort the products based on their average rating in descending order to identify the top-rated items.
top_rated_items = average_ratings.sort_values(by='Rating', ascending=False)

# Extract the top 10 highest-rated products to generate a list of recommended items.
rating_base_recommendation = top_rated_items.head(10)

# Convert the 'Rating' and 'ReviewCount' columns to integer types to ensure consistency in data types.
rating_base_recommendation['Rating'] = rating_base_recommendation['Rating'].astype(int)
rating_base_recommendation['ReviewCount'] = rating_base_recommendation['ReviewCount'].astype(int)

# Display the top 10 rating-based recommendations.
rating_base_recommendation

Unnamed: 0,Name,ReviewCount,Brand,ImageURL,Rating
1686,"Electric Shaver, Triple Shaving Time Electric ...",4,Moosoo,https://i5.walmartimages.com/asr/e7dcd553-90df...,5
526,"Alaffia Body Lotion, Vanilla, 32 Oz",2,Alaffia,https://i5.walmartimages.com/asr/2988c323-cb6f...,5
2053,"Gold Bond Ultimate Ultimate Healing Lotion, Al...",2,Gold Bond,https://i5.walmartimages.com/asr/34b610e7-05db...,5
4716,"Versace Man Eau Fraiche Eau De Toilette Spray,...",24,Versace,https://i5.walmartimages.com/asr/edaaeed5-9da0...,5
2058,Goldwell StyleSign 1 Flat Marvel Straightening...,2,Goldwell,https://i5.walmartimages.com/asr/3bf90289-6980...,5
3842,"Red Devil 0322 Steel Wool # 00 Very Fine, 8 Pa...",1,Red Devil,https://i5.walmartimages.com/asr/60bfe5ba-774c...,5
510,"Air Wick Plug in Starter Kit, Warmer + 1 Refil...",1,Air Wick,https://i5.walmartimages.com/asr/0fac65b2-c6aa...,5
3841,Recovery Complex Anti-Frizz Shine Serum by Bai...,4,Bain de Terre,https://i5.walmartimages.com/asr/fcdb4d2e-3727...,5
2687,Long Aid Extra Dry Formula Curl Activator Gel ...,12,Long Aid,https://i5.walmartimages.com/asr/f7f29199-bfa5...,5
2062,"Good Sense 60-Day Air Care System, Citrus, 2 o...",1,Diversey,https://i5.walmartimages.com/asr/025a7068-7bb1...,5


In [28]:
# Save the results of Rating based recommendations to a csv file 
rating_base_recommendation.to_csv('../data/rbr.csv',index=False)

### Content-Based Recommendation System

In [25]:

# Define a function to generate content-based recommendations for a given item.
def content_based_recommendations(train_data, item_name, top_n=10):
    # Check if the specified item exists in the dataset. If not, return an empty DataFrame.
    if item_name not in train_data['Name'].values:
        print(f"Item '{item_name}' not found in the training data.")
        return pd.DataFrame()

    # Create a TF-IDF vectorizer to convert the 'Tags' text data into a matrix of TF-IDF features.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')

    # Apply the TF-IDF vectorization to the 'Tags' column to create a matrix of TF-IDF features.
    tfidf_matrix_content = tfidf_vectorizer.fit_transform(train_data['Tags'])

    # Calculate the cosine similarity between items based on their TF-IDF vectors (i.e., how similar their tags are).
    cosine_similarities_content = cosine_similarity(tfidf_matrix_content, tfidf_matrix_content)

    # Find the index of the specified item in the dataset.
    item_index = train_data[train_data['Name'] == item_name].index[0]

    # Retrieve the cosine similarity scores for the specified item relative to all other items.
    similar_items = list(enumerate(cosine_similarities_content[item_index]))

    # Sort the items by similarity score in descending order (most similar items first).
    similar_items = sorted(similar_items, key=lambda x: x[1], reverse=True)

    # Select the top N most similar items, excluding the item itself.
    top_similar_items = similar_items[1:top_n+1]

    # Extract the indices of the top similar items.
    recommended_item_indices = [x[0] for x in top_similar_items]

    # Retrieve the details of the top similar items (Name, ReviewCount, Brand, ImageURL, Rating) from the dataset.
    recommended_items_details = train_data.iloc[recommended_item_indices][['Name', 'ReviewCount', 'Brand', 'ImageURL', 'Rating']]

    # Convert columns to appropriate types
    recommended_items_details['Rating'] = recommended_items_details['Rating'].astype(int)
    recommended_items_details['ReviewCount'] = recommended_items_details['ReviewCount'].astype(int)

    # Return the details of the recommended items.
    return recommended_items_details

# Example: Generate content-based recommendations for a specific item.
item_name = 'OPI Infinite Shine, Nail Lacquer Nail Polish, Bubble Bath'
content_based_rec = content_based_recommendations(train_data, item_name, top_n=8)

# Display the content-based recommendations.
content_based_rec

Unnamed: 0,Name,ReviewCount,Brand,ImageURL,Rating
4854,"OPI Infinite Shine Nail Polish, Dulce De Leche...",1.0,OPI,https://i5.walmartimages.com/asr/466527fe-9e5e...,5.0
3052,"OPI Nail Lacquer Nail Polish, Hawaiian Orchid",1.0,OPI,https://i5.walmartimages.com/asr/cef5a3ca-dfed...,4.0
4010,"OPI Nail Polish, Are We There Yet?, 0.5 Fl Oz",2.0,OPI,https://i5.walmartimages.com/asr/bb0c5e20-0ead...,3.0
4752,"OPI Infinite Shine Nail Polish, Let Love Spark...",3.0,OPI,https://i5.walmartimages.com/asr/7eef48ac-3a28...,4.0
4356,"OPI Nail Lacquer Nail Polish, Go with the Lava...",1.0,OPI,https://i5.walmartimages.com/asr/6cd23e3d-f3be...,5.0
3967,"OPI Infinite Shine Nail Polish, Mini Scotland ...",1.0,OPI,https://i5.walmartimages.com/asr/b673df9d-cc73...,0.0
2952,"OPI Nail Lacquer Nail Polish, Miami Beet",0.0,OPI,https://i5.walmartimages.com/asr/083f9170-da65...,0.0
4966,OPI Nail GelColor + Infinite Shine Polish -TAU...,0.0,OPI,https://i5.walmartimages.com/asr/a364a6d7-0cd1...,0.0


### Collaborative Filtering Recommendation System

In [26]:

# Define a function to generate collaborative filtering recommendations for a given user.
def collaborative_filtering_recommendations(train_data, target_user_id, top_n=10):
    # Create a user-item matrix where rows represent users and columns represent products, with ratings as values.
    user_item_matrix = train_data.pivot_table(index='ID', columns='ProdID', values='Rating', aggfunc='mean').fillna(0)

    # Normalize the user-item matrix by subtracting the mean rating for each user (optional but improves accuracy).
    user_item_matrix = user_item_matrix.subtract(user_item_matrix.mean(axis=1), axis=0)

    # Calculate the user similarity matrix using cosine similarity, comparing users based on their rating patterns.
    user_similarity = cosine_similarity(user_item_matrix)

    # Find the index of the target user in the user-item matrix.
    target_user_index = user_item_matrix.index.get_loc(target_user_id)

    # Retrieve the similarity scores for the target user relative to all other users.
    user_similarities = user_similarity[target_user_index]

    # Sort the users by similarity in descending order, excluding the target user themselves.
    similar_users_indices = user_similarities.argsort()[::-1][1:]

    # Initialize an empty set to store recommended items.
    recommended_items = set()

    # Iterate over the most similar users to identify items they rated that the target user has not yet rated.
    for user_index in similar_users_indices:
        if len(recommended_items) >= top_n:
            break

        # Get items rated by the similar user but not by the target user.
        rated_by_similar_user = user_item_matrix.iloc[user_index]
        not_rated_by_target_user = (rated_by_similar_user > 0) & (user_item_matrix.iloc[target_user_index] == 0)

        # Add the item IDs of recommended items to the set.
        recommended_items.update(user_item_matrix.columns[not_rated_by_target_user])

    # Retrieve the details of the recommended items (Name, ReviewCount, Brand, ImageURL, Rating) from the dataset.
    recommended_items_details = train_data[train_data['ProdID'].isin(recommended_items)][['Name', 'ReviewCount', 'Brand', 'ImageURL', 'Rating']]

    # Convert columns to appropriate types
    recommended_items_details['Rating'] = recommended_items_details['Rating'].astype(int)
    recommended_items_details['ReviewCount'] = recommended_items_details['ReviewCount'].astype(int)
    
    # Return the details of the top N recommended items.
    return recommended_items_details.head(top_n)

# Example: Generate collaborative filtering recommendations for a specific user.
target_user_id = 4
top_n = 5
collaborative_filtering_rec = collaborative_filtering_recommendations(train_data, target_user_id)

# Display the top N recommendations for the target user.
print(f"Top {top_n} recommendations for User {target_user_id}:")
collaborative_filtering_rec

Top 5 recommendations for User 4:


Unnamed: 0,Name,ReviewCount,Brand,ImageURL,Rating
68,Tinactin Athletes Foot Antifungal Treatment Cr...,5.0,Tinactin,https://i5.walmartimages.com/asr/61d435bc-333b...,4.6
332,"Hard Candy Single & Loving it Eye Shadow, Brn ...",6.0,Hard Candy,https://i5.walmartimages.com/asr/08146799-9538...,3.8
1098,Shikai - Borage Therapy Dry Skin Lotion Origin...,4.0,ShiKai,https://i5.walmartimages.com/asr/e3f4fc17-a5a7...,5.0
1338,"Herbal Essences Shampoo & Conditioner, Hello H...",2732.0,Herbal Essences,https://i5.walmartimages.com/asr/7e55725d-fce5...,4.5
1667,"e.l.f. Day to Night Lipstick Duo, Need It Nudes",271.0,e.l.f. Cosmetics,https://i5.walmartimages.com/asr/9c17d9b0-35da...,4.2
2140,V76 by Vaughn Brightening Conditioner For Silv...,10.0,V76 by Vaughn,https://i5.walmartimages.com/asr/611ac2d0-39ef...,4.9
2275,Bigen Permanent Powder Hair Color 96 Deep Burg...,1.0,Bigen,https://i5.walmartimages.com/asr/c4ee6765-9e03...,2.0
2292,"Softsoap Moisturizing Body Wash, Luminous Oils...",448.0,Softsoap,https://i5.walmartimages.com/asr/a1d1536f-15d3...,4.7
2553,LOreal Paris True Match Super-Blendable Concea...,113.0,L'Oreal Paris,https://i5.walmartimages.com/asr/6568f252-55c2...,4.4
4202,Redken-Diamond Oil Glow Dry Detangling Conditi...,334.0,Redken,https://i5.walmartimages.com/asr/f200009c-1f8f...,4.5


### Hybrid Recommendation System

In [27]:
# Define a function to generate hybrid recommendations by combining content-based and collaborative filtering methods.
def hybrid_recommendations(train_data, target_user_id, item_name, top_n=10):
    # Generate content-based recommendations for the specified item.
    content_based_rec = content_based_recommendations(train_data, item_name, top_n)

    # Generate collaborative filtering recommendations for the specified user.
    collaborative_filtering_rec = collaborative_filtering_recommendations(train_data, target_user_id, top_n)
    
    # Combine the two sets of recommendations and remove duplicates.
    hybrid_rec = pd.concat([content_based_rec, collaborative_filtering_rec]).drop_duplicates()
    
    # Return the top N hybrid recommendations.
    return hybrid_rec.head(10)

# Example: Generate hybrid recommendations for a specific user and item.
target_user_id = 4  # Specify the user ID
item_name = "OPI Nail Lacquer Polish .5oz/15mL - This Gown Needs A Crown NL U11"  # Specify the item name
hybrid_rec = hybrid_recommendations(train_data, target_user_id, item_name, top_n=10)

# Display the top 10 hybrid recommendations for the user and item.
print(f"Top 10 Hybrid Recommendations for User {target_user_id} and Item '{item_name}':")
hybrid_rec

Top 10 Hybrid Recommendations for User 4 and Item 'OPI Nail Lacquer Polish .5oz/15mL - This Gown Needs A Crown NL U11':


Unnamed: 0,Name,ReviewCount,Brand,ImageURL,Rating
1215,OPI Nail Lacquer Polish .5oz/15mL - Put A Coat...,0.0,OPI,https://i5.walmartimages.com/asr/b51a716d-565c...,0.0
545,OPI Nail Lacquer Polish .5oz/15mL - Sweet Hear...,0.0,OPI,https://i5.walmartimages.com/asr/136eb3a3-1668...,0.0
3852,OPI Nail Lacquer Polish .5oz/15mL - A-taupe th...,1.0,OPI,https://i5.walmartimages.com/asr/4b895a60-06c2...,0.0
1594,Nail Lacquer - # NL E74 Youre Such a BudaPest ...,5.0,OPI,https://i5.walmartimages.com/asr/bb6e42b8-35ee...,5.0
4645,OPI Nail Lacquer Polish .5oz/15mL - Champagne ...,0.0,OPI,https://i5.walmartimages.com/asr/5f4ec827-6edc...,0.0
4109,OPI Nail Lacquer Polish .5oz/15mL - Girls Love...,0.0,OPI,https://i5.walmartimages.com/asr/6d70f6db-0442...,0.0
4310,OPI Nail Lacquer Polish .5oz/15mL - So Many Cl...,0.0,OPI,https://i5.walmartimages.com/asr/b24090e9-9cf9...,0.0
3719,OPI Nail Lacquer Polish .5oz/15mL - Ski Slope ...,0.0,OPI,https://i5.walmartimages.com/asr/8466cc9b-40da...,0.0
4010,"OPI Nail Polish, Are We There Yet?, 0.5 Fl Oz",2.0,OPI,https://i5.walmartimages.com/asr/bb0c5e20-0ead...,3.0
3052,"OPI Nail Lacquer Nail Polish, Hawaiian Orchid",1.0,OPI,https://i5.walmartimages.com/asr/cef5a3ca-dfed...,4.0
