# Content-based recommendation: TF-IDF and LDA

In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import normalize


In [4]:
ratings_path = r"C:\Users\diele\OneDrive\Área de Trabalho\Recomendação Inteligente\Base de dados\XWines_Slim_1K_wines_150K_ratings\XWines_Slim_150K_ratings.csv"
wines_path = r"C:\Users\diele\OneDrive\Área de Trabalho\Recomendação Inteligente\Base de dados\XWines_Slim_1K_wines_150K_ratings\XWines_Slim_1K_wines.csv"

wines = pd.read_csv(wines_path)
ratings = pd.read_csv(ratings_path, low_memory=False)

## Organizing data

- Merge the datasets
- Clean attributes
- Create a new attribute named "Description"

In [25]:
# Merge

wines_ratings = pd.merge(wines, ratings, on='WineID')

# Cleaning the attributes "Grapes" and "Harmonize", as there is more than one word in them

grape = wines_ratings["Grapes"].str.split(",",expand=True).stack().str.replace("[","").str.replace("]","").str.strip().reset_index(level=1,drop=True).rename("Grape")
harm = wines_ratings["Harmonize"].str.split(",",expand=True).stack().str.replace("[","").str.replace("]","").str.strip().reset_index(level=1,drop=True).rename("Harmonize1")


wines_ratings = wines_ratings.join(grape)
wines_ratings = wines_ratings.join(harm)

# Creating attribute "Description"
wines_ratings["Description"] = (wines_ratings["WineName"] + " " +
                                wines_ratings["Type"] + " " +
                                wines_ratings["Elaborate"] + " " +
                                wines_ratings["Grape"] + " " +
                                wines_ratings["Body"] + " " +
                                wines_ratings["Acidity"] + " " +
                                wines_ratings["Country"] + " " +
                                wines_ratings["RegionName"] +"Harmoniza com " +
                                wines_ratings["Harmonize1"] 

descriptions = wines_ratings["Description"]


In [34]:
wines_ratings.head(2).T

Unnamed: 0,0,0.1
WineID,100001,100001
WineName,Espumante Moscatel,Espumante Moscatel
Type,Sparkling,Sparkling
Elaborate,Varietal/100%,Varietal/100%
Grapes,['Muscat/Moscato'],['Muscat/Moscato']
Harmonize,"['Pork', 'Rich Fish', 'Shellfish']","['Pork', 'Rich Fish', 'Shellfish']"
ABV,7.5,7.5
Body,Medium-bodied,Medium-bodied
Acidity,High,High
Code,BR,BR


## TF - IDF

In [33]:
tfidfVectorizer = TfidfVectorizer(stop_words=["english","portuguese"]) # stop words = eliminates unimportant words 
tfidfMatrix = tfidfVectorizer.fit_transform(descriptions)
tfidfMatrix.shape

(1010887, 2397)

## LDA

In [35]:
ldaVectorizer = CountVectorizer(stop_words=["english","portuguese"])
word_count = ldaVectorizer.fit_transform(descriptions)
word_count.shape

(1010887, 2397)

In [36]:
# implementing LDA model

lda = LatentDirichletAllocation(n_components=10, random_state=0)
#lda.fit(word_count)

In [38]:
#Save the model
#pickle.dump(lda, open('lda_model.pkl','wb'))

#Load the model
lda=pickle.load(open('lda_model.pkl','rb')

In [39]:
# Normalize the matrix

tfidf_norm = normalize(tfidfMatrix)
lda_norm = normalize(lda.transform(word_count))

# Dimensions check

print("tf-idf matrix dimensions:",tfidf_norm.shape)
print("lda matrix dimensions:",lda_norm.shape)

tf-idf matrix dimensions: (1010887, 2397)
lda matrix dimensions: (1010887, 10)


## Content Profile

In [None]:
# We need to convert the sparse tfidf_norm matrix to a dense numpy array

tdidf_norm_dense = tfidf_norm.toarray()

# Array concatenation

content_profile = np.concatenate((tdidf_norm_dense,lda_norm),axis=1)

print(content_profile.shape)

## User Profile

In [44]:
user_id = 1756594

#filtering ratings

user_ratings = wines_ratings[wines_ratings["UserID"] == user_id] 

#highly rated wines (by the user)

highly_ratings = user_ratings[user_ratings["Rating"] > 4]

# Extracting the descriptions of these wines
highly_descriptions = highly_ratings["Description"]

#Using tf-idf and lda you can transform descriptions into vectors
tfidf_user = tfidfVectorizer.transform(highly_descriptions)
lda_user = lda.transform(ldaVectorizer.transform(highly_descriptions))

#normalize
tfidf_user_norm = normalize(tfidf_user)
lda_user_norm = normalize(lda_user)

#User Profile

user_profile = np.mean(np.concatenate((tfidf_user_norm.toarray(),lda_user_norm),axis=1),axis=0)



In [None]:
user_profile

array([0.        , 0.        , 0.        , ..., 0.00626199, 0.00626199,
       0.00626164])

## Recommendation

In [None]:
# Cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity([user_profile],content_profile)

# The 10 best wines to recommend
n = 10
index = np.argsort(similarity[0])[::-1][:n]

recommendations = wines_ratings.iloc[index]
recommendations = recommendations.drop_duplicates(subset=["WineID"])

print(recommendations[["WineID","WineName","WineryName","Type","Country"]])