In [1]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from scipy.sparse import hstack
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Download the 'vader_lexicon' resource so the VADER SentimentIntensityAnalyzer
# imported above can be constructed and used in later cells.
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# 1. Preprocess Data
# Normalise the review text: missing reviews become empty strings and every
# entry is coerced to str before it is scored.
review_text = df_interactions['review'].fillna('').astype(str)
df_interactions['review'] = review_text

# Score each review with VADER, keeping only the 'compound' value.
sia = SentimentIntensityAnalyzer()
df_interactions['sentiment'] = review_text.map(
    lambda text: sia.polarity_scores(text)['compound']
)

# Join recipes to their interactions (recipes.id == interactions.recipe_id),
# using merge's default join type.
df_combined = df_recipes.merge(df_interactions, left_on='id', right_on='recipe_id')


In [None]:
# 2. Feature Extraction with TF-IDF
# Text features: TF-IDF over the recipe descriptions (missing descriptions
# are treated as empty strings).
tfidf = TfidfVectorizer()
X_description = tfidf.fit_transform(df_combined['description'].fillna(''))

# Numeric features: standardise the sentiment score and the rating.
X_sentiment_rating = StandardScaler().fit_transform(df_combined[['sentiment', 'rating']])

# Stack text and numeric features column-wise. hstack returns a COO matrix by
# default; requesting CSR here means the matrix is converted once instead of
# on every KMeans fit that consumes it in the following cells.
X_combined = hstack([X_description, X_sentiment_rating], format='csr')

In [None]:
# 3. Optimal Clustering with Elbow Method
# Fit one KMeans model per candidate k and record its inertia (WCSS).
k_values = range(1, 11)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_combined)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k so the bend in the curve can be inspected.
plt.plot(k_values, wcss, marker='o')
elbow_ax = plt.gca()
elbow_ax.set_title("Elbow Method for Optimal K")
elbow_ax.set_xlabel("Number of clusters (k)")
elbow_ax.set_ylabel("WCSS")
plt.show()

In [None]:
# 4. optimal number of clusters
# NOTE(review): 7 is hard-coded; presumably read off the elbow plot - confirm.
optimal_k = 7

# Fit the final model and store each row's cluster label on the frame.
kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(X_combined)
df_combined['cluster'] = kmeans.labels_


In [None]:
# 5. Visualizing Cluster Summary
def _most_common(values):
    """Return the most frequent value in ``values``, or None when there is none.

    ``Series.mode()`` ignores missing values by default, so a group whose
    entries are all missing yields an empty result; indexing that with ``[0]``
    would raise. When several values tie, the first one returned by
    ``mode()`` is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# Group once and reuse the grouping for both aggregations.
by_cluster = df_combined.groupby('cluster')

# Most frequent 'tags' and 'ingredients' value per cluster.
cluster_summary = by_cluster.agg({
    'tags': _most_common,
    'ingredients': _most_common,
}).reset_index()

# Mean sentiment and mean rating per cluster.
sentiment_rating_summary = by_cluster.agg({
    'sentiment': 'mean',
    'rating': 'mean',
}).reset_index()

# One row per cluster: cluster, tags, ingredients, sentiment, rating.
cluster_summary = pd.merge(cluster_summary, sentiment_rating_summary, on='cluster')

In [None]:
# 6. Sentiment vs. Rating by Cluster
# One point per cluster: mean sentiment on x, mean rating on y.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=cluster_summary,
    x='sentiment',
    y='rating',
    hue='cluster',
    palette='viridis',
    # 'cluster' is numeric, so the automatic legend may fall back to a
    # "brief" legend of evenly spaced ticks; 'full' gives every cluster id
    # its own entry regardless of how many clusters there are.
    legend='full',
    ax=ax,
)
ax.set_title("Sentiment vs Rating Across Clusters")
ax.set_xlabel("Average Sentiment")
ax.set_ylabel("Average Rating")
ax.legend(title="Cluster")
plt.show()