In [24]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity


# Download the NLTK resources used below if not already present.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent
# NLTK releases; 'punkt' is still fetched so older releases keep working.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load dataset
df = pd.read_csv("small dataset.csv")  # Replace with actual file path

# Preprocessing function
def preprocess_text(text):
    """Normalise one review string for vectorization.

    Lower-cases the text, replaces every non-word character with a space,
    tokenizes with NLTK's ``word_tokenize`` and drops English stopwords as
    well as tokens of length 2 or less.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Replace each non-word character with a space
    tokens = word_tokenize(text)  # Tokenize
    # Build the stopword set once per call: the original re-read the stopword
    # list for every token and searched it linearly.
    english_stopwords = set(stopwords.words('english'))
    tokens = [
        word for word in tokens
        if word not in english_stopwords and len(word) > 2
    ]  # Remove stopwords and short words
    return ' '.join(tokens)
print(df['rating'])

# Apply preprocessing.
# Missing reviews are replaced with "" first: astype(str) alone would turn a
# NaN into the literal word "nan", which passes the stopword/length filter
# and would end up in the vocabulary.
df["cleaned_review"] = df["review"].fillna("").astype(str).apply(preprocess_text)



0     5
1     4
2     3
3     4
4     5
5     3
6     5
7     2
8     4
9     5
10    3
11    4
12    5
13    3
14    4
Name: rating, dtype: int64


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bough\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bough\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
# Vectorize the cleaned reviews using TF-IDF (vocabulary capped at 5000 terms).
# The identical second fit that followed this one was redundant and is removed.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["cleaned_review"])

# Extract topics using LDA
# NOTE(review): LDA is usually fitted on raw term counts rather than TF-IDF
# weights; the input is left as-is here so existing results do not change.
num_topics = 5  # Adjust based on dataset
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
item_topics = lda_model.fit_transform(X)  # Topic distribution per review row

df["topic_distribution"] = list(item_topics)  # One topic vector per row of df


# Display topics
def display_topics(model, feature_names, num_words=10):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` the ``num_words`` largest entries
    are looked up in ``feature_names`` and printed on one line, in ascending
    weight order (the strongest term comes last).

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` holds one topic's per-term weights.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    num_words : int, default 10
        How many terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        strongest_columns = weights.argsort()[-num_words:]
        strongest_terms = [feature_names[column] for column in strongest_columns]
        heading = f"Topic {topic_idx}: "
        # Two positional arguments: print inserts one extra space between them.
        print(heading, " ".join(strongest_terms))

# Terms learned by the fitted vectorizer, used to label the LDA topic weights
feature_names = vectorizer.get_feature_names_out()
display_topics(model=lda_model, feature_names=feature_names)

Topic 0:  quality battery life short great sound cancellation noise headphones excellent
Topic 1:  good deep learning technical bit introduction keeps warm winter jacket
Topic 2:  written works design could better informative hard follow chapters well
Topic 3:  recommend material fit songs feel beats repetitive comfortable durable fabric
Topic 4:  songs every moment masterpiece loved predictable cinematography plot overall style


In [42]:
# Item profiles: for every item_id, stack the topic vectors of its rows in df
# and take their element-wise mean.
item_profiles = (
    df.groupby("item_id")["topic_distribution"]
    .apply(lambda topic_vectors: np.vstack(topic_vectors).mean(axis=0))
    .reset_index()
)

# Function to get user profile based on rated item profiles
def get_user_profile(user_id):
    """Build the liked and disliked topic profiles of one user.

    Takes the rows of the global ``df`` whose ``user_id`` matches, left-joins
    them with the global ``item_profiles`` on ``item_id`` and averages the
    item-level topic vectors separately for ratings >= 4 and ratings <= 2.
    Rows with a rating strictly between 2 and 4 feed neither profile.

    Parameters
    ----------
    user_id
        Value compared against ``df["user_id"]``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(liked_profile, disliked_profile)``. A profile is a zero vector of
        length ``num_topics`` when the user has no rows in that group.
    """
    rated = df[df["user_id"] == user_id].merge(item_profiles, on="item_id", how="left")
    # Both frames carry "topic_distribution", so the merge suffixes the
    # item-profile column with "_y".
    item_vectors = rated["topic_distribution_y"]
    rating = rated["rating"]

    def mean_vector(vectors):
        if vectors.empty:
            return np.zeros(num_topics)
        return np.mean(np.vstack(vectors), axis=0)

    liked_profile = mean_vector(item_vectors[rating >= 4])
    disliked_profile = mean_vector(item_vectors[rating <= 2])
    return liked_profile, disliked_profile

# Function to recommend items based on item profiles
def recommend_items(user_id, top_n=5):
    """Rank items by how well their topic profile matches a user's taste.

    Each item in the global ``item_profiles`` is scored as the cosine
    similarity to the user's liked profile minus the cosine similarity to the
    user's disliked profile (both from ``get_user_profile``).

    Parameters
    ----------
    user_id
        Passed to ``get_user_profile``.
    top_n : int, default 5
        Number of highest-scoring items to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``item_id`` and ``similarity``, sorted by descending
        ``similarity`` and keeping the original row index of
        ``item_profiles``.
    """
    liked_profile, disliked_profile = get_user_profile(user_id)

    def preference_score(topic_vector):
        liked_similarity = cosine_similarity([topic_vector], [liked_profile])[0][0]
        disliked_similarity = cosine_similarity([topic_vector], [disliked_profile])[0][0]
        return liked_similarity - disliked_similarity

    # Build the scores on a new frame: the original wrote a "similarity"
    # column into the shared item_profiles, leaving one user's scores behind
    # as hidden global state.
    scored = item_profiles[["item_id"]].assign(
        similarity=item_profiles["topic_distribution"].apply(preference_score)
    )

    return scored.sort_values(by="similarity", ascending=False).head(top_n)

# Example: print the top recommendations (default top_n) for user 1
user_id = 1
print(recommend_items(user_id=user_id))



   item_id  similarity
1      102    0.886484
0      101    0.885546
3      104    0.671988
2      103    0.577523
4      105    0.410543


In [40]:
# Inspect the (liked_profile, disliked_profile) pair computed for user 1
print(get_user_profile(user_id=1))

(array([0.29156684, 0.18039232, 0.40349133, 0.06226194, 0.06228757]), array([0.06516561, 0.06528794, 0.29540629, 0.0652751 , 0.50886507]))
