In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


## Loading of Data 

## Text-Based: TF-IDF Approach (Representation)

In [None]:
# Load the movie table (the path assumes the CSV was uploaded to a Colab-style /content dir).
df = pd.read_csv("/content/IMDB_top_1000.csv")

# Observing first 5 rows
# (a bare expression is only rendered when it is the last line of its cell)
df.head(5)

# Concatenate the Title, Genre, and Description into a single string.
# A missing value in any of the three columns would turn the whole concatenation into
# NaN (str + NaN -> NaN), which the text vectorizers downstream cannot process, so
# missing entries are replaced by empty strings first.
title_text, genre_text, description_text = (
    df[column].fillna('').astype(str) for column in ('Title', 'Genre', 'Description')
)
df['combined_features'] = title_text + ' ' + genre_text + ' ' + description_text


# Fit TF-IDF on the combined text, with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_scores = tfidf.fit_transform(df['combined_features'])

# Shape of tfidf vector is (number of documents, number of words).
# (The original run of this notebook reported 2363 words; the count depends on the data.)
tfidf_scores.shape

# Looking at some of the IDF scores
idf_values = tfidf.idf_

# Get mapping from term to index
vocab_dict = tfidf.vocabulary_

# Get mapping from index to term
reverse_vocab = {index: term for term, index in vocab_dict.items()}

## Text-Based: BERT (Representation)

In [None]:
# Sentence-level encoder used to embed each movie's combined text.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# encode() is documented for a string or a list of strings. Handing it the pandas
# Series directly depends on the Series index being 0..n-1, so pass a plain list.
embeddings = model.encode(df['combined_features'].tolist(), show_progress_bar=True)


#Shape of BERT embeddings is (number of documents, 768)
embeddings.shape

## Autoencoders / NumericalFeatureEmbedding / OneHotEncoder = PyTorch

## Cosine Similarity 

In [None]:
"""
We precompute the similarity matrix because, during inference, we simply need
to index into the matrix
"""

# TF-IDF similarities: product of the score matrix with its own transpose, densified.
tfidf_cosine_sim = tfidf_scores.dot(tfidf_scores.T).toarray()
# BERT similarities: pairwise cosine similarity between all embedding rows.
bert_cosine_sim = cosine_similarity(embeddings, embeddings)

# Both similarity matrices are (number of documents, number of documents)
bert_cosine_sim.shape, tfidf_cosine_sim.shape


In [None]:
def recommend_movies(data, movie_name, similarity_matrix, top_k=10):
  """Return the `top_k` rows of `data` most similar to the movie titled `movie_name`.

  Args:
    data: DataFrame with a 'Title' column; row i corresponds to row/column i of
      `similarity_matrix`.
    movie_name: Exact title to look up (the first matching row is used).
    similarity_matrix: Square array-like of pairwise similarity scores.
    top_k: Maximum number of recommendations to return.

  Returns:
    A DataFrame slice of `data`, ordered from most to least similar, never
    containing the query movie's own row.

  Raises:
    IndexError: if no row of `data` has the requested title.
  """
  # Work with the row *position*, not the index label: the similarity matrix is
  # positional, while the DataFrame index may be filtered, shuffled or non-numeric.
  matches = np.flatnonzero((data['Title'] == movie_name).to_numpy())
  if matches.size == 0:
    raise IndexError(f"Movie {movie_name!r} not found in the 'Title' column")
  position = int(matches[0])

  # Get the similarity scores of our movie with every other movie in the database
  score_arr = np.asarray(similarity_matrix[position])

  # Sort descending. A stable sort on the negated scores keeps ties in row order,
  # so the ranking is reproducible.
  ranked = np.argsort(-score_arr, kind='stable')

  # Drop the query movie explicitly instead of assuming it sits at rank 0: a
  # duplicate entry with an identical score could otherwise displace it.
  ranked = ranked[ranked != position]

  top_k_movies = ranked[:max(top_k, 0)]

  return data.iloc[top_k_movies]

In [None]:
# Five nearest neighbours of the query title under the TF-IDF similarity matrix
recommend_movies(data=df, movie_name="The Dark Knight Rises", similarity_matrix=tfidf_cosine_sim, top_k=5)

In [None]:
# Five nearest neighbours of the query title under the BERT similarity matrix
recommend_movies(data=df, movie_name="The Dark Knight Rises", similarity_matrix=bert_cosine_sim, top_k=5)

## LightFM

In [None]:
import numpy as np
import pandas as pd
from lightfm.data import Dataset

# Imaginary user data, one (user_id, age, country) record per user
users = pd.DataFrame(
    [
        (1, 25, 'US'),
        (2, 30, 'UK'),
        (3, 22, 'US'),
        (4, 35, 'DE'),
        (5, 28, 'CA'),
    ],
    columns=['user_id', 'age', 'country'],
)

# Imaginary song data, one (song_id, title, artist, genre) record per song
songs = pd.DataFrame(
    [
        (1, 'Song A', 'Artist 1', 'Pop'),
        (2, 'Song B', 'Artist 2', 'Rock'),
        (3, 'Song C', 'Artist 3', 'Jazz'),
        (4, 'Song D', 'Artist 4', 'Pop'),
        (5, 'Song E', 'Artist 5', 'Classical'),
    ],
    columns=['song_id', 'title', 'artist', 'genre'],
)

# Imaginary interaction data, one (user_id, song_id, listen_count) record per listening pair
interactions = pd.DataFrame(
    [
        (1, 1, 5),
        (2, 2, 2),
        (1, 3, 3),
        (3, 4, 1),
        (4, 2, 4),
        (5, 1, 6),
        (5, 5, 2),
    ],
    columns=['user_id', 'song_id', 'listen_count'],
)


In [None]:
# LightFM features are named tokens with a numeric weight. A dict passed to
# build_*_features is read as {feature name: weight}, so a string value such as 'US'
# cannot be used as a value there. Categorical columns are therefore expanded into
# "column:value" tokens (e.g. "country:US") that are registered up front.
user_feature_names = ['age'] + [f'country:{value}' for value in users['country'].unique()]
item_feature_names = [
    f'{column}:{value}'
    for column in ('title', 'artist', 'genre')
    for value in songs[column].unique()
]

# Create a dataset object and register every id and feature token it will see
dataset = Dataset()
dataset.fit(
    users['user_id'],
    songs['song_id'],
    user_features=user_feature_names,
    item_features=item_feature_names
)

# Build the interaction matrix; listen_count is used as the interaction weight.
# Columns are selected by name so the result does not depend on column order.
(interactions_matrix, weights_matrix) = dataset.build_interactions(
    interactions[['user_id', 'song_id', 'listen_count']].itertuples(index=False, name=None)
)

# Build user and item features.
# Age is the only numeric feature; it is scaled into (0, 1] so that its weight is on
# the same order as the 0/1 categorical tokens.
max_age = users['age'].max()
user_features = dataset.build_user_features(
    (row.user_id, {'age': row.age / max_age, f'country:{row.country}': 1.0})
    for row in users.itertuples(index=False)
)
item_features = dataset.build_item_features(
    (row.song_id, [f'title:{row.title}', f'artist:{row.artist}', f'genre:{row.genre}'])
    for row in songs.itertuples(index=False)
)


In [None]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# Initialize the model. A fixed random_state makes the run repeatable after a kernel
# restart (training is stochastic).
model = LightFM(loss='warp', random_state=42)

# Train the model
model.fit(interactions_matrix, user_features=user_features, item_features=item_features, sample_weight=weights_matrix, epochs=30)

# Evaluate the model on the same interactions it was trained on (no held-out split),
# so this number measures fit, not generalisation.
# NOTE(review): precision_at_k is called with its default k; with only a handful of
# items the metric is capped well below 1 — confirm the k you want to report.
train_precision = precision_at_k(model, interactions_matrix, user_features=user_features, item_features=item_features).mean()

print(f'Train Precision: {train_precision}')


## K-Means Clustering on PCA / t-SNE Reduced Features

In [None]:
import pandas as pd
import numpy as np

# Example user data, one (user_id, age, country) record per user
users = pd.DataFrame(
    [
        (1, 25, 'US'),
        (2, 30, 'UK'),
        (3, 22, 'US'),
        (4, 35, 'DE'),
        (5, 28, 'CA'),
    ],
    columns=['user_id', 'age', 'country'],
)

# Example song data, one (song_id, title, genre, listen_count) record per song
songs = pd.DataFrame(
    [
        (1, 'Song A', 'Pop', 100),
        (2, 'Song B', 'Rock', 200),
        (3, 'Song C', 'Jazz', 150),
        (4, 'Song D', 'Pop', 250),
        (5, 'Song E', 'Classical', 300),
    ],
    columns=['song_id', 'title', 'genre', 'listen_count'],
)


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One-hot encode string columns.
# Users and songs each get their own encoder: refitting one shared instance on the
# song table would discard the categories learned from the user table, leaving nothing
# to encode a new user with later. Categories not seen during fitting (e.g. the title
# of a new song) encode as all zeros instead of raising.
user_ohe = OneHotEncoder(handle_unknown='ignore')
song_ohe = OneHotEncoder(handle_unknown='ignore')
encoded_users = user_ohe.fit_transform(users[['country']]).toarray()
encoded_songs = song_ohe.fit_transform(songs[['title', 'genre']]).toarray()

# Scale numerical columns, again with one fitted scaler per table
user_scaler = StandardScaler()
song_scaler = StandardScaler()
scaled_user_ages = user_scaler.fit_transform(users[['age']])
scaled_song_listen_counts = song_scaler.fit_transform(songs[['listen_count']])

# Backward-compatible aliases: these two names previously ended up fitted on the song table.
ohe = song_ohe
scaler = song_scaler


In [None]:
# Assemble one feature matrix per table: the scaled numeric column first, followed by
# the one-hot columns, joined side by side.
user_features = np.concatenate([scaled_user_ages, encoded_users], axis=1)
song_features = np.concatenate([scaled_song_listen_counts, encoded_songs], axis=1)


In [None]:
from sklearn.decomposition import PCA

# Adjust n_components based on your needs.
# One PCA per table: refitting a single shared instance on the song features would
# overwrite the projection learned for users, so it could not be reapplied to new users.
user_pca = PCA(n_components=2)
song_pca = PCA(n_components=2)
pca_user_features = user_pca.fit_transform(user_features)
pca_song_features = song_pca.fit_transform(song_features)

# Backward-compatible alias: `pca` previously ended up fitted on the song features.
pca = song_pca


In [None]:
from sklearn.manifold import TSNE

def _fit_tsne(features, max_perplexity=30, random_state=42):
    """Fit a 2-D t-SNE on `features` and return (fitted estimator, embedding).

    scikit-learn requires perplexity to be smaller than the number of samples, so it is
    capped at n_samples - 1; a fixed perplexity of 30 fails on the 5-row toy tables.
    The iteration-count argument was renamed from `n_iter` to `max_iter` in newer
    scikit-learn releases, so the library default is used to stay version-independent.
    """
    n_samples = len(features)
    reducer = TSNE(
        n_components=2,
        perplexity=min(max_perplexity, max(1, n_samples - 1)),
        random_state=random_state,  # t-SNE is stochastic; fix the seed for repeatable output
    )
    return reducer, reducer.fit_transform(features)


# `tsne` ends up bound to the estimator fitted on the song features, as before.
tsne, tsne_user_features = _fit_tsne(pca_user_features)
tsne, tsne_song_features = _fit_tsne(pca_song_features)


In [None]:
from sklearn.cluster import KMeans

# Adjust n_clusters based on your needs.
# random_state pins the centroid initialisation so cluster labels are repeatable across
# runs; n_init is set explicitly because its default differs between scikit-learn versions.
kmeans_users = KMeans(n_clusters=3, n_init=10, random_state=42).fit(tsne_user_features)
kmeans_songs = KMeans(n_clusters=3, n_init=10, random_state=42).fit(tsne_song_features)

# Assign clusters back to the dataframes
users['cluster'] = kmeans_users.labels_
songs['cluster'] = kmeans_songs.labels_


In [None]:
from sklearn.metrics import pairwise_distances_argmin_min

# New song features: a 2D array of shape (1, n_reduced_features) produced by the same
# preprocessing and dimensionality reduction as the training songs.
# The literal `[...]` is a list holding Ellipsis and makes this cell raise, so the first
# known song's reduced features stand in as a runnable example — replace with real data.
# NOTE(review): scikit-learn's TSNE offers no transform() for unseen rows, so a truly
# new song cannot be projected into the fitted t-SNE space — confirm the intended flow.
new_song_features = tsne_song_features[:1]

# Find the closest cluster for the new song
closest_cluster, _ = pairwise_distances_argmin_min(new_song_features, kmeans_songs.cluster_centers_)

print(f"The new song belongs to cluster: {closest_cluster[0]}")


In [None]:
def recommend_songs_for_new_user(new_user_features, songs, kmeans_users, kmeans_songs):
    """Return the songs whose cluster label equals the new user's nearest user cluster.

    Args:
        new_user_features: Reduced feature vector of the new user, either 1-D
            (n_features,) or 2-D (1, n_features); only the first row is used.
        songs: DataFrame with a 'cluster' column.
        kmeans_users: Fitted clustering object exposing `cluster_centers_`.
        kmeans_songs: Accepted for interface compatibility; not used.

    Returns:
        The rows of `songs` whose 'cluster' value equals the index of the user
        centroid closest (Euclidean distance) to the new user.

    NOTE(review): user clusters and song clusters come from two independently fitted
    models, so equal label numbers are not guaranteed to denote related groups —
    confirm that matching on the label is intended.
    """
    # Accept a flat vector as well as a single-row matrix.
    point = np.atleast_2d(np.asarray(new_user_features, dtype=float))[0]
    centers = np.asarray(kmeans_users.cluster_centers_, dtype=float)

    # Assign the new user to the nearest centroid (first one wins on ties).
    user_cluster = int(np.argmin(np.linalg.norm(centers - point, axis=1)))

    # Find songs in the same cluster
    recommended_songs = songs[songs['cluster'] == user_cluster]

    return recommended_songs

# Example usage
# The literal `[...]` is a list holding Ellipsis and makes this cell raise, so the first
# known user's reduced features stand in as a runnable example — replace with the
# preprocessed features of the real new user (shape (1, n_reduced_features)).
new_user_features = tsne_user_features[:1]
recommended_songs = recommend_songs_for_new_user(new_user_features, songs, kmeans_users, kmeans_songs)

# Last expression of the cell: rendered as a rich table by the notebook
recommended_songs
