Unsupervised Learning and Recommendation Systems

In [1]:
# Unsupervised Learning and Recommendation Systems

# Install required packages (uncomment if needed)
# !pip install pandas numpy scikit-learn matplotlib seaborn umap-learn

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import zipfile, requests, io


  from .autonotebook import tqdm as notebook_tqdm


 Mall Customer Segmentation Dataset

In [2]:
# Mall customers: download the table, keep the three numeric columns used for
# segmentation, and standardise them.
mall_url = "https://raw.githubusercontent.com/tanishq21/Mall-Customers/main/Mall_Customers.csv"
mall_df = pd.read_csv(mall_url)

# Select the segmentation features by label.
mall_features = mall_df.loc[:, ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# Fit the scaler on the selected features, then apply it to the same rows.
# `scaler` is kept as a fitted object alongside the scaled array.
scaler = StandardScaler().fit(mall_features)
mall_scaled = scaler.transform(mall_features)


 Dimensionality Reduction


In [3]:
# Project the standardised mall features down to two components with three
# different techniques. Each fitted model is kept next to its embedding.

# PCA
pca = PCA(n_components=2)
mall_pca = pca.fit_transform(mall_scaled)

# t-SNE (seeded)
tsne = TSNE(n_components=2, random_state=42)
mall_tsne = tsne.fit_transform(mall_scaled)

# UMAP (seeded)
# With a fixed random_state UMAP overrides its parallelism setting to a single
# job and emits a warning about it. Asking for n_jobs=1 explicitly keeps the
# seeded, reproducible run and avoids that warning.
# NOTE(review): requires a umap-learn release whose UMAP accepts `n_jobs` - confirm.
umap_model = umap.UMAP(n_components=2, random_state=42, n_jobs=1)
mall_umap = umap_model.fit_transform(mall_scaled)


  warn(


 Clustering

In [4]:
# Cluster the standardised mall features with three algorithms and store each
# algorithm's labels in a column of mall_df named after it.
clusterers = {
    'KMeans': KMeans(n_clusters=5, random_state=42),        # K-Means
    'Hierarchical': AgglomerativeClustering(n_clusters=5),  # Hierarchical
    'DBSCAN': DBSCAN(eps=0.5, min_samples=5),               # DBSCAN
}

# Columns are added in the order listed above.
for label_column, clusterer in clusterers.items():
    mall_df[label_column] = clusterer.fit_predict(mall_scaled)


MovieLens Dataset

In [5]:
import pandas as pd

# MovieLens: read the ratings and the movie metadata. Both CSV files are
# looked up relative to the current working directory.
ratings, movies = (pd.read_csv(csv_name) for csv_name in ("rating.csv", "movie.csv"))

# Show the first rows of each table (the header text and the frame are printed
# on separate lines).
print("Ratings sample:", ratings.head(), sep="\n")
print("\nMovies sample:", movies.head(), sep="\n")


Ratings sample:
   userId  movieId  rating            timestamp
0       1        2     3.5  2005-04-02 23:53:47
1       1       29     3.5  2005-04-02 23:31:16
2       1       32     3.5  2005-04-02 23:33:39
3       1       47     3.5  2005-04-02 23:32:07
4       1       50     3.5  2005-04-02 23:29:40

Movies sample:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


Collaborative Filtering

In [8]:
from scipy.sparse import csr_matrix

# User-based collaborative filtering on the `ratings` frame loaded in the
# MovieLens cell. rating.csv is deliberately not read again here: no cell in
# between modifies `ratings`, so a second parse of the file would only cost
# time and memory. pandas and cosine_similarity come from the import cell at
# the top of the notebook.

# Keep the 1000 users with the most ratings and the 1000 movies with the most ratings
top_users = ratings['userId'].value_counts().head(1000).index
top_movies = ratings['movieId'].value_counts().head(1000).index

filtered_ratings = ratings[ratings['userId'].isin(top_users) & ratings['movieId'].isin(top_movies)]

# User-item matrix: one row per userId, one column per movieId, rating as the
# value; user/movie pairs without a rating are filled with 0
user_item_matrix = filtered_ratings.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)

# Sparse copy of the matrix values (row/column order matches user_item_matrix)
user_item_sparse = csr_matrix(user_item_matrix.values)

# Pairwise cosine similarity between the user rows
user_similarity = cosine_similarity(user_item_sparse)

print("User-Item Matrix shape:", user_item_matrix.shape)
print("User Similarity Matrix shape:", user_similarity.shape)


User-Item Matrix shape: (1000, 1000)
User Similarity Matrix shape: (1000, 1000)


Content-Based Filtering

In [9]:
# Content-based filtering: turn every movie title into a TF-IDF vector
# (English stop words removed) and compare all titles with each other.
tfidf = TfidfVectorizer(stop_words='english')
movie_features = tfidf.fit_transform(movies.loc[:, 'title'])

# Row/column i of the result corresponds to row i of `movies`.
# NOTE(review): this compares every movie with every other movie, so the
# result grows quadratically with the number of rows in `movies` - confirm it
# fits in memory for the full catalogue.
movie_similarity = cosine_similarity(X=movie_features)


Hybrid Recommendation System


In [10]:
# Simple hybrid (for demo): average a collaborative and a content-based
# movie-movie similarity for the first 10 movies of the user-item matrix.
#
# Both inputs must describe the SAME pair of movies. Averaging
# user_similarity[:10, :10] with movie_similarity[:10, :10] would mix the
# similarity of two users with the similarity of two unrelated movies, because
# row i means "user i" in one matrix and "movie i" in the other.
demo_movie_ids = user_item_matrix.columns[:10]

# Collaborative signal: cosine similarity between the rating columns of those
# movies (columns of user_item_sparse follow user_item_matrix.columns).
collab_movie_similarity = cosine_similarity(user_item_sparse[:, :10].T)

# Content signal: movie_similarity is ordered like the rows of `movies`, so
# translate the movieIds into row positions before slicing it.
movie_positions = pd.Index(movies['movieId']).get_indexer(demo_movie_ids)
if (movie_positions < 0).any():
    # get_indexer marks unknown ids with -1, which would silently select the last row
    raise KeyError("Some movies in the user-item matrix are missing from `movies`")
content_movie_similarity = movie_similarity[np.ix_(movie_positions, movie_positions)]

# Equal-weight average of the two aligned movie-movie similarities
hybrid_similarity = (collab_movie_similarity + content_movie_similarity) / 2
