## Basic Recommendation function

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample data: Product IDs and their descriptions
products_data = {
    'product_id': [1, 2, 3, 4, 5],
    'product_name': ['Smartphone', 'Laptop', 'Headphones', 'Smartwatch', 'Tablet'],
    'category': ['Electronics', 'Electronics', 'Accessories', 'Electronics', 'Electronics'],
    'description': [
        'Latest model with great battery life and high-end features',
        'Powerful laptop with fast processor and good storage',
        'Noise-cancelling headphones with deep bass and comfort',
        'Stylish smartwatch with fitness tracking and heart rate monitor',
        'Portable tablet with a sleek design and great screen resolution'
    ]
}

# Create DataFrame
df_products = pd.DataFrame(products_data)


In [2]:
# Function to recommend products based on user preferences
def recommend_products(user_preferences, num_recommendations=3):
    # Initialize TF-IDF Vectorizer
    tfidf = TfidfVectorizer(stop_words='english')
    
    # Fit the TF-IDF model on the product descriptions
    tfidf_matrix = tfidf.fit_transform(df_products['description'])
    
    # Compute cosine similarity between user preferences and products
    user_tfidf = tfidf.transform([user_preferences])
    cosine_sim = cosine_similarity(user_tfidf, tfidf_matrix)
    
    # Get the indices of the most similar products
    similar_products_idx = cosine_sim[0].argsort()[-num_recommendations:][::-1]
    
    # Get recommended products
    recommended_products = df_products.iloc[similar_products_idx]
    
    return recommended_products[['product_name', 'description']]

# Example: A user preferences string
user_preferences = "I like powerful laptops with fast processors and good storage"

# Get recommendations
recommendations = recommend_products(user_preferences)
print("Recommended Products:")
print(recommendations)


Recommended Products:
  product_name                                        description
1       Laptop  Powerful laptop with fast processor and good s...
4       Tablet  Portable tablet with a sleek design and great ...
3   Smartwatch  Stylish smartwatch with fitness tracking and h...


## Movie Recommendation system

In [3]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
     ---------------------------------------- 0.0/154.4 kB ? eta -:--:--
     ---------------------------------------- 0.0/154.4 kB ? eta -:--:--
     -- ------------------------------------- 10.2/154.4 kB ? eta -:--:--
     -- ------------------------------------- 10.2/154.4 kB ? eta -:--:--
     ------- ----------------------------- 30.7/154.4 kB 262.6 kB/s eta 0:00:01
     -------------- ---------------------- 61.4/154.4 kB 409.6 kB/s eta 0:00:01
     ----------------------- ------------ 102.4/154.4 kB 535.8 kB/s eta 0:00:01
     -----------------------------------  153.6/154.4 kB 654.6 kB/s eta 0:00:01
     ------------------------------------ 154.4/154.4 kB 576.8 kB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requiremen


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD, accuracy
from surprise.model_selection import train_test_split

# Load data
file_path = 'ml-100k/u.data'

# Load the ratings data
df_ratings = pd.read_csv(file_path, sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'], header=None)

# Load movie metadata
movie_file_path = 'ml-100k/u.item'
df_movies = pd.read_csv(movie_file_path, sep='|', names=['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'], encoding='latin-1')

# Merge dataframes to have movie titles along with ratings
df_ratings_with_titles = pd.merge(df_ratings, df_movies[['movie_id', 'title']], on='movie_id')

# Show the first few rows of the ratings data
print(df_ratings_with_titles.head())


   user_id  movie_id  rating  timestamp  title
0      308         1       4  887736532      0
1      308         1       4  887736532      0
2      308         1       4  887736532      0
3      308         1       4  887736532      0
4      308         1       4  887736532      0


we will use the SVD (Singular Value Decomposition) algorithm for collaborative filtering.

In [42]:
from surprise import SVD
from surprise import Reader

# Define the data format for the Surprise library
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[['user_id', 'movie_id', 'rating']], reader)

# Split the data into training and test sets (80% train, 20% test)
trainset, testset = train_test_split(data, test_size=0.2)

# Use the SVD algorithm
svd = SVD()

# Train the model
svd.fit(trainset)

# Make predictions
predictions = svd.test(testset)

# Evaluate the performance of the model
accuracy.rmse(predictions)


RMSE: 0.9284


0.9284478560687779

In [8]:
print(f"User 2's ratings:")
print(df_ratings[df_ratings['user_id'] == 2])


User 2's ratings:
       user_id  movie_id  rating  timestamp
700          2       292       4  888550774
924          2       251       5  888552084
1052         2        50       5  888552084
3425         2       314       1  888980085
5063         2       297       4  888550871
...        ...       ...     ...        ...
77906        2       288       3  888550252
85606        2       286       4  888549960
88190        2       275       5  888550939
95677        2       302       5  888552084
97619        2       296       3  888550871

[62 rows x 4 columns]


In [26]:
# Example: Define the user_id you want to get recommendations for
user_id = 4  # Replace with the appropriate user ID

# Now, use the user_id to filter the ratings
rated_movies = df_ratings[df_ratings['user_id'] == user_id]['movie_id'].tolist()


In [43]:
def get_movie_recommendations(user_id):
    # Get all movie IDs
    all_movie_ids = df_movies_clean['movie_id'].tolist()

    # Get a list of movie IDs that the user has already rated
    rated_movies = df_ratings[df_ratings['user_id'] == user_id]['movie_id'].tolist()
    
    # Debug: Check user-rated movies
    print(f"User {user_id} has rated {len(rated_movies)} movies.")
    print(f"Rated movie IDs: {rated_movies}")

    # Filter out movies that the user has already rated
    movies_to_predict = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movies]
    
    # Debug: Check valid movies for prediction
    print(f"Movies to predict for user {user_id}: {movies_to_predict[:10]}")  # Show first 10 for debugging

    if not movies_to_predict:
        print("No valid movies to predict.")
        return []

    # Get the predicted ratings for the valid movies
    predictions = [svd.predict(user_id, movie_id) for movie_id in movies_to_predict]
    
    # Sort predictions by estimated rating (highest first)
    sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)

    # Get top N movie recommendations
    top_recommendations = sorted_predictions[:5]

    recommended_movies = []
    for prediction in top_recommendations:
        movie_id = prediction.iid
        movie_title = df_movies_clean[df_movies_clean['movie_id'] == movie_id]['title'].values[0]
        recommended_movies.append(movie_title)

    # Return recommended movies
    return recommended_movies


In [44]:
# Test with user ID 4 (or any other valid user ID)
user_id = 4
recommended_movies = get_movie_recommendations(user_id)

# Output the recommendations
print("Top 5 Movie Recommendations for User 4:")
for movie in recommended_movies:
    print(movie)


User 4 has rated 24 movies.
Rated movie IDs: [264, 303, 361, 357, 260, 356, 294, 288, 50, 354, 271, 300, 328, 258, 210, 329, 11, 327, 324, 359, 362, 358, 360, 301]
Movies to predict for user 4: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Top 5 Movie Recommendations for User 4:
0
0
0
0
0


### Using a different filtering method

In [32]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load movie metadata
movie_file_path = 'ml-100k/u.item'
df_movies = pd.read_csv(movie_file_path, sep='|', names=['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'horror', 'musical', 'mystery', 'romance', 'sci_fi', 'thriller', 'war', 'western'], encoding='latin-1')

# Select relevant columns (genres here, but you can add other features)
df_movies['genres'] = df_movies.iloc[:, 6:].apply(lambda row: ' '.join(row.index[row == 1]), axis=1)


In [33]:
# Vectorize genres using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df_movies['genres'])

In [34]:
# Compute cosine similarity between all movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [36]:
# Check if the movie_id exists in the dataframe
movie_id_to_test = 1
if movie_id_to_test in df_movies['movie_id'].values:
    recommended_movies = get_content_based_recommendations(movie_id=movie_id_to_test)
    print("Top 5 Content-Based Recommendations:")
    for movie in recommended_movies:
        print(movie)
else:
    print(f"Movie ID {movie_id_to_test} not found in the dataframe.")


Movie ID 1 not found in the dataframe.


In [37]:
# Check the range of movie IDs in df_movies
print("Minimum movie ID:", df_movies['movie_id'].min())
print("Maximum movie ID:", df_movies['movie_id'].max())


Minimum movie ID: 'Til There Was You (1997)
Maximum movie ID: Á köldum klaka (Cold Fever) (1994)


In [38]:
# Load the movie data with a proper header (if the first row contains actual data and not column names)
df_movies = pd.read_csv(movie_file_path, sep='|', header=None, names=['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'], encoding='latin-1')

# Check the first few rows
print(df_movies.head())


                                                                                                                    movie_id  \
1 Toy Story (1995)  01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%20... 0 0 0 1 1 1 0 0 0 0 0 0 0 0         0   
2 GoldenEye (1995)  01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(1... 0 1 1 0 0 0 0 0 0 0 0 0 0 0         0   
3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%2... 0 0 0 0 0 0 0 0 0 0 0 0 0 0         0   
4 Get Shorty (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Get%20Shorty%2... 0 1 0 0 0 1 0 0 1 0 0 0 0 0         0   
5 Copycat (1995)    01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Copycat%20(1995)  0 0 0 0 0 0 1 0 1 0 0 0 0 0         0   

                                                                                                                    title  \
1 Toy Story (1995)  01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%20... 0 0 0 1 1 1 0 0 0 0

In [39]:
import pandas as pd

# Load the dataset (assuming '|' is the delimiter and encoding is 'latin-1')
movie_file_path = 'ml-100k/u.item'  # Update this path as needed
df_movies = pd.read_csv(movie_file_path, sep='|', header=None, encoding='latin-1')

# Inspect the first few rows to see the data structure
print(df_movies.head())

# Define column names based on the expected format
df_movies.columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + [f'genre_{i}' for i in range(19)]

# Check the first few rows again with proper column names
print(df_movies[['movie_id', 'title']].head())

# Clean the 'movie_id' column to ensure it's numeric (and handle extra spaces or characters)
df_movies['movie_id'] = df_movies['movie_id'].astype(str).str.strip().astype(int)

# Ensure movie_id column is of type int
print(df_movies.dtypes)

# Check the minimum and maximum movie_id
print("Minimum movie ID:", df_movies['movie_id'].min())
print("Maximum movie ID:", df_movies['movie_id'].max())

# Check the first few rows of cleaned data
print(df_movies[['movie_id', 'title']].head())


   0                  1            2   3   \
0   1   Toy Story (1995)  01-Jan-1995 NaN   
1   2   GoldenEye (1995)  01-Jan-1995 NaN   
2   3  Four Rooms (1995)  01-Jan-1995 NaN   
3   4  Get Shorty (1995)  01-Jan-1995 NaN   
4   5     Copycat (1995)  01-Jan-1995 NaN   

                                                  4   5   6   7   8   9   ...  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...   0   0   0   1   1  ...   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...   0   1   1   0   0  ...   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...   0   0   0   0   0  ...   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...   0   1   0   0   0  ...   
4  http://us.imdb.com/M/title-exact?Copycat%20(1995)   0   0   0   0   0  ...   

   14  15  16  17  18  19  20  21  22  23  
0   0   0   0   0   0   0   0   0   0   0  
1   0   0   0   0   0   0   0   1   0   0  
2   0   0   0   0   0   0   0   1   0   0  
3   0   0   0   0   0   0   0   0   0   0  
4   0   0   0   0   0 

In [48]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Extract genre columns (from 'genre_0' to 'genre_18')
genre_columns = [f'genre_{i}' for i in range(19)]
movie_features = df_movies[genre_columns]

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(movie_features)

# Function to get movie recommendations
def get_content_based_recommendations(movie_id, top_n=5):
    # Get the index of the movie
    idx = df_movies[df_movies['movie_id'] == movie_id].index[0]

    # Get pairwise similarity scores of all movies with the given movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top N most similar movies
    top_similar_movies = sim_scores[1:top_n+1]  # Exclude the first one as it is the movie itself
    
    # Get movie IDs and titles of the top N similar movies
    recommended_movie_ids = [df_movies.iloc[i[0]]['movie_id'] for i in top_similar_movies]
    recommended_movie_titles = [df_movies.iloc[i[0]]['title'] for i in top_similar_movies]
    
    return recommended_movie_titles

# Test the recommendation system 
recommended_movies = get_content_based_recommendations(movie_id=4)
print("Top 5 Content-Based Recommendations for user 4:")
for movie in recommended_movies:
    print(movie)


Top 5 Content-Based Recommendations for user 4:
Faster Pussycat! Kill! Kill! (1965)
Best Men (1997)
Doom Generation, The (1995)
Eat Drink Man Woman (1994)
Ed Wood (1994)
