In [1]:
from pymongo import MongoClient

import os

# Connect to MongoDB. The URI is taken from the MONGODB_URI environment variable so
# that connection details (and any credentials) do not have to be written into the
# notebook; when the variable is unset, the local default instance is used.
MONGODB_URI = os.environ.get("MONGODB_URI", "mongodb://localhost:27017/")
client = MongoClient(MONGODB_URI)
db = client['movie_recommendation_system']  # Database name

In [25]:
# Rebind `db` to the 'config' database and report which collections it holds.
# NOTE(review): 'config' is normally reserved for MongoDB's own metadata — confirm
# the dataset was meant to live here rather than in a dedicated database.
db = client.get_database('config')
collections = db.list_collection_names()
print(f"Collections in the 'config' database: {collections}")

Collections in the 'config' database: ['Users', 'system.sessions', 'Movies', 'Ratings']


In [35]:
def print_collection_sample(database, collection_name, limit=5, leading_blank_line=True):
    """Print a header followed by the first `limit` documents of a collection.

    Args:
        database: Database handle that can be indexed by collection name.
        collection_name: Name of the collection to sample; also used in the header.
        limit: Maximum number of documents to fetch and print.
        leading_blank_line: When True, a blank line is printed before the header
            to separate this sample from the previous one.
    """
    header = f"{collection_name} Collection Sample:"
    print(f"\n{header}" if leading_blank_line else header)
    for document in database[collection_name].find().limit(limit):
        print(document)


# Show a handful of documents from each collection to inspect their fields.
print_collection_sample(db, 'Movies', leading_blank_line=False)
print_collection_sample(db, 'Ratings')
print_collection_sample(db, 'Users')

Movies Collection Sample:
{'_id': ObjectId('67a3388e651d35131c6e07c7'), 'movieId': 1, 'title': 'Toy Story (1995)', 'genres': "Animation|Children's|Comedy"}
{'_id': ObjectId('67a3388e651d35131c6e07c8'), 'movieId': 2, 'title': 'Jumanji (1995)', 'genres': "Adventure|Children's|Fantasy"}
{'_id': ObjectId('67a3388e651d35131c6e07c9'), 'movieId': 3, 'title': 'Grumpier Old Men (1995)', 'genres': 'Comedy|Romance'}
{'_id': ObjectId('67a3388e651d35131c6e07ca'), 'movieId': 4, 'title': 'Waiting to Exhale (1995)', 'genres': 'Comedy|Drama'}
{'_id': ObjectId('67a3388e651d35131c6e07cb'), 'movieId': 5, 'title': 'Father of the Bride Part II (1995)', 'genres': 'Comedy'}

Ratings Collection Sample:
{'_id': ObjectId('67a33899651d35131c6e16f2'), 'userId': 1, 'movieId': 1193, 'rating': 5, 'timestamp': 978300760}
{'_id': ObjectId('67a33899651d35131c6e16f3'), 'userId': 1, 'movieId': 661, 'rating': 3, 'timestamp': 978302109}
{'_id': ObjectId('67a33899651d35131c6e16f4'), 'userId': 1, 'movieId': 914, 'rating': 3, 

In [37]:
# Count the documents held by each collection (an empty filter matches every document).
movies_count, ratings_count, users_count = (
    db[name].count_documents({}) for name in ('Movies', 'Ratings', 'Users')
)

# Report the three totals, one per line.
print(
    f"Movies Collection has {movies_count} documents.",
    f"Ratings Collection has {ratings_count} documents.",
    f"Users Collection has {users_count} documents.",
    sep="\n",
)

Movies Collection has 3883 documents.
Ratings Collection has 1000209 documents.
Users Collection has 6040 documents.


In [39]:
import pandas as pd

# Pull every document of each collection into memory as a list of dicts.
movies_data = [*db['Movies'].find()]
ratings_data = [*db['Ratings'].find()]
users_data = [*db['Users'].find()]

# Wrap the raw records in DataFrames, one per collection.
movies_df, ratings_df, users_df = (
    pd.DataFrame(records) for records in (movies_data, ratings_data, users_data)
)

# Preview the first rows of each frame to check the columns that came back.
for frame in (movies_df, ratings_df, users_df):
    print(frame.head())

                        _id  movieId                               title  \
0  67a3388e651d35131c6e07c7        1                    Toy Story (1995)   
1  67a3388e651d35131c6e07c8        2                      Jumanji (1995)   
2  67a3388e651d35131c6e07c9        3             Grumpier Old Men (1995)   
3  67a3388e651d35131c6e07ca        4            Waiting to Exhale (1995)   
4  67a3388e651d35131c6e07cb        5  Father of the Bride Part II (1995)   

                         genres  
0   Animation|Children's|Comedy  
1  Adventure|Children's|Fantasy  
2                Comedy|Romance  
3                  Comedy|Drama  
4                        Comedy  
                        _id  userId  movieId  rating  timestamp
0  67a33899651d35131c6e16f2       1     1193       5  978300760
1  67a33899651d35131c6e16f3       1      661       3  978302109
2  67a33899651d35131c6e16f4       1      914       3  978301968
3  67a33899651d35131c6e16f5       1     3408       4  978300275
4  67a33899651d3513

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Preprocess the movie data: turn each pipe-separated 'genres' string into a TF-IDF vector.
# NOTE(review): the vectorizer's default tokenisation is kept as-is; genre names that
# contain punctuation (e.g. "Children's") are presumably split by it — confirm this is intended.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

# Compute the movie-by-movie cosine similarity matrix.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Example: get similar movies for a specific movieId (say, movieId = 1).
target_movie_id = 1

# cosine_sim is indexed by row position, so locate the movie by position rather than
# by DataFrame index label (the two only coincide for a default RangeIndex).
matching_rows = (movies_df['movieId'] == target_movie_id).to_numpy().nonzero()[0]
if len(matching_rows) == 0:
    raise ValueError(f"movieId {target_movie_id} not found in movies_df")
movie_idx = int(matching_rows[0])
similarities = list(enumerate(cosine_sim[movie_idx]))

# Rank the other movies by similarity. The query movie is removed explicitly by position:
# many movies share identical genres and tie at 1.0, so it is not guaranteed to sort first
# and slicing off the first element could drop a real neighbour while keeping the movie itself.
similar_movies = sorted(
    (pair for pair in similarities if pair[0] != movie_idx),
    key=lambda pair: pair[1],
    reverse=True,
)[:5]  # Top 5 similar movies
for idx, sim in similar_movies:
    # idx is a row position coming from enumerate(), hence .iloc.
    print(f"Movie: {movies_df['title'].iloc[idx]}, Similarity: {sim}")

Movie: Aladdin and the King of Thieves (1996), Similarity: 1.0
Movie: American Tail, An (1986), Similarity: 1.0
Movie: American Tail: Fievel Goes West, An (1991), Similarity: 1.0
Movie: Rugrats Movie, The (1998), Similarity: 1.0
Movie: Bug's Life, A (1998), Similarity: 1.0


In [59]:
pip install --upgrade pip setuptools wheel

Collecting pip
  Downloading pip-25.0-py3-none-any.whl.metadata (3.7 kB)
Collecting setuptools
  Using cached setuptools-75.8.0-py3-none-any.whl.metadata (6.7 kB)
Collecting wheel
  Using cached wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Downloading pip-25.0-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
    --------------------------------------- 0.0/1.8 MB 326.8 kB/s eta 0:00:06
   - -------------------------------------- 0.1/1.8 MB 438.9 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/1.8 MB 454.0 kB/s eta 0:00:04
   --- ------------------------------------ 0.2/1.8 MB 573.4 kB/s eta 0:00:03
   --- ------------------------------------ 0.2/1.8 MB 618.3 kB/s eta 0:00:03
   ---- ----------------------------------- 0.2/1.8 MB 625.1 kB/s eta 0:00:03
   ----- --------------------------

In [None]:
pip install scikit-surprise

In [55]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Fixed seed so that the train/test split and the SVD initialisation (and therefore
# the reported RMSE) are reproducible across runs.
RANDOM_STATE = 42

# Prepare data for collaborative filtering: the frame passed to Surprise holds the
# user id, item id and rating columns, in that order.
reader = Reader(rating_scale=(1, 5))  # Adjust scale if necessary
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Hold out 20% of the ratings for evaluation.
trainset, testset = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE)

# Build and fit the model (SVD) on the training ratings.
model = SVD(random_state=RANDOM_STATE)
model.fit(trainset)

# Predict the held-out ratings.
predictions = model.test(testset)

# Evaluate the model. accuracy.rmse prints the score itself by default; it is
# silenced here so the value is reported exactly once by the print below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"Root Mean Squared Error (RMSE): {rmse}")

ModuleNotFoundError: No module named 'surprise'