In [1]:
from pymongo import MongoClient

# Connect to MongoDB (adjust the connection string as needed)
client = MongoClient("mongodb://localhost:27017/")  # Replace with your MongoDB URI
db = client['movie_recommendation_system']  # Database name
db = client['config']

In [3]:
# Access the 'config' database
db = client['config']

# List all collections in the 'config' database
collections = db.list_collection_names()
print(f"Collections in the 'config' database: {collections}")

Collections in the 'config' database: ['Users', 'system.sessions', 'Movies', 'Ratings']


In [5]:
# Fetch and display a sample from the 'Movies' collection
movies_sample = db['Movies'].find().limit(5)
print("Movies Collection Sample:")
for movie in movies_sample:
    print(movie)

# Fetch and display a sample from the 'Ratings' collection
ratings_sample = db['Ratings'].find().limit(5)
print("\nRatings Collection Sample:")
for rating in ratings_sample:
    print(rating)

# Fetch and display a sample from the 'Users' collection
users_sample = db['Users'].find().limit(5)
print("\nUsers Collection Sample:")
for user in users_sample:
    print(user)

Movies Collection Sample:
{'_id': ObjectId('67a3388e651d35131c6e07c7'), 'movieId': 1, 'title': 'Toy Story (1995)', 'genres': "Animation|Children's|Comedy"}
{'_id': ObjectId('67a3388e651d35131c6e07c8'), 'movieId': 2, 'title': 'Jumanji (1995)', 'genres': "Adventure|Children's|Fantasy"}
{'_id': ObjectId('67a3388e651d35131c6e07c9'), 'movieId': 3, 'title': 'Grumpier Old Men (1995)', 'genres': 'Comedy|Romance'}
{'_id': ObjectId('67a3388e651d35131c6e07ca'), 'movieId': 4, 'title': 'Waiting to Exhale (1995)', 'genres': 'Comedy|Drama'}
{'_id': ObjectId('67a3388e651d35131c6e07cb'), 'movieId': 5, 'title': 'Father of the Bride Part II (1995)', 'genres': 'Comedy'}

Ratings Collection Sample:
{'_id': ObjectId('67a33899651d35131c6e16f2'), 'userId': 1, 'movieId': 1193, 'rating': 5, 'timestamp': 978300760}
{'_id': ObjectId('67a33899651d35131c6e16f3'), 'userId': 1, 'movieId': 661, 'rating': 3, 'timestamp': 978302109}
{'_id': ObjectId('67a33899651d35131c6e16f4'), 'userId': 1, 'movieId': 914, 'rating': 3, 

In [7]:
# Check the number of documents in each collection
movies_count = db['Movies'].count_documents({})
ratings_count = db['Ratings'].count_documents({})
users_count = db['Users'].count_documents({})

print(f"Movies Collection has {movies_count} documents.")
print(f"Ratings Collection has {ratings_count} documents.")
print(f"Users Collection has {users_count} documents.")

Movies Collection has 3883 documents.
Ratings Collection has 1000209 documents.
Users Collection has 6040 documents.


In [9]:
import pandas as pd

# Fetch all data from collections
movies_data = list(db['Movies'].find())
ratings_data = list(db['Ratings'].find())
users_data = list(db['Users'].find())

# Convert to DataFrame for easier handling
movies_df = pd.DataFrame(movies_data)
ratings_df = pd.DataFrame(ratings_data)
users_df = pd.DataFrame(users_data)

# Check the structure of the data
print(movies_df.head())
print(ratings_df.head())
print(users_df.head())

                        _id  movieId                               title  \
0  67a3388e651d35131c6e07c7        1                    Toy Story (1995)   
1  67a3388e651d35131c6e07c8        2                      Jumanji (1995)   
2  67a3388e651d35131c6e07c9        3             Grumpier Old Men (1995)   
3  67a3388e651d35131c6e07ca        4            Waiting to Exhale (1995)   
4  67a3388e651d35131c6e07cb        5  Father of the Bride Part II (1995)   

                         genres  
0   Animation|Children's|Comedy  
1  Adventure|Children's|Fantasy  
2                Comedy|Romance  
3                  Comedy|Drama  
4                        Comedy  
                        _id  userId  movieId  rating  timestamp
0  67a33899651d35131c6e16f2       1     1193       5  978300760
1  67a33899651d35131c6e16f3       1      661       3  978302109
2  67a33899651d35131c6e16f4       1      914       3  978301968
3  67a33899651d35131c6e16f5       1     3408       4  978300275
4  67a33899651d3513

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Preprocess the movie data (e.g., use the 'genres' column to create content features)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Example: Get similar movies for a specific movieId (say, movieId = 1)
movie_idx = movies_df[movies_df['movieId'] == 1].index[0]
similarities = list(enumerate(cosine_sim[movie_idx]))

# Sort the movies by similarity and get the top N most similar movies
similar_movies = sorted(similarities, key=lambda x: x[1], reverse=True)[1:6]  # Top 5 similar movies
for idx, sim in similar_movies:
    print(f"Movie: {movies_df['title'][idx]}, Similarity: {sim}")

Movie: Aladdin and the King of Thieves (1996), Similarity: 1.0
Movie: American Tail, An (1986), Similarity: 1.0
Movie: American Tail: Fievel Goes West, An (1991), Similarity: 1.0
Movie: Rugrats Movie, The (1998), Similarity: 1.0
Movie: Bug's Life, A (1998), Similarity: 1.0


In [11]:
import pandas as pd

movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
users_df = pd.read_csv('users.csv')

print(movies_df.head())
print(ratings_df.head())
print(users_df.head())

   movieId                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   userId  movieId  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
   userId gender  age  occupation    zip
0       1      F    1          10  48067
1       2      M   56          16  70072
2       3      M   25          15  55117
3       4      M   45           7  02460
4       5      M   25          20  55455


In [15]:
print(movies_df.columns)  
print(ratings_df.columns) 
print(users_df.columns)  

Index(['movieId', 'title', 'genres'], dtype='object')
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
Index(['userId', 'gender', 'age', 'occupation', 'zip'], dtype='object')


In [17]:
print(movies_df.shape)  
print(ratings_df.shape)
print(users_df.shape)

(3883, 3)
(1000209, 4)
(6040, 5)


In [19]:
merged_df = ratings_df.merge(movies_df, on="movieId", how="inner").merge(users_df, on="userId", how="inner")
print(merged_df.head())

   userId  movieId  rating  timestamp                                   title  \
0       1     1193       5  978300760  One Flew Over the Cuckoo's Nest (1975)   
1       1      661       3  978302109        James and the Giant Peach (1996)   
2       1      914       3  978301968                     My Fair Lady (1964)   
3       1     3408       4  978300275                  Erin Brockovich (2000)   
4       1     2355       5  978824291                    Bug's Life, A (1998)   

                         genres gender  age  occupation    zip  
0                         Drama      F    1          10  48067  
1  Animation|Children's|Musical      F    1          10  48067  
2               Musical|Romance      F    1          10  48067  
3                         Drama      F    1          10  48067  
4   Animation|Children's|Comedy      F    1          10  48067  


In [21]:
def get_movie_details(movie_name, merged_df):
    # Filter dataframe for the given movie name
    movie_info = merged_df[merged_df['title'].str.contains(movie_name, case=False, na=False, regex=False)]
    
    if movie_info.empty:
        return "Movie not found!"
    
    return movie_info  # Return all details

# Example usage
user_input = input("Enter movie name: ")
result = get_movie_details(user_input, merged_df)
print(result)

Enter movie name:  Jumanji


        userId  movieId  rating  timestamp           title  \
839         10        2       5  979168267  Jumanji (1995)   
1367        13        2       3  978202563  Jumanji (1995)   
1967        18        2       2  978152541  Jumanji (1995)   
2871        23        2       2  978461604  Jumanji (1995)   
3778        27        2       1  978129692  Jumanji (1995)   
...        ...      ...     ...        ...             ...   
993046    6000        2       3  956884937  Jumanji (1995)   
993544    6001        2       3  956805460  Jumanji (1995)   
994516    6006        2       2  957032380  Jumanji (1995)   
996986    6019        2       4  956761170  Jumanji (1995)   
998006    6030        2       4  956719238  Jumanji (1995)   

                              genres gender  age  occupation    zip  
839     Adventure|Children's|Fantasy      F   35           1  95370  
1367    Adventure|Children's|Fantasy      M   45           1  93304  
1967    Adventure|Children's|Fantasy      F  

In [None]:
def get_movie_summary(movie_name, merged_df):
    movie_info = merged_df[merged_df['title'].str.contains(movie_name, case=False, na=False, regex=False)]
    
    if movie_info.empty:
        return "Movie not found!"
    
    # Aggregate details
    summary = {
        "Movie ID": movie_info["movieId"].iloc[0],
        "Title": movie_info["title"].iloc[0],
        "Genres": movie_info["genres"].iloc[0],
        "Average Rating": round(movie_info["rating"].mean(), 2),
        "Total Ratings": movie_info["rating"].count(),
        "Most Common Age Group": movie_info["age"].mode()[0],
        "Most Common Occupation": movie_info["occupation"].mode()[0],
    }
    
    return summary

# Example usage
user_input = input("Enter movie name: ")
print(get_movie_summary(user_input, merged_df))

In [33]:
pip uninstall kafka-python six --yes

Found existing installation: kafka-python 2.0.2
Uninstalling kafka-python-2.0.2:
  Successfully uninstalled kafka-python-2.0.2
Found existing installation: six 1.17.0
Uninstalling six-1.17.0:
  Successfully uninstalled six-1.17.0
Note: you may need to restart the kernel to use updated packages.


In [37]:
!pip install six
!pip install kafka-python==1.4.7

Collecting six
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Using cached six-1.17.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: six
Successfully installed six-1.17.0
Collecting kafka-python==1.4.7
  Using cached kafka_python-1.4.7-py2.py3-none-any.whl.metadata (7.6 kB)
Using cached kafka_python-1.4.7-py2.py3-none-any.whl (266 kB)
Installing collected packages: kafka-python
Successfully installed kafka-python-1.4.7


In [41]:
pip install confluent-kafka

Collecting confluent-kafka
  Downloading confluent_kafka-2.8.0-cp312-cp312-win_amd64.whl.metadata (22 kB)
Downloading confluent_kafka-2.8.0-cp312-cp312-win_amd64.whl (4.0 MB)
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.0 MB 660.6 kB/s eta 0:00:06
    --------------------------------------- 0.1/4.0 MB 544.7 kB/s eta 0:00:08
    --------------------------------------- 0.1/4.0 MB 655.4 kB/s eta 0:00:06
    --------------------------------------- 0.1/4.0 MB 655.4 kB/s eta 0:00:06
   - -------------------------------------- 0.2/4.0 MB 573.4 kB/s eta 0:00:07
   -- ------------------------------------- 0.2/4.0 MB 724.0 kB/s eta 0:00:06
   -- ------------------------------------- 0.3/4.0 MB 749.3 kB/s eta 0:00:05
   --- ------------------------------------ 0.3/4.0 MB 791.9 kB/s eta 0:00:05
   --- ------------------------------------ 0.3/4.0 MB 800.8 kB

In [43]:
from confluent_kafka import Producer

In [45]:
pip show kafka-python

Name: kafka-python
Version: 1.4.7
Summary: Pure Python client for Apache Kafka
Home-page: https://github.com/dpkp/kafka-python
Author: Dana Powers
Author-email: dana.powers@gmail.com
License: Apache License 2.0
Location: C:\Users\user\anaconda3\Lib\site-packages
Requires: 
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [63]:
from confluent_kafka import Producer

# Kafka configuration
conf = {'bootstrap.servers': 'localhost:9092'}  # Change this to your Kafka server

# Create a Kafka producer
producer = Producer(conf)

# Define a delivery callback (optional)
def delivery_callback(err, msg):
    if err:
        print(f"Message failed: {err}")
    else:
        print(f"Message delivered to {msg.topic()} [{msg.partition()}]")

# Produce a message
producer.produce('ratings_topic', key='user123', value='{"userId": 123, "movieId": 50, "rating": 4.5}', callback=delivery_callback)

# Wait for all messages to be delivered
producer.flush()

Message delivered to ratings_topic [0]


0

In [None]:
from confluent_kafka import Consumer

# Kafka consumer configuration
conf = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'movie-recommendation-group',
    'auto.offset.reset': 'earliest'
}

# Create Kafka consumer
consumer = Consumer(conf)
consumer.subscribe(['ratings_topic'])

# Read messages
while True:
    msg = consumer.poll(1.0)  # Wait up to 1 second for a message
    if msg is None:
        continue
    if msg.error():
        print(f"Consumer error: {msg.error()}")
        continue

    print(f"Received message: {msg.value().decode('utf-8')}")

consumer.close()

Received message: Hello Kafka!
Received message: {"userId": 123, "movieId": 50, "rating": 4.5}
Received message: {"userId": 123, "movieId": 50, "rating": 4.5}


In [61]:
from confluent_kafka import Producer

conf = {'bootstrap.servers': 'localhost:9092'}

producer = Producer(conf)
producer.produce('ratings_topic', value='{"userId": 123, "movieId": 50, "rating": 4.5}')
producer.flush()

print("Message sent!")

Message sent!


In [None]:
from confluent_kafka.admin import AdminClient

admin_client = AdminClient({"bootstrap.servers": "localhost:9092"})
print(admin_client.list_topics().topics)