In [24]:
import pymongo
import pandas as pd
from dotenv import load_dotenv
import os
import logging

logging.basicConfig(level=logging.INFO)

In [28]:
class MongoDataExtractor:
    def __init__(self):
        """Khởi tạo và cấu hình kết nối MongoDB."""
        load_dotenv()
        self.db = self.connect_to_mongo()

    def connect_to_mongo(self):
        """Kết nối đến MongoDB."""
        mongo_uri = os.getenv('MONGO_URI')
        client = pymongo.MongoClient(mongo_uri)
        db_name = os.getenv('MONGODB_DATABASE', 'default_db_name').replace(' ', '_')
        db = client[db_name]
        return db

    def load_collection_as_dataframe(self, collection_name):
        """Lấy dữ liệu từ MongoDB và chuyển thành DataFrame."""
        collection = self.db[collection_name]
        data = list(collection.find({}))
        if not data:
            logging.warning(f"No data found in collection: {collection_name}")
            return pd.DataFrame()  
        return pd.DataFrame(data)

    def check_if_genres_processed(self):
        """Check if the movie_genres collection has been processed before."""
        return self.db['processing_flags'].find_one({'collection': 'movie_genres'}) is not None

    def mark_genres_as_processed(self):
        """Mark movie_genres as processed."""
        self.db['processing_flags'].insert_one({'collection': 'movie_genres'})

    def process_movie_genres(self, df):
        if not df.empty:
            df = df.drop(columns=['_id'])
        return df

    def process_all_collections(self):
        collections = [
            'movie_genres', 
            'movie_details', 
            'movie_reviews', 
            'movie_actor_credits', 
            'actor_details', 
            'movie_director_credits', 
            'director_details'
        ]

        transformed_data = {}

        for collection_name in collections:
            logging.info(f"Processing collection: {collection_name}")
            df = self.load_collection_as_dataframe(collection_name)

            if collection_name == 'movie_genres':
                # Check if movie_genres already processed
                if self.check_if_genres_processed():
                    logging.info(f"movie_genres already processed, skipping transformation.")
                else:
                    df = self.process_movie_genres(df)
                    self.mark_genres_as_processed()  
                    
            transformed_data[collection_name] = df

        return transformed_data

In [29]:
extractor = MongoDataExtractor()
transformed_data = extractor.process_all_collections()  

INFO:root:Processing collection: movie_genres
INFO:root:Processing collection: movie_details
INFO:root:Processing collection: movie_reviews
INFO:root:Processing collection: movie_actor_credits
INFO:root:Processing collection: actor_details
INFO:root:Processing collection: movie_director_credits
INFO:root:Processing collection: director_details


In [37]:
specific_collection = 'movie_details'

if specific_collection in transformed_data:
    df = transformed_data[specific_collection]
    print(f"DataFrame for collection: {specific_collection}, Data shape: {df.shape}")
    display(df)  
    print(f"Collection '{specific_collection}' not found in transformed data.")

DataFrame for collection: movie_details, Data shape: (11, 27)


Unnamed: 0,_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,67172d77f445c4f797c2ff5e,False,/zLj0peaxy5y2SlC6wNIQ4V0pfqg.jpg,,0,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",https://www.netflix.com/title/81476885,1139829,tt28066777,[US],...,2024-02-01,0,90,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"Hello Darkness, my new friend.",Orion and the Dark,False,6.6,480
1,67173cb7f1dde4dc987364a9,False,/xuvQDwrpKzjlGyDE6RxLfVSPWuI.jpg,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,1126475,tt27158093,[GB],...,2024-09-19,0,92,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Who says you're only young once?,Arthur's Whisky,False,5.5,2
2,6717ae3c7dc2bacbe37f9f98,False,/xuvQDwrpKzjlGyDE6RxLfVSPWuI.jpg,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,1126475,tt27158093,[GB],...,2024-09-19,0,92,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Who says you're only young once?,Arthur's Whisky,False,5.5,2
3,6717ae6c7dc2bacbe37f9fde,False,/qBvWwXEn3BJlNaiKUmI9ABuHkSZ.jpg,,0,"[{'id': 27, 'name': 'Horror'}]",,1232645,tt26902267,[US],...,2024-01-18,0,111,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Vampire Next Door,False,6.4,29
4,6717ae927dc2bacbe37f9ff8,False,/vzO729sITNJ1jB6Y9AJpIhlbvb2.jpg,,0,"[{'id': 99, 'name': 'Documentary'}, {'id': 80,...",https://www.netflix.com/title/81507283,1213997,tt30317302,[US],...,2024-01-01,0,95,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Bitconned,False,6.3,43
5,6717aeb87dc2bacbe37fa008,False,/gn0OyRVzWGWmq4vybZOdTTRjA2N.jpg,,0,"[{'id': 35, 'name': 'Comedy'}]",,1070130,tt18232550,"[FR, CH]",...,2024-01-01,0,77,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,,The Rebellious,False,3.5,5
6,6717aece7dc2bacbe37fa037,False,,,0,"[{'id': 27, 'name': 'Horror'}]",,1151317,tt27726037,[US],...,2023-10-31,0,72,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Spider Baby,False,0.0,0
7,6717aee27dc2bacbe37fa05c,False,/AdHbzoSlrTdHKIC3j9XNa7qJKdI.jpg,,13000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",,1179316,tt21352916,[RU],...,2024-01-01,0,116,"[{'english_name': 'Russian', 'iso_639_1': 'ru'...",Released,,The Bremen Town Musicians,False,5.9,16
8,6717bfb93964419b4097fa1b,False,/xuvQDwrpKzjlGyDE6RxLfVSPWuI.jpg,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,1126475,tt27158093,[GB],...,2024-09-19,0,92,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Who says you're only young once?,Arthur's Whisky,False,5.5,2
9,6717bfd26326f18e66fd154b,False,/xuvQDwrpKzjlGyDE6RxLfVSPWuI.jpg,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,1126475,tt27158093,[GB],...,2024-09-19,0,92,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Who says you're only young once?,Arthur's Whisky,False,5.5,2


Collection 'movie_details' not found in transformed data.
