# In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
import re
import sqlite3
class IMDB:
    """Scrape IMDb genre listings, clean the results and load them into SQLite.

    The stages are meant to be run in this order: ``get_data`` ->
    ``clean_data`` -> ``merge_data`` -> ``store_data``.  Each stage returns
    ``True`` on success and ``False`` on failure, and reports problems with
    ``print`` instead of raising.

    Files used (all in the current working directory):
      * ``<Genre>.csv``    raw scrape output, written by ``get_data``
      * ``<Genre>-1.csv``  cleaned data, written by ``clean_data``
      * ``merge_data.csv`` all cleaned genres combined, written by ``merge_data``
      * ``IMDB.db``        SQLite database holding the ``Movies`` table
    """

    # Search-page URL; only the genre slug differs between the scraped pages.
    _URL_TEMPLATE = ("https://www.imdb.com/search/title/?title_type=feature"
                     "&genres={genre}&release_date=2024-01-01,2024-12-31")
    # Absolute XPath of the "50 more" button on the search page.
    _LOAD_MORE_XPATH = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button'
    # Absolute XPath matching one <li> per listed movie.
    _MOVIE_ITEMS_XPATH = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li'
    # Columns that together identify a row when removing duplicates.
    _DEDUP_COLUMNS = ['Title', 'Duration', 'Rating', 'Votes', 'Genre']

    def __init__(self):
        """Prepare the genre/URL lists, the Chrome driver and the database.

        A failure to start Chrome or to open/create the database is printed
        but not raised; the affected attributes then stay ``None``.
        """
        self.mergeData = ""  # replaced by a DataFrame in merge_data()
        self.genres = ["Action", "Adventure", "Animation", "Comedy", "Crime"]
        self.urls = [self._URL_TEMPLATE.format(genre=genre.lower()) for genre in self.genres]
        # Define the handles up front so that later `is None` checks work even
        # when one of the setup steps below fails.
        self.driver = None
        self.conn = None
        self.cursor = None
        try:
            self.driver = webdriver.Chrome()  # create web driver
            print("Initilized: Crome Web-driver Successfully")
        except Exception as e:
            print("Error -> Crome Web-driver Initializing[__init__() function]\n\tERROR: ", e)
        try:  # Title,Duration,Rating,Votes,Genre
            self.conn = sqlite3.connect('IMDB.db')  # database created
            self.cursor = self.conn.cursor()
            print("Initilized: Database Successfully")
            self.cursor.execute('''CREATE TABLE IF NOT EXISTS Movies (Title TEXT,Duration FLOAT,Rating FLOAT,Votes INT,Genre TEXT)''')
            print("Database Table create successfully")
        except Exception as e:
            print("Error -> Database Initializing[__init__() function]\n\tERROR: ", e)

    def store_data(self):
        """Insert ``self.mergeData`` into the ``Movies`` table and print a sample.

        On success the rows are committed, five sample rows and the table's
        column types are printed, and the connection is closed.  If the insert
        fails, the partially inserted batch is rolled back and the connection
        is left open.

        Returns:
            bool: ``True`` if every step succeeded, otherwise ``False``.
        """
        try:  # insert multiple rows into database table  #Title,Duration,Rating,Votes,Genre
            try:
                self.cursor.executemany(
                    "INSERT INTO Movies (Title, Duration, Rating, Votes, Genre) VALUES (?, ?, ?, ?, ?)",
                    self.mergeData.itertuples(index=False, name=None),
                )
                self.conn.commit()
            except Exception as e:
                print("Error - Insert data into Database[store_data() function]\n\tERROR: ", e)
                # executemany may already have inserted some rows before it
                # failed; discard them so a later commit cannot persist half
                # of a batch.
                if self.conn is not None:
                    self.conn.rollback()
                return False
            print("Cleaned data from the IMDB website was successfully inserted into the database table")
            print("Here, Sample data fron Database")
            sample_rows = self.cursor.execute("SELECT * FROM Movies LIMIT 5")  # check the rows really were inserted
            for row in sample_rows:
                print(row)
            self.cursor.execute("PRAGMA table_info(Movies);")
            columns = self.cursor.fetchall()
            # Print column names and data types
            print("Column Name | Data Type")
            print("-" * 30)
            for col in columns:
                print(f"{col[1]} | {col[2]}")
            self.conn.close()
            print("cleaned data from IMDB website inseted into database table successfully")
        except Exception as e:
            print("Error - [store_data() function]\n\tERROR: ", e)
            return False
        return True

    def merge_data(self):
        """Combine every ``<Genre>-1.csv`` into ``self.mergeData`` and ``merge_data.csv``.

        Exact duplicate rows (same title, duration, rating, votes and genre)
        are dropped, keeping the first occurrence.

        Returns:
            bool: ``False`` if any cleaned CSV cannot be read, else ``True``.
        """
        try:
            frames = [pd.read_csv(f"{genre}-1.csv") for genre in self.genres]
        except Exception as e:
            print("Error → Unable to read all CSV files [merge_data() function]\n\tERROR:", e)
            return False
        merged = pd.concat(frames, ignore_index=True)
        self.mergeData = merged.drop_duplicates(subset=self._DEDUP_COLUMNS, keep='first')
        # DataFrame.info() prints its report itself and returns None, so it
        # must not be passed to print().
        print("Info of Merged data in DataFrame:")
        self.mergeData.info()
        # The newline lives in the literal part of the f-string: a backslash
        # inside the braces is a SyntaxError before Python 3.12.
        print(f"count:\n{self.mergeData.count()}")
        self.mergeData.to_csv("merge_data.csv", index=False)
        print("merge_data.csv was created successfully")
        return True

    def change_Title(self, title):
        """Strip the leading list index ("12. ") and surrounding whitespace.

        Returns ``None`` for a missing value.  Non-string values are converted
        with ``str()`` first.
        """
        if pd.isna(title):
            return None
        return re.sub(r'^\d+\.\s*', '', str(title)).strip()  # remove index number (1., 2., 3., ...)

    def change_Duration(self, duration):
        """Convert text such as ``"2h 46m"`` to a float encoded as ``H.MM``.

        Examples: ``"2h 46m"`` -> ``2.46``, ``"45m"`` -> ``0.45``,
        ``"2h"`` -> ``2.0``.  Missing, non-string or unparsable values give
        ``0.0``; ``clean_data`` later removes rows containing a zero.
        """
        if pd.isna(duration) or not isinstance(duration, str):
            return 0.0
        hours, minutes = 0, 0
        try:
            if "h" in duration:
                hours = int(duration.split("h")[0].strip())
            if "m" in duration:
                minutes = int(duration.split("h")[-1].replace("m", "").strip())
            return float(f"{hours}.{minutes:02d}")
        except ValueError:
            # Text that merely contains an "h" or "m" but is not a duration.
            return 0.0

    def change_votes(self, vote):
        """Convert a vote count such as ``" (1.2K)"`` or ``"(1,234)"`` to an int.

        ``K``, ``M`` and ``B`` multiply by a thousand, a million and a billion.
        Values that are already numeric are returned as ``int``.  Missing or
        unparsable values give ``0``.
        """
        if pd.isna(vote):
            return 0
        if not isinstance(vote, str):
            # Already numeric (for example a CSV column parsed as integers).
            try:
                return int(vote)
            except (TypeError, ValueError, OverflowError):
                return 0
        text = vote.strip("() ").replace(",", "")
        if text.isdigit():
            return int(text)
        multiplier = 1
        for suffix, factor in (("K", 1_000), ("M", 1_000_000), ("B", 1_000_000_000)):
            if suffix in text:
                multiplier = factor
                text = text.replace(suffix, "")
                break
        try:
            # round() rather than int(): the float product can land just below
            # the intended whole number and int() would truncate it.
            return round(float(text) * multiplier)
        except (ValueError, OverflowError):
            return 0

    def clean_data(self):
        """Clean every ``<Genre>.csv`` and write the result to ``<Genre>-1.csv``.

        Per file: normalise titles (rows without a title are dropped), convert
        duration, rating and votes to numbers, drop exact duplicates, and drop
        every row in which any value equals 0.

        Returns:
            bool: ``False`` as soon as one raw CSV cannot be read, else ``True``.
        """
        for genre in self.genres:
            try:
                a = pd.read_csv(f"{genre}.csv")
            except Exception as e:
                print(f"Error -> read {genre}.csv file[clean_data() function]\n\tERROR: ", e)
                return False
            a["Title"] = a["Title"].apply(self.change_Title)
            # Drop missing titles before the string cast below, which would
            # otherwise turn them into the literal text "None".
            a = a.dropna(subset=["Title"]).copy()
            a["Title"] = a["Title"].astype("str").str.replace("'", "")
            a["Duration"] = a["Duration"].apply(self.change_Duration)
            a["Rating"] = pd.to_numeric(a["Rating"], errors="coerce").fillna(0.0).astype("float64")
            a["Votes"] = a["Votes"].apply(self.change_votes).astype("Int64")
            a["Genre"] = a["Genre"].astype("str")
            a = a.drop_duplicates(subset=self._DEDUP_COLUMNS, keep='first')
            a = a[(a != 0).all(axis=1)]  # remove a row if any of its values is 0
            a.to_csv(f"{genre}-1.csv", index=False)
            a.info()  # prints its own report; returns None
            print(f"Cleaned data in the {genre}.csv file and stored it in {genre}-1.csv successfully")
        return True

    def scrape_data(self, url):
        """Load ``url`` in the driver and collect the listed movies' details.

        The "50 more" button is clicked repeatedly until locating or clicking
        it fails; then title, duration, rating and votes are read from every
        list item.  An item for which any of the four fields cannot be read is
        skipped entirely and counted as incomplete.

        Returns:
            dict | bool: ``{'Title': [...], 'Duration': [...], 'Rating': [...],
            'Votes': [...]}`` with equally long lists, or ``False`` if the
            page could not be loaded.
        """
        data_dict = {'Title': [], 'Duration': [], 'Rating': [], 'Votes': []}
        try:
            self.driver.get(url)
            print("URL link loaded into the web driver")
        except Exception as e:
            print("Error -> load URL link[scrape_data() function]\n\tERROR: ", e)
            return False
        time.sleep(3)  # fixed pause before looking for page elements
        print("Starting to click '50 more' buttons")
        while True:
            try:
                load_more = self.driver.find_element(By.XPATH, self._LOAD_MORE_XPATH)
                ActionChains(self.driver).move_to_element(load_more).perform()
                load_more.click()
                time.sleep(3)  # fixed pause after each click
            except Exception:
                # Any failure ends the expansion loop (presumably the button
                # is no longer present once everything has been listed).
                print("click '50 more' buttons process Stopped")
                break
        movies_item = self.driver.find_elements(By.XPATH, self._MOVIE_ITEMS_XPATH)
        skipped = 0
        print("Started Extracting Movie's details from the Website")
        for movie in movies_item:
            try:
                # Read all four fields before appending anything so the four
                # lists always stay the same length.
                title = movie.find_element(By.XPATH, "./div/div/div/div[1]/div[2]/div[1]/a/h3").text
                duration = movie.find_element(By.XPATH, "./div/div/div/div[1]/div[2]/div[2]/span[2]").text
                rating = movie.find_element(By.XPATH, "./div/div/div/div[1]/div[2]/span/div/span/span[1]").text
                voting = movie.find_element(By.XPATH, "./div/div/div/div[1]/div[2]/span/div/span/span[2]").text
            except Exception:
                skipped += 1
                continue
            data_dict['Title'].append(title)
            data_dict['Duration'].append(duration)
            data_dict['Rating'].append(rating)
            data_dict['Votes'].append(voting)
        print(f"Incomplete - {skipped} Movie's details cannot be Extracted from Website")
        return data_dict

    def get_data(self):
        """Scrape every genre URL and write one ``<Genre>.csv`` per genre.

        The driver is quit when scraping ends, whether it finished, failed or
        raised.

        Returns:
            bool: ``False`` if the driver was never created or a page could
            not be loaded, otherwise ``True``.
        """
        if getattr(self, "driver", None) is None:  # Check if web driver is created
            print("Crome Web-driver not Initialized")
            return False
        try:
            for genre, url in zip(self.genres, self.urls):
                print(f"Scraping {genre} movies...")
                data = self.scrape_data(url)
                if data is False:
                    # scrape_data already printed the reason; False cannot be
                    # turned into a DataFrame.
                    return False
                df = pd.DataFrame(data)
                df['Genre'] = genre
                df.to_csv(f"{genre}.csv", index=False)
                print(f"Scraped {df['Title'].count()} {genre} Movie details stored in {genre}.csv successfully")
        finally:
            # Always release the browser, even when a genre fails part-way.
            self.driver.quit()
        print("Data scraping process completed")
        return True
obj = IMDB()
# Run the pipeline stages in order. `and` short-circuits, so a stage that
# reports failure (a falsy return) prevents every later stage from running.
if obj.get_data() and obj.clean_data() and obj.merge_data() and obj.store_data():
    print("Code Execution complete")