In [1]:
import os
import re
os.chdir('../')

In [25]:
import json
import time
import requests
import numpy as np
import pandas as pd

from tqdm import tqdm
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
  

In [4]:
from src.general_utils import util

In [5]:
cities = [
    'bengaluru',
    # 'national-capital-region-ncr',
]

In [26]:
class GetMovieDetails:
    def __init__(self, path_movie_url, movie_name, date, location):
        
        self.path_movie_url = path_movie_url   
        self.movie_name = movie_name 
        self.date = date
        self.location = location
        self.formatted_time = self._get_current_time()

    def _get_current_time(self):
        # Get the current time in seconds since the epoch
        current_time_seconds = time.time()

        # Convert seconds since the epoch to a time structure
        time_struct = time.localtime(current_time_seconds)

        # Format the time in 24-hour format (HH:MM:SS)
        formatted_time = time.strftime("%H:%M", time_struct)
        return formatted_time
    
    def _get_header(self):
        headers = {
                'Referer': 'https://in.bookmyshow.com/explore/movies-bengaluru?languages=hindi',
                'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': "Linux",
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
                }
        return headers
        
    
    def _get_chrome_driver(self):
        # driver = webdriver.Chrome(ChromeDriverManager().install())
        driver = webdriver.Chrome()
        driver.get(self.path_movie_url)
        return driver

    
    def convert_to_json(self, data_categories:str):
        return json.loads(data_categories)

    def get_price(self, data_json):
        return data_json.get('price')

    def get_desc(self, data_json):
        return data_json.get('desc')

    def get_availability_class(self, data_json):
        return data_json.get('availabilityClass')

    def get_availStatus(self, data_json):
        return data_json.get('availStatus')

    def get_processed_df(self):
        driver = self._get_chrome_driver()
        
        print(driver)
        theatre_element_list = driver.find_elements(By.CLASS_NAME, 'list')
        print(f'Number of theatres : {len(theatre_element_list)}')

        data = []

        for i, a in tqdm(enumerate(theatre_element_list)):
            try:
                titles = a.find_element(By.CLASS_NAME, '__title')
                listing_info = a.find_elements(By.CLASS_NAME, 'listing-info')
                showtimes = a.find_elements(By.CLASS_NAME, 'showtime-pill')

                for s in showtimes:

                    data_categories = self.convert_to_json(s.get_attribute('data-categories'))
                    timings = s.get_attribute('data-date-time')

                    temp_data = {
                        'cinema_title' : titles.text,
                        'timings': timings,
                        'latitude': a.get_attribute('data-lat'),
                        'longtitude' : a.get_attribute('data-lng'),

                        'price': self.get_price(data_categories),
                        'description': self.get_desc(data_categories),

                        'availability_class': self.get_availability_class(data_categories),
                        'avail_status': self.get_availStatus(data_categories),
                        'movie_name' : self.movie_name,
                        'date' : self.date,
                        'location' : self.location,
                        'checked_time' : self.formatted_time,
                    }
                    
                    data.append(temp_data)

                # b.find_element('showtime-pill')
            except:
                print(i)

        
        data_df = pd.DataFrame(data)

        return data_df
        

In [9]:
class GetMoviesList:
    def __init__(self, path_movie_list_url, location, date):
        self.date = date
        self.location = location
        self.path_movie_list_url = path_movie_list_url    
        self.movie_ahref_string = f'https://in.bookmyshow.com/{location}/movies'

    def _get_header(self):
        headers = {
                'Referer': 'https://in.bookmyshow.com/explore/movies-bengaluru?languages=hindi',
                'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': "Linux",
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
                }
        return headers

    def _get_chrome_driver(self):
        # driver = webdriver.Chrome(ChromeDriverManager().install())
        driver = webdriver.Chrome()
        driver.get(self.path_movie_list_url)
        
        # Scroll to buttom slowly to load all movies
        driver.execute_async_script(
            """
        count = 400;
        let callback = arguments[arguments.length - 1];
        t = setTimeout(function scrolldown(){
            console.log(count, t);
            window.scrollTo(0, count);
            if(count < (document.body.scrollHeight || document.documentElement.scrollHeight)){
              count+= 400;
              t = setTimeout(scrolldown, 1000);
            }else{
              callback((document.body.scrollHeight || document.documentElement.scrollHeight));
            }
        }, 1000);"""
        )
        return driver
    
    def parse_drive_to_soup(self, driver):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return soup
    
    def get_all_current_movie_list(self, soup):
        
        movie_soup_element_dict = {}
        
        movie_tags_list = [a for a in soup.find_all('a', href=True) if self.movie_ahref_string in a['href']]
        
        # Extract movie names from the URLs
        movie_names = []
        for link in movie_tags_list:
            # Extract the movie name from the URL
            movie_name_match = re.search(r'.*/movies/(.*)/ET.*', link['href'])
            if movie_name_match:
                movie_name = movie_name_match.group(1)
                movie_names.append(movie_name)

                movie_url = link['href']
                movie_url_basename = movie_url.split('/')[-1]
                
                movie_booking_url = f'https://in.bookmyshow.com/buytickets/{movie_name}-{self.location}/movie-bang-{movie_url_basename}-MT/{self.date}'

                movie_soup_element_dict[movie_name] = {
                    # 'element': link,
                    'url': movie_url,
                    'name': movie_name,
                    'booking_url' : movie_booking_url,
                    'date': self.date,
                    'location' : self.location,
                }
            
        # # Print the movie names
        # for movie_name in movie_names:
        #     print("Movie Name:", movie_name)

        
        print(f'Number of movies running: {len(movie_names)}')
        return movie_soup_element_dict
        
    def generate(self):
        driver = self._get_chrome_driver()
        soup = self.parse_drive_to_soup(driver)
        movie_soup_element_dict  = self.get_all_current_movie_list(soup)
        
        return movie_soup_element_dict

In [12]:
location = 'bengaluru'
date = '20230916'
path_movie_list_url = f'https://in.bookmyshow.com/explore/movies-{location}'
movie_list_instance = GetMoviesList(path_movie_list_url=path_movie_list_url, location=location, date=date)
movie_soup_element_dict = movie_list_instance.generate()

Number of movies running: 27


In [13]:
path_output = f'data/{location}_scrapped_{date}.json'
util.save_json(movie_soup_element_dict, path_output=path_output)

In [15]:
data_movie_list = []

for movie_name in tqdm(movie_soup_element_dict):
    path_movie_url = movie_soup_element_dict[movie_name]['booking_url']

    movie_details = GetMovieDetails(path_movie_url=path_movie_url,
                                    location=location, date=date,
                                    movie_name=movie_name,
                                    
                                    )
    movie_detail_df = movie_details.get_processed_df()
    data_movie_list.append(movie_detail_df)


  0%|          | 0/27 [00:00<?, ?it/s]

<selenium.webdriver.chrome.webdriver.WebDriver (session="4d51a17dd2c99c98f64192cf1df8a36c")>
Number of theatres : 48




0
1
2
3
4
5


48it [00:02, 19.49it/s]
  4%|▎         | 1/27 [00:18<07:51, 18.14s/it]

46
47
<selenium.webdriver.chrome.webdriver.WebDriver (session="86cb2cd6a6fee1c2ce91549be26649ab")>
Number of theatres : 97




0
1
2
3
4
5


97it [00:12,  7.83it/s]
  7%|▋         | 2/27 [00:44<09:30, 22.82s/it]

95
96
<selenium.webdriver.chrome.webdriver.WebDriver (session="7b9a52a3abaab4d6e01acca1a8e96cee")>
Number of theatres : 93




0
1
2
3
4
5


93it [00:06, 13.57it/s]
 11%|█         | 3/27 [01:06<09:03, 22.66s/it]

91
92
<selenium.webdriver.chrome.webdriver.WebDriver (session="03a2ec146124f21f7271ea09ce9bc588")>
Number of theatres : 68




0
1
2
3
4
5




34


68it [00:04, 15.93it/s]
 15%|█▍        | 4/27 [01:25<08:10, 21.32s/it]

66
67
<selenium.webdriver.chrome.webdriver.WebDriver (session="57e45ebfa84a7eca01388900eb0fc78f")>
Number of theatres : 57




0
1
2
3
4
5


57it [00:02, 19.18it/s]
 19%|█▊        | 5/27 [01:48<08:00, 21.85s/it]

55
56
<selenium.webdriver.chrome.webdriver.WebDriver (session="6e264ba505924fcc62f1618526aa3398")>
Number of theatres : 47




0
1
2
3
4
5


47it [00:02, 19.79it/s]
 22%|██▏       | 6/27 [02:10<07:35, 21.71s/it]

45
46
<selenium.webdriver.chrome.webdriver.WebDriver (session="2ab71785a5f4ea005b9a9354c5b86a8d")>
Number of theatres : 35




0
1
2
3
4
5


35it [00:01, 21.65it/s]
 26%|██▌       | 7/27 [02:31<07:10, 21.51s/it]

33
34
<selenium.webdriver.chrome.webdriver.WebDriver (session="ea1cdd9a5abbef6f2154e2b1ff7e6091")>
Number of theatres : 19




0
1
2
3
4
5


19it [00:00, 34.28it/s]
 30%|██▉       | 8/27 [02:46<06:08, 19.38s/it]

17
18
<selenium.webdriver.chrome.webdriver.WebDriver (session="2de1cf070f769e32ba071db319191c00")>
Number of theatres : 21




0
1
2
3
4
5


21it [00:00, 35.08it/s]
 33%|███▎      | 9/27 [03:03<05:35, 18.62s/it]

19
20
<selenium.webdriver.chrome.webdriver.WebDriver (session="2bb01f6dcb09a473745f4dafac0e2c38")>
Number of theatres : 12




0
1
2
3
4
5


12it [00:00, 40.53it/s]
 37%|███▋      | 10/27 [03:20<05:11, 18.34s/it]

10
11
<selenium.webdriver.chrome.webdriver.WebDriver (session="f5dc9ec6e528285b9d1052ba07bea101")>
Number of theatres : 11




0
1
2
3
4
5


11it [00:00, 55.23it/s]


9
10


 41%|████      | 11/27 [03:38<04:49, 18.09s/it]

<selenium.webdriver.chrome.webdriver.WebDriver (session="87927b790414a879dae8db8d825817c9")>
Number of theatres : 21




0
1
2
3
4
5


21it [00:00, 36.28it/s]
 44%|████▍     | 12/27 [04:01<04:52, 19.53s/it]

19
20
<selenium.webdriver.chrome.webdriver.WebDriver (session="e3eacf7efeeaed1dd936d986749d6580")>
Number of theatres : 18




0
1
2
3
4
5


18it [00:00, 38.38it/s]
 48%|████▊     | 13/27 [04:26<04:56, 21.18s/it]

16
17
<selenium.webdriver.chrome.webdriver.WebDriver (session="f39cab37dc5df00ba0c6f8f0b322fb1b")>
Number of theatres : 20




0
1
2
3
4
5


20it [00:00, 39.28it/s]
 52%|█████▏    | 14/27 [04:42<04:14, 19.60s/it]

18
19
<selenium.webdriver.chrome.webdriver.WebDriver (session="e7afa08afb392d2bf214049339b72b9c")>
Number of theatres : 16




0
1
2
3
4
5


16it [00:00, 34.86it/s]
 56%|█████▌    | 15/27 [04:56<03:36, 18.08s/it]

14
15
<selenium.webdriver.chrome.webdriver.WebDriver (session="a6d5d97112704c2630353d3f260a3dc9")>
Number of theatres : 18




0
1
2
3
4
5


18it [00:00, 39.80it/s]
 59%|█████▉    | 16/27 [05:12<03:11, 17.43s/it]

16
17
<selenium.webdriver.chrome.webdriver.WebDriver (session="77e219013ff26ba5f8e0ed496b1cfd69")>
Number of theatres : 16




0
1
2
3
4
5


16it [00:00, 39.59it/s]
 63%|██████▎   | 17/27 [05:25<02:40, 16.07s/it]

14
15
<selenium.webdriver.chrome.webdriver.WebDriver (session="33249f1bdec4a45ebff8ce75fec387fc")>
Number of theatres : 14




0
1
2
3
4
5


14it [00:00, 41.87it/s]
 67%|██████▋   | 18/27 [05:43<02:29, 16.58s/it]

12
13
<selenium.webdriver.chrome.webdriver.WebDriver (session="9ea88d0ced5b786d3cbe6b207d5917b5")>
Number of theatres : 13




0
1
2
3
4
5


13it [00:00, 45.27it/s]
 70%|███████   | 19/27 [06:04<02:23, 17.98s/it]

11
12
<selenium.webdriver.chrome.webdriver.WebDriver (session="438bfd5adb1aa4f028707e5233a47dec")>
Number of theatres : 12




0
1
2
3
4
5
10


12it [00:00, 57.71it/s]
 74%|███████▍  | 20/27 [06:20<02:01, 17.38s/it]

11
<selenium.webdriver.chrome.webdriver.WebDriver (session="ad2ef4adf46b96220c671bc45cce3a29")>
Number of theatres : 11


11it [00:00, 58.75it/s]

0
1
2
3
4
5
9
10



 78%|███████▊  | 21/27 [06:35<01:40, 16.67s/it]

<selenium.webdriver.chrome.webdriver.WebDriver (session="ff90afc77d3cdbf4d18e3363534c8f7b")>
Number of theatres : 12




0
1
2
3
4
5


12it [00:00, 49.34it/s]
 81%|████████▏ | 22/27 [06:51<01:22, 16.50s/it]

10
11
<selenium.webdriver.chrome.webdriver.WebDriver (session="f0c7eef84208d88290761aaa71907354")>
Number of theatres : 11




0
1
2
3
4
5


11it [00:00, 54.65it/s]

9
10



 85%|████████▌ | 23/27 [07:16<01:15, 18.93s/it]

<selenium.webdriver.chrome.webdriver.WebDriver (session="1ee4c4f56982681442b7c2a47a9b40f8")>
Number of theatres : 10


10it [00:00, 66.03it/s]

0
1
2
3
4
5
8
9



 89%|████████▉ | 24/27 [07:30<00:53, 17.68s/it]

<selenium.webdriver.chrome.webdriver.WebDriver (session="07b75db787d86f494f1874c33196bf38")>
Number of theatres : 9


9it [00:00, 87.78it/s]
 93%|█████████▎| 25/27 [07:46<00:34, 17.07s/it]

0
1
2
3
4
5
7
8
<selenium.webdriver.chrome.webdriver.WebDriver (session="42158a4e85cf5c90c65795debdb239e0")>
Number of theatres : 12




0
1
2
3
4
5


12it [00:00, 46.45it/s]
 96%|█████████▋| 26/27 [07:59<00:15, 15.96s/it]

10
11
<selenium.webdriver.chrome.webdriver.WebDriver (session="416b5da3439ac3b4a1c5b2a9cb88a944")>
Number of theatres : 9


9it [00:00, 88.27it/s]
100%|██████████| 27/27 [08:16<00:00, 18.37s/it]

0
1
2
3
4
5
7
8





In [17]:
data_movie_list_pd = pd.concat(data_movie_list)

In [18]:
data_movie_list_pd

Unnamed: 0,title,timings,latitude,longtitude,price,description,availability_class,avail_status
0,Akash Cinemas: Laggere\nINFO,10:30 AM,13.0145,77.516,100.00,Silver Class,_available,3
1,Akash Cinemas: Laggere\nINFO,01:30 PM,13.0145,77.516,100.00,Silver Class,_available,3
2,Cinephile HSR Layout: PNR Felicity Mall Haralu...,10:15 AM,12.8963,77.6579,236.00,GOLD,_available,3
3,Cinephile HSR Layout: PNR Felicity Mall Haralu...,05:15 PM,12.8963,77.6579,236.00,GOLD,_available,3
4,Cinepolis: Binnypet Mall\nINFO,11:05 AM,12.9676,77.5584,150.00,PREMIUM,_available,3
...,...,...,...,...,...,...,...,...
0,"PVR: Nexus (Formerly Forum), Koramangala\nINFO",10:05 PM,12.9346,77.6111,510.00,RECLINER,_available,3
1,"PVR: Orion Mall, Dr Rajkumar Road\nINFO",02:00 PM,13.011,77.5551,410.00,RECLINER,_available,3
2,"PVR: Phoenix Marketcity Mall, Whitefield Road\...",09:30 PM,12.9973,77.6957,480.00,RECLINER,_available,3
3,"PVR: Vega City, Bannerghatta Road\nINFO",04:25 PM,12.9077,77.6012,410.00,RECLINER,_available,3


In [204]:
# # for holding the resultant list
# element_list = []
  
# for page in range(1, 3, 1):
    
#     page_url = "https://webscraper.io/test-sites/e-commerce/static/computers/laptops?page=" + str(page)
#     driver = webdriver.Chrome(ChromeDriverManager().install())
#     driver.get(page_url)
#     title = driver.find_elements_by_class_name("title")
#     price = driver.find_elements_by_class_name("price")
#     description = driver.find_elements_by_class_name("description")
#     rating = driver.find_elements_by_class_name("ratings")
  
#     for i in range(len(title)):
#         element_list.append([title[i].text, price[i].text, description[i].text, rating[i].text])
  
# print(element_list)
  
# #closing the driver
# driver.close()