In [1]:
import os
import re
# Move the working directory one level up; presumably this makes the project
# root the working directory so that `config`, `src` and the relative `data/`
# paths used later resolve -- TODO confirm against the repository layout.
# NOTE(review): the path is relative, so re-running this cell in the same
# kernel moves up one more level each time.
os.chdir('../')

In [2]:
import json
import time
import requests
from datetime import datetime

import numpy as np
import pandas as pd

from tqdm import tqdm
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
  



In [22]:
from config import params

from src.general_utils import util
from src.data_utils import data_loader


In [7]:
# Location identifiers to scrape. Each active entry is interpolated into the
# BookMyShow explore URL and into the output paths by the scraping loop.
# Commented-out entries are simply not processed.
location_list = [
    'bengaluru',
    # 'national-capital-region-ncr',
    # 'mumbai',
    # 'chennai',
    # 'kolkata',
    # 'bhopal',
    # 'indore',
    # 'chandigarh',
    # 'ahmedabad',
    # 'hyderabad',
    # 'pune',
    # 'kochi',
    # 'varanasi',
    # 'jaipur',
    # 'bhubaneswar',
]

In [5]:
class GetMovieDetails:
    """Scrape the showtime listing of one movie for one location and date.

    The booking page at ``path_movie_url`` is opened in a Selenium-driven
    Chrome session. Every ``showtime-pill`` element found under each ``list``
    element of the page becomes one row of the resulting DataFrame.

    Args:
        path_movie_url: URL of the booking page to open.
        movie_name: Movie identifier copied into every output row.
        date: Date string copied into every output row.
        location: Location identifier copied into every output row.
        if_debug: When true, diagnostic information is printed while scraping.
    """

    def __init__(self, path_movie_url, movie_name, date, location, if_debug=False):
        self.path_movie_url = path_movie_url
        self.movie_name = movie_name
        self.date = date
        self.location = location
        # Time at which this instance was created; stored with every row as
        # 'checked_time'.
        self.formatted_time = self._get_current_time()
        self.if_debug = if_debug

    def _get_current_time(self):
        """Return the current local time formatted as ``HH:MM`` (24-hour)."""
        return time.strftime("%H:%M", time.localtime(time.time()))

    def _get_header(self):
        """Return a dict of browser-like HTTP request headers.

        No method of this class calls it; it is kept for external callers.
        """
        headers = {
                'Referer': 'https://in.bookmyshow.com/explore/movies-bengaluru?languages=hindi',
                'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': "Linux",
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
                }
        return headers

    def _get_chrome_driver(self):
        """Start Chrome, load ``path_movie_url`` and return the driver.

        The caller owns the returned driver and must ``quit()`` it. If the
        page load itself fails, the browser is closed here before re-raising.
        """
        # driver = webdriver.Chrome(ChromeDriverManager().install())
        driver = webdriver.Chrome()
        try:
            driver.get(self.path_movie_url)
        except Exception:
            driver.quit()
            raise
        return driver

    def convert_to_json(self, data_categories: str):
        """Parse the JSON text of a ``data-categories`` attribute."""
        return json.loads(data_categories)

    def get_price(self, data_json):
        """Return the ``'price'`` entry of a parsed category (or ``None``)."""
        return data_json.get('price')

    def get_desc(self, data_json):
        """Return the ``'desc'`` entry of a parsed category (or ``None``)."""
        return data_json.get('desc')

    def get_availability_class(self, data_json):
        """Return the ``'availabilityClass'`` entry (or ``None``)."""
        return data_json.get('availabilityClass')

    def get_availStatus(self, data_json):
        """Return the ``'availStatus'`` entry (or ``None``)."""
        return data_json.get('availStatus')

    def get_processed_df(self):
        """Scrape the booking page and return one row per showtime.

        Returns:
            pandas.DataFrame with the columns ``cinema_title``, ``timings``,
            ``latitude``, ``longtitude``, ``price``, ``description``,
            ``availability_class``, ``avail_status``, ``movie_name``,
            ``date``, ``location`` and ``checked_time``. The frame is empty
            when nothing could be scraped.
        """
        driver = self._get_chrome_driver()
        data = []

        # The browser is always closed, even when scraping fails; otherwise
        # every call would leave a Chrome process running.
        try:
            if self.if_debug : print(driver)
            theatre_element_list = driver.find_elements(By.CLASS_NAME, 'list')
            if self.if_debug : print(f'Number of theatres : {len(theatre_element_list)}')

            for i, theatre in enumerate(tqdm(theatre_element_list)):
                try:
                    titles = theatre.find_element(By.CLASS_NAME, '__title')
                    showtimes = theatre.find_elements(By.CLASS_NAME, 'showtime-pill')

                    for showtime in showtimes:
                        data_categories = self.convert_to_json(
                            showtime.get_attribute('data-categories'))
                        timings = showtime.get_attribute('data-date-time')

                        data.append({
                            'cinema_title' : titles.text.split('\nINFO')[0],
                            'timings': timings,
                            'latitude': theatre.get_attribute('data-lat'),
                            # The misspelled key is kept on purpose: it is the
                            # column name of the data this class produces.
                            'longtitude' : theatre.get_attribute('data-lng'),

                            'price': self.get_price(data_categories),
                            'description': self.get_desc(data_categories),

                            'availability_class': self.get_availability_class(data_categories),
                            'avail_status': self.get_availStatus(data_categories),
                            'movie_name' : self.movie_name,
                            'date' : self.date,
                            'location' : self.location,
                            'checked_time' : self.formatted_time,
                        })
                except Exception as exc:
                    # Best effort: an element that cannot be parsed is skipped
                    # (rows already collected from it are kept). Only
                    # Exception is caught so Ctrl-C still interrupts the run.
                    if self.if_debug : print(i, repr(exc))
        finally:
            driver.quit()

        return pd.DataFrame(data)
        

In [6]:
class GetMoviesList:
    """Collect the movies listed on a BookMyShow explore page.

    The explore page is opened in a Selenium-driven Chrome session, scrolled
    to the bottom so that lazily loaded entries are present, and every link
    pointing at ``https://in.bookmyshow.com/<location>/movies`` is turned
    into an entry holding the movie URL and a derived booking URL.

    Args:
        path_movie_list_url: URL of the explore page to open.
        location: Location identifier used to recognise movie links and to
            build booking URLs.
        date: Date string placed at the end of every booking URL.
        region_code: Code placed after ``movie-`` in the booking URL. The
            default ``'bang'`` reproduces the previously hard-coded value.
            NOTE(review): that value looks specific to Bengaluru; confirm
            the code to pass for other locations.
    """

    # Seconds the slow-scroll script may run before Selenium gives up. The
    # script advances 400px per second, so tall pages need far longer than a
    # short default timeout would allow.
    SCROLL_SCRIPT_TIMEOUT = 300

    def __init__(self, path_movie_list_url, location, date, region_code='bang'):
        self.date = date
        self.location = location
        self.region_code = region_code
        self.path_movie_list_url = path_movie_list_url
        # Prefix that identifies links to movie pages of this location.
        self.movie_ahref_string = f'https://in.bookmyshow.com/{location}/movies'

    def _get_header(self):
        """Return a dict of browser-like HTTP request headers.

        No method of this class calls it; it is kept for external callers.
        """
        headers = {
                'Referer': 'https://in.bookmyshow.com/explore/movies-bengaluru?languages=hindi',
                'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': "Linux",
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
                }
        return headers

    def _get_chrome_driver(self):
        """Start Chrome, load the explore page, scroll it and return the driver.

        The caller owns the returned driver and must ``quit()`` it. If loading
        or scrolling fails, the browser is closed here before re-raising.
        """
        # driver = webdriver.Chrome(ChromeDriverManager().install())
        driver = webdriver.Chrome()
        try:
            driver.set_script_timeout(self.SCROLL_SCRIPT_TIMEOUT)
            driver.get(self.path_movie_list_url)

            # Scroll to the bottom slowly to load all movies. The script calls
            # back (ending the async script) once the scroll offset reaches
            # the document height.
            driver.execute_async_script(
                """
        count = 400;
        let callback = arguments[arguments.length - 1];
        t = setTimeout(function scrolldown(){
            console.log(count, t);
            window.scrollTo(0, count);
            if(count < (document.body.scrollHeight || document.documentElement.scrollHeight)){
              count+= 400;
              t = setTimeout(scrolldown, 1000);
            }else{
              callback((document.body.scrollHeight || document.documentElement.scrollHeight));
            }
        }, 1000);"""
            )
        except Exception:
            driver.quit()
            raise
        return driver

    def parse_drive_to_soup(self, driver):
        """Parse the driver's current page source into a BeautifulSoup tree."""
        return BeautifulSoup(driver.page_source, 'html.parser')

    def get_all_current_movie_list(self, soup):
        """Extract the movies linked from a parsed explore page.

        Args:
            soup: Parsed page; only ``soup.find_all('a', href=True)`` is used.

        Returns:
            Dict keyed by movie name. Each value holds ``url``, ``name``,
            ``booking_url``, ``date`` and ``location``. A movie linked more
            than once appears once (the last link wins).
        """
        movie_soup_element_dict = {}

        for link in soup.find_all('a', href=True):
            movie_url = link['href']
            if self.movie_ahref_string not in movie_url:
                continue

            # The movie name is the path segment between '/movies/' and '/ET'.
            movie_name_match = re.search(r'.*/movies/(.*)/ET.*', movie_url)
            if not movie_name_match:
                continue

            movie_name = movie_name_match.group(1)
            movie_url_basename = movie_url.split('/')[-1]
            movie_booking_url = (
                f'https://in.bookmyshow.com/buytickets/{movie_name}-{self.location}'
                f'/movie-{self.region_code}-{movie_url_basename}-MT/{self.date}'
            )

            movie_soup_element_dict[movie_name] = {
                'url': movie_url,
                'name': movie_name,
                'booking_url' : movie_booking_url,
                'date': self.date,
                'location' : self.location,
            }

        # Count distinct movies, so repeated links do not inflate the number.
        print(f'Number of movies running: {len(movie_soup_element_dict)}')
        return movie_soup_element_dict

    def generate(self):
        """Open the explore page and return the dict of listed movies.

        The browser is always closed afterwards, including on failure.
        """
        driver = self._get_chrome_driver()
        try:
            soup = self.parse_drive_to_soup(driver)
            return self.get_all_current_movie_list(soup)
        finally:
            driver.quit()

In [None]:
# For every configured location: list the movies on the explore page, save
# that list as JSON, scrape the showtimes of every movie and append them to
# the location's CSV for the current date.
for location in location_list:
    print(f'Processing location: {location}')

    # Date stamp (YYYYMMDD) used in the booking URLs and the output paths.
    current_date = datetime.now()
    date = current_date.strftime("%Y%m%d")

    # Local time of this run in 24-hour HH:MM form. It is not used in this
    # cell; it is retained in case later cells read it.
    formatted_time = time.strftime("%H:%M", time.localtime())

    path_movie_list_url = f'https://in.bookmyshow.com/explore/movies-{location}'
    path_output_dir = 'data/'

    #######################
    path_output_data_csv = os.path.join(path_output_dir, f'{location}/{date}/data_{location}_{date}.csv')
    path_output_data_json = os.path.join(path_output_dir, f'{location}/{date}/data_{location}_{date}.json')

    print(f'path_output_data_csv: {path_output_data_csv}')

    #######################
    movie_list_instance = GetMoviesList(path_movie_list_url=path_movie_list_url, location=location, date=date)
    movie_soup_element_dict = movie_list_instance.generate()

    # presumably creates the output directory if it is missing -- TODO confirm
    util.check_dir(os.path.dirname(path_output_data_json))
    util.save_json(movie_soup_element_dict, path_output=path_output_data_json)

    #######################
    data_movie_list = []

    for movie_name in tqdm(movie_soup_element_dict):
        path_movie_url = movie_soup_element_dict[movie_name]['booking_url']

        movie_details = GetMovieDetails(path_movie_url=path_movie_url,
                                        location=location, date=date,
                                        movie_name=movie_name,
                                        )
        movie_detail_df = movie_details.get_processed_df()
        data_movie_list.append(movie_detail_df)

    # pd.concat raises on an empty list, so an explore page without movies
    # must not reach it.
    if not data_movie_list:
        print(f'No movies found for {location}; nothing written.')
        continue

    data_movie_list_pd = pd.concat(data_movie_list)

    # An empty frame has no columns; writing it would create a file without a
    # usable header and later appends would then omit the header for good.
    if data_movie_list_pd.empty:
        print(f'No showtimes scraped for {location}; nothing written.')
        continue

    # Append to the day's file, writing the header only when creating it.
    write_header = not os.path.exists(path_output_data_csv)
    data_movie_list_pd.to_csv(path_output_data_csv, index=False, mode='a', header=write_header)

In [10]:
# Single-location setup: derive the explore URL and the CSV/JSON output paths
# for one location and today's date.
location = 'bengaluru'
path_output_dir = 'data/'
path_movie_list_url = f'https://in.bookmyshow.com/explore/movies-{location}'

# Today's date as a YYYYMMDD stamp.
current_date = datetime.now()
date = current_date.strftime("%Y%m%d")

# Local wall-clock time of this run, formatted as 24-hour HH:MM.
current_time_seconds = time.time()
time_struct = time.localtime(current_time_seconds)
formatted_time = time.strftime("%H:%M", time_struct)

# Output files live under <output dir>/<location>/<date>/.
path_output_data_csv = os.path.join(path_output_dir, f'{location}/{date}/data_{location}_{date}.csv')
path_output_data_json = os.path.join(path_output_dir, f'{location}/{date}/data_{location}_{date}.json')

print(f'path_output_data_csv: {path_output_data_csv}')

path_output_data_csv: data/bengaluru/20230917/data_bengaluru_20230917.csv


In [11]:
# Reuse a movie list saved earlier instead of scraping the explore page again.
# The two commented lines are the matching save step.
# util.check_dir(os.path.dirname(path_output_data_json))
# util.save_json(movie_soup_element_dict, path_output=path_output_data_json)

# presumably returns the dict previously written with util.save_json -- TODO confirm
movie_soup_element_dict = util.load_json(path_output_data_json)

In [None]:
# Scrape the showtimes of every movie in `movie_soup_element_dict` and stack
# the per-movie frames into one DataFrame.
data_movie_list = []

for movie_name in tqdm(movie_soup_element_dict):
    path_movie_url = movie_soup_element_dict[movie_name]['booking_url']

    movie_details = GetMovieDetails(path_movie_url=path_movie_url,
                                    location=location, date=date,
                                    movie_name=movie_name,
                                    )
    movie_detail_df = movie_details.get_processed_df()
    data_movie_list.append(movie_detail_df)

# pd.concat raises on an empty list; fall back to an empty frame when the
# movie dict had no entries.
data_movie_list_pd = pd.concat(data_movie_list) if data_movie_list else pd.DataFrame()

In [None]:
# if os.path.exists(path_output_data_csv):
#     data_movie_list_pd.to_csv(path_output_data_csv, index=False, mode='a', header=False)
# else:
#     data_movie_list_pd.to_csv(path_output_data_csv, index=False, mode='a', header=True)

In [None]:
Filters -
1. Location
2. Movie
3. Date
4. Time of day

Stats -
1. Number of movies
2. Number of theaters
3. Occupancy chart

Time series -
1. Number of shows per day
2. Price of tickets per day
3. Occupancy rate per day


In [37]:
import importlib

# Re-import the project module so that edits made to its source file take
# effect without restarting the kernel.
importlib.reload(data_loader)


<module 'src.data_utils.data_loader' from '/Users/manish.sahu/Downloads/tiler/scrap-bookmyshow/src/data_utils/data_loader.py'>

In [50]:
# Re-import the project configuration module so that edits made to its source
# file take effect without restarting the kernel.
from config import params
importlib.reload(params)


<module 'config.params' from '/Users/manish.sahu/Downloads/tiler/scrap-bookmyshow/config/params.py'>

In [79]:
# Selection used by the analysis cells.
movie_city = 'national-capital-region-ncr'
movie_name = 'jawan'

# Resolve the data directory from the working directory instead of a
# user-specific absolute path, so the notebook runs on other machines. This is
# the same relative `data` directory the scraping cells write to.
path_data_dir = os.path.abspath('data')

config = params.MovieConfig()

movie_data_loader = data_loader.MovieDataLoder(path_data_dir)
city_df = movie_data_loader.get_city_data(city=movie_city)

In [None]:
# Share of each availability status per date for the selected movie and city.
city_df = movie_data_loader.get_city_data(movie_city)
movie_city_df = city_df[city_df['movie_name'] == movie_name]

# Keep the rows whose 'difference' is 0.
# NOTE(review): the variable name suggests this selects checks made close to
# the showtime -- confirm against how the loader computes 'difference'.
# .copy() makes the new columns land on an independent frame, not on a slice
# of `movie_city_df` (which triggers SettingWithCopyWarning).
movie_df_one_hour = movie_city_df[movie_city_df['difference'] == 0].copy()
movie_df_one_hour['status'] = movie_df_one_hour['avail_status'].apply(config.mapping_available_status.get)
movie_df_one_hour['count'] = 1

# Rows per (status, date) and rows per date.
movie_df_one_hour_percentage = movie_df_one_hour.groupby(['status', 'Date']).agg({'count': 'sum'}).reset_index()
total_cinema_df = movie_df_one_hour.groupby(['Date']).agg({'count': 'sum'}).reset_index()
total_cinema_df = total_cinema_df.rename(columns={'count': 'total_cinema_count'})

movie_df_one_hour_percentage = pd.merge(movie_df_one_hour_percentage, total_cinema_df, on='Date', how='left')

# Integer percentage (floored). Computed column-wise, which also works when no
# rows matched, unlike a row-wise apply.
movie_df_one_hour_percentage['percentage'] = (
    movie_df_one_hour_percentage['count'] * 100
    // movie_df_one_hour_percentage['total_cinema_count']
)


In [80]:
# Pin the analysis to a fixed date (YYYYMMDD). This rebinds the notebook-level
# `date` that the scraping cells derive from the current day.
date = '20230917'

In [81]:
# Presumably restricts the city data to the rows of the selected date --
# TODO confirm against MovieDataLoder.get_date_data.
date_city_df = movie_data_loader.get_date_data(data_df=city_df, date=date)


In [82]:
# Per-movie show counts for the selected city and date; the recorded output
# has the columns Movies, Number of Shows and Date.
data_movie_screen = movie_data_loader.get_movie_screen_count(
            city_df=date_city_df)
data_movie_screen

Unnamed: 0,Movies,Number of Shows,Date
3,JAWAN,46,2023-09-17
4,MARK-ANTONY,28,2023-09-17
8,THE-NUN-II,12,2023-09-17
5,MISS-SHETTY-MR-POLISHETTY,10,2023-09-17
0,A-HAUNTING-IN-VENICE,7,2023-09-17
7,THE-EQUALIZER-3,6,2023-09-17
6,ROCKY-AUR-RANI-KII-PREM-KAHAANI,4,2023-09-17
1,BARBIE,3,2023-09-17
2,DREAM-GIRL-2,2,2023-09-17


In [84]:
# Reload the city data, narrow it to the selected date, then collect the
# label lists of the cinema and movie columns.
city_df = movie_data_loader.get_city_data(movie_city)
date_city_df = movie_data_loader.get_date_data(data_df=city_df, date=date)

cinema_title_list, movie_list = (
    movie_data_loader.get_label_list(date_city_df, label_column=column_name)
    for column_name in ('cinema_title', 'movie_name')
)

In [86]:
# Display the movie labels collected for the selected city and date.
movie_list

['jawan',
 'miss-shetty-mr-polishetty',
 'mark-antony',
 'a-haunting-in-venice',
 'the-nun-ii',
 'the-equalizer-3',
 'rocky-aur-rani-kii-prem-kahaani',
 'dream-girl-2',
 'barbie']

In [None]:
# # for holding the resultant list
# element_list = []
  
# for page in range(1, 3, 1):
    
#     page_url = "https://webscraper.io/test-sites/e-commerce/static/computers/laptops?page=" + str(page)
#     driver = webdriver.Chrome(ChromeDriverManager().install())
#     driver.get(page_url)
#     title = driver.find_elements_by_class_name("title")
#     price = driver.find_elements_by_class_name("price")
#     description = driver.find_elements_by_class_name("description")
#     rating = driver.find_elements_by_class_name("ratings")
  
#     for i in range(len(title)):
#         element_list.append([title[i].text, price[i].text, description[i].text, rating[i].text])
  
# print(element_list)
  
# #closing the driver
# driver.close()

### User Proxy

In [57]:
import requests
 
# use to parse html text
from lxml.html import fromstring 
from itertools import cycle
import traceback
 
 
def to_get_proxies(max_rows=10, timeout=10):
    """Collect HTTPS-capable proxies from the free-proxy-list.net table.

    Args:
        max_rows: Number of leading table rows to inspect.
        timeout: Seconds to wait for the proxy list page before giving up.

    Returns:
        Set of ``"ip:port"`` strings taken from the inspected rows whose
        seventh cell contains "yes". The set is empty when no row qualifies.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an HTTP error status.
    """
    # website to get free proxies
    url = 'https://free-proxy-list.net/'

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    parser = fromstring(response.text)
    # using a set to avoid duplicate IP entries.
    proxies = set()

    for row in parser.xpath('//tbody/tr')[:max_rows]:

        # to check if the corresponding IP is of type HTTPS
        if row.xpath('.//td[7][contains(text(),"yes")]'):

            # Grabbing IP and corresponding PORT; rows with an empty cell
            # are skipped instead of raising IndexError.
            ip = row.xpath('.//td[1]/text()')
            port = row.xpath('.//td[2]/text()')
            if ip and port:
                proxies.add(":".join([ip[0], port[0]]))

    # Return only after every inspected row was processed, so later rows are
    # not dropped and an empty table still yields a set.
    return proxies

In [None]:
# Try the same request through up to ten proxies, rotating over the pool.
proxies = to_get_proxies()

# insert the url of the website you want to scrape.
url = ''

if not proxies:
    # cycle() over an empty (or missing) collection would make next() fail on
    # the first request, so stop early with a clear message.
    print("No proxies available.  Nothing to request.")
else:
    # to rotate through the list of IPs
    proxyPool = cycle(proxies)

    for i in range(1, 11):

        # Get a proxy from the pool
        proxy = next(proxyPool)
        print("Request #%d" % i)

        try:
            # The timeout keeps a dead proxy from blocking the loop forever.
            response = requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=10)
            print(response.json())

        except Exception:
            # Most free proxies fail to connect; a failed attempt is skipped
            # rather than retried. Only Exception is caught so that Ctrl-C
            # still interrupts the loop.
            print("Skipping.  Connection error")

In [61]:
from fake_useragent import UserAgent
# User-agent provider queried by browser name in the next cell.
ua = UserAgent()

In [64]:
# Look up a user-agent string by browser name; the recorded output is a
# Chrome-on-Linux user agent.
print(ua['google chrome'])


Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36
