In [1]:
import os
import re
os.chdir('../')

In [2]:
import json
import time
import requests
from datetime import datetime

import numpy as np
import pandas as pd

from tqdm import tqdm
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
  



In [3]:
from src.general_utils import util

In [4]:
cities = [
    'bengaluru',
    # 'national-capital-region-ncr',
]

In [5]:
class GetMovieDetails:
    def __init__(self, path_movie_url, movie_name, date, location, if_debug=False):
        
        self.path_movie_url = path_movie_url   
        self.movie_name = movie_name 
        self.date = date
        self.location = location
        self.formatted_time = self._get_current_time()
        self.if_debug = if_debug

    def _get_current_time(self):
        # Get the current time in seconds since the epoch
        current_time_seconds = time.time()

        # Convert seconds since the epoch to a time structure
        time_struct = time.localtime(current_time_seconds)

        # Format the time in 24-hour format (HH:MM:SS)
        formatted_time = time.strftime("%H:%M", time_struct)
        return formatted_time
    
    def _get_header(self):
        headers = {
                'Referer': 'https://in.bookmyshow.com/explore/movies-bengaluru?languages=hindi',
                'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': "Linux",
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
                }
        return headers
        
    
    def _get_chrome_driver(self):
        # driver = webdriver.Chrome(ChromeDriverManager().install())
        driver = webdriver.Chrome()
        driver.get(self.path_movie_url)
        return driver

    
    def convert_to_json(self, data_categories:str):
        return json.loads(data_categories)

    def get_price(self, data_json):
        return data_json.get('price')

    def get_desc(self, data_json):
        return data_json.get('desc')

    def get_availability_class(self, data_json):
        return data_json.get('availabilityClass')

    def get_availStatus(self, data_json):
        return data_json.get('availStatus')

    def get_processed_df(self):
        driver = self._get_chrome_driver()
        
        if self.if_debug : print(driver)
        theatre_element_list = driver.find_elements(By.CLASS_NAME, 'list')
        if self.if_debug : print(f'Number of theatres : {len(theatre_element_list)}')

        data = []

        for i, a in tqdm(enumerate(theatre_element_list)):
            try:
                titles = a.find_element(By.CLASS_NAME, '__title')
                listing_info = a.find_elements(By.CLASS_NAME, 'listing-info')
                showtimes = a.find_elements(By.CLASS_NAME, 'showtime-pill')

                for s in showtimes:

                    data_categories = self.convert_to_json(s.get_attribute('data-categories'))
                    timings = s.get_attribute('data-date-time')

                    temp_data = {
                        'cinema_title' : titles.text.split('\nINFO')[0],
                        'timings': timings,
                        'latitude': a.get_attribute('data-lat'),
                        'longtitude' : a.get_attribute('data-lng'),

                        'price': self.get_price(data_categories),
                        'description': self.get_desc(data_categories),

                        'availability_class': self.get_availability_class(data_categories),
                        'avail_status': self.get_availStatus(data_categories),
                        'movie_name' : self.movie_name,
                        'date' : self.date,
                        'location' : self.location,
                        'checked_time' : self.formatted_time,
                    }
                    
                    data.append(temp_data)

                # b.find_element('showtime-pill')
            except:
                if self.if_debug : print(i)

        
        data_df = pd.DataFrame(data)

        return data_df
        

In [6]:
class GetMoviesList:
    def __init__(self, path_movie_list_url, location, date):
        self.date = date
        self.location = location
        self.path_movie_list_url = path_movie_list_url    
        self.movie_ahref_string = f'https://in.bookmyshow.com/{location}/movies'

    def _get_header(self):
        headers = {
                'Referer': 'https://in.bookmyshow.com/explore/movies-bengaluru?languages=hindi',
                'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': "Linux",
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
                }
        return headers

    def _get_chrome_driver(self):
        # driver = webdriver.Chrome(ChromeDriverManager().install())
        driver = webdriver.Chrome()
        driver.get(self.path_movie_list_url)
        
        # Scroll to buttom slowly to load all movies
        driver.execute_async_script(
            """
        count = 400;
        let callback = arguments[arguments.length - 1];
        t = setTimeout(function scrolldown(){
            console.log(count, t);
            window.scrollTo(0, count);
            if(count < (document.body.scrollHeight || document.documentElement.scrollHeight)){
              count+= 400;
              t = setTimeout(scrolldown, 1000);
            }else{
              callback((document.body.scrollHeight || document.documentElement.scrollHeight));
            }
        }, 1000);"""
        )
        return driver
    
    def parse_drive_to_soup(self, driver):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return soup
    
    def get_all_current_movie_list(self, soup):
        
        movie_soup_element_dict = {}
        
        movie_tags_list = [a for a in soup.find_all('a', href=True) if self.movie_ahref_string in a['href']]
        
        # Extract movie names from the URLs
        movie_names = []
        for link in movie_tags_list:
            # Extract the movie name from the URL
            movie_name_match = re.search(r'.*/movies/(.*)/ET.*', link['href'])
            if movie_name_match:
                movie_name = movie_name_match.group(1)
                movie_names.append(movie_name)

                movie_url = link['href']
                movie_url_basename = movie_url.split('/')[-1]
                
                movie_booking_url = f'https://in.bookmyshow.com/buytickets/{movie_name}-{self.location}/movie-bang-{movie_url_basename}-MT/{self.date}'

                movie_soup_element_dict[movie_name] = {
                    # 'element': link,
                    'url': movie_url,
                    'name': movie_name,
                    'booking_url' : movie_booking_url,
                    'date': self.date,
                    'location' : self.location,
                }
            
        # # Print the movie names
        # for movie_name in movie_names:
        #     print("Movie Name:", movie_name)

        
        print(f'Number of movies running: {len(movie_names)}')
        return movie_soup_element_dict
        
    def generate(self):
        driver = self._get_chrome_driver()
        soup = self.parse_drive_to_soup(driver)
        movie_soup_element_dict  = self.get_all_current_movie_list(soup)
        
        return movie_soup_element_dict

In [137]:
location = 'bhopal'
# Get the current date
current_date = datetime.now()
# Format the date in YYYYMMDD format
date = current_date.strftime("%Y%m%d")
# Get the current time in seconds since the epoch
current_time_seconds = time.time()

# Convert seconds since the epoch to a time structure
time_struct = time.localtime(current_time_seconds)

# Format the time in 24-hour format (HH:MM:SS)
formatted_time = time.strftime("%H:%M", time_struct)

path_movie_list_url = f'https://in.bookmyshow.com/explore/movies-{location}'
path_output_dir = 'data/'

path_output_data_csv = os.path.join(path_output_dir, f'{location}/{date}/data_{location}_{date}.csv')
path_output_data_json = os.path.join(path_output_dir, f'{location}/{date}/data_{location}_{date}.json')

print(f'path_output_data_csv: {path_output_data_csv}')

path_output_data_csv: data/bhopal/20230916/data_bhopal_20230916.csv


In [138]:
movie_list_instance = GetMoviesList(path_movie_list_url=path_movie_list_url, location=location, date=date)
movie_soup_element_dict = movie_list_instance.generate()

Number of movies running: 6


In [139]:
util.save_json(movie_soup_element_dict, path_output=path_output_data_json)

In [140]:
data_movie_list = []

for movie_name in tqdm(movie_soup_element_dict):
    path_movie_url = movie_soup_element_dict[movie_name]['booking_url']

    movie_details = GetMovieDetails(path_movie_url=path_movie_url,
                                    location=location, date=date,
                                    movie_name=movie_name,
                                    
                                    )
    movie_detail_df = movie_details.get_processed_df()
    data_movie_list.append(movie_detail_df)


data_movie_list_pd = pd.concat(data_movie_list)

97it [00:06, 14.56it/s]0:00<?, ?it/s]
39it [00:01, 23.21it/s]0:11<00:57, 11.45s/it]
10it [00:00, 68.57it/s]0:16<00:31,  7.94s/it]
13it [00:00, 51.17it/s]0:22<00:20,  6.69s/it]
84it [00:03, 22.24it/s]0:27<00:12,  6.35s/it]
11it [00:00, 55.61it/s]0:35<00:06,  6.74s/it]
100%|██████████| 6/6 [00:38<00:00,  6.50s/it]


In [146]:
if os.path.exists(path_output_data_csv):
    data_movie_list_pd.to_csv(path_output_data_csv, index=False, mode='a', header=False)
else:
    data_movie_list_pd.to_csv(path_output_data_csv, index=False, mode='a', header=True)

In [None]:
Filters - 
1. location
2. Movie
3. Date 
4. Time of day

Stats - 
1. Number of Movies. 
2. Number of Theaters 
3. Occupancy Chart 

Times Series - 
1. Number of Shows per day
2. Price of tickets per day
3. Occupancy rate per day


In [237]:
from src.data_utils import data_loader
import importlib

importlib.reload(data_loader)

<module 'src.data_utils.data_loader' from '/Users/manish.sahu/Downloads/tiler/scrap-bookmyshow/src/data_utils/data_loader.py'>

In [238]:
city = 'bengaluru'
path_data_dir = '/Users/manish.sahu/Downloads/tiler/scrap-bookmyshow/data'

movie_data_loader = data_loader.MovieDataLoder(path_data_dir)
city_df = movie_data_loader.get_city_data(city=city)

In [239]:
movie_data_loader.city_date_csv_mapping

{'bengaluru': {'20230915': ['/Users/manish.sahu/Downloads/tiler/scrap-bookmyshow/data/bengaluru/20230915/data_bengaluru_20230915.csv'],
  '20230916': ['/Users/manish.sahu/Downloads/tiler/scrap-bookmyshow/data/bengaluru/20230916/data_bengaluru_20230916.csv']},
 'bhopal': {'20230915': ['/Users/manish.sahu/Downloads/tiler/scrap-bookmyshow/data/bhopal/20230915/data_bhopal_20230915.csv'],
  '20230916': ['/Users/manish.sahu/Downloads/tiler/scrap-bookmyshow/data/bhopal/20230916/data_bhopal_20230916.csv']}}

In [240]:
current_date = datetime.now()
# Format the date in YYYYMMDD format
date = str(current_date.strftime("%Y%m%d"))
# Get the current time in seconds since the epoch
date

'20230916'

In [241]:
city_df

Unnamed: 0,cinema_title,timings,latitude,longtitude,price,description,availability_class,avail_status,movie_name,date,location,checked_time,difference,Date
0,Akash Cinemas: Laggere,10:30 AM,13.0145,77.5160,100.0,Silver Class,_available,3,tatsama-tadbhava,20230915,bengaluru,01:39,8,2023-09-15
1,Akash Cinemas: Laggere,01:30 PM,13.0145,77.5160,100.0,Silver Class,_available,3,tatsama-tadbhava,20230915,bengaluru,01:39,11,2023-09-15
2,Cinephile HSR Layout: PNR Felicity Mall Haralu...,10:15 AM,12.8963,77.6579,236.0,GOLD,_available,3,tatsama-tadbhava,20230915,bengaluru,01:39,8,2023-09-15
3,Cinephile HSR Layout: PNR Felicity Mall Haralu...,05:15 PM,12.8963,77.6579,236.0,GOLD,_available,3,tatsama-tadbhava,20230915,bengaluru,01:39,15,2023-09-15
4,Cinepolis: Binnypet Mall,11:05 AM,12.9676,77.5584,150.0,PREMIUM,_available,3,tatsama-tadbhava,20230915,bengaluru,01:39,9,2023-09-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2855,"INOX:SBR Horizon, Seegehalli Whitefield-Hoskot...",10:10 PM,13.0183,77.7625,220.0,EXECUTIVE,_available,3,kasargold,20230916,bengaluru,15:41,6,2023-09-16
2856,"PVR: MSR Elements Mall, Tanisandhra Main Road",06:45 PM,13.0452,77.6266,490.0,RECLINER,_available,3,kasargold,20230916,bengaluru,15:41,3,2023-09-16
2857,"PVR: Nexus (Formerly Forum), Koramangala",07:00 PM,12.9346,77.6111,530.0,RECLINER,_filling,2,kasargold,20230916,bengaluru,15:41,3,2023-09-16
2858,"PVR: Phoenix Marketcity Mall, Whitefield Road",07:10 PM,12.9973,77.6957,460.0,RECLINER,_available,3,kasargold,20230916,bengaluru,15:41,3,2023-09-16


In [242]:
jawan_df = city_df[city_df['movie_name'] == 'jawan']
jawan_df

Unnamed: 0,cinema_title,timings,latitude,longtitude,price,description,availability_class,avail_status,movie_name,date,location,checked_time,difference,Date
81,"Alankar Theater Hoskote Dolby Atmos, 2K Laser",04:00 PM,13.0784,77.7867,100.0,Gold,_available,3,jawan,20230915,bengaluru,01:39,14,2023-09-15
82,"Alankar Theater Hoskote Dolby Atmos, 2K Laser",07:00 PM,13.0784,77.7867,100.0,Gold,_available,3,jawan,20230915,bengaluru,01:39,17,2023-09-15
83,Anjan Digital 4K A/C Cinema: Magadi Road,07:05 PM,12.9754,77.5589,100.0,SECOND CLASS,_available,3,jawan,20230915,bengaluru,01:39,17,2023-09-15
84,Anjan Digital 4K A/C Cinema: Magadi Road,10:25 PM,12.9754,77.5589,100.0,SECOND CLASS,_available,3,jawan,20230915,bengaluru,01:39,20,2023-09-15
85,"Ashoka Theatre: Chikkabanavara, Laser Digital",04:00 PM,13.0807,77.5011,100.0,First Class,_available,3,jawan,20230915,bengaluru,01:39,14,2023-09-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2442,Victory Cinema Barco-4K RGB-Laser: Kamakshipalya,03:30 PM,12.9830,77.5302,195.0,PREMIUM,_available,3,jawan,20230916,bengaluru,15:40,23,2023-09-16
2443,Victory Cinema Barco-4K RGB-Laser: Kamakshipalya,07:00 PM,12.9830,77.5302,195.0,PREMIUM,_available,3,jawan,20230916,bengaluru,15:40,3,2023-09-16
2444,Victory Cinema Barco-4K RGB-Laser: Kamakshipalya,10:30 PM,12.9830,77.5302,195.0,PREMIUM,_available,3,jawan,20230916,bengaluru,15:40,6,2023-09-16
2445,Vinayaka Cinemas 4K Dolby 11.5 A/C 3D: Harinagar,04:00 PM,12.8751,77.5699,120.0,DIAMOMD,_available,3,jawan,20230916,bengaluru,15:40,0,2023-09-16


In [265]:
movie_df_one_hour.groupby(['Date']).agg({'count': 'sum'}).reset_index()

Unnamed: 0,Date,count
0,2023-09-15,44
1,2023-09-16,44


In [273]:
movie_city_df = city_df[city_df['movie_name'] == 'jawan']

movie_df_one_hour = movie_city_df[movie_city_df['difference'] == 0]
movie_df_one_hour['status'] = movie_df_one_hour['avail_status'].apply(lambda x: mapping_avail_status.get(x))
movie_df_one_hour['count'] = 1

movie_df_one_hour_percentage = movie_df_one_hour.groupby(['status', 'Date']).agg({'count': 'sum'}).reset_index()
total_cinema_df = movie_df_one_hour.groupby(['Date']).agg({'count': 'sum'}).reset_index()
total_cinema_df = total_cinema_df.rename(columns={'count': 'total_cinema_count'})

movie_df_one_hour_percentage = pd.merge(movie_df_one_hour_percentage, total_cinema_df, on='Date', how='left')
movie_df_one_hour_percentage['percentage'] = movie_df_one_hour_percentage.apply(lambda x: x['count']*100//x['total_cinema_count'], axis=1)
movie_df_one_hour_percentage




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,status,Date,count,total_cinema_count,percentage
0,high_available,2023-09-15,25,44,56
1,high_available,2023-09-16,25,44,56
2,low_availablity,2023-09-15,6,44,13
3,low_availablity,2023-09-16,6,44,13
4,medium_available,2023-09-15,8,44,18
5,medium_available,2023-09-16,8,44,18
6,sold,2023-09-15,5,44,11
7,sold,2023-09-16,5,44,11


In [251]:
mapping_avail_status = {
    0: 'sold',
    1: 'low_availablity',
    2: 'medium_available',
    3: 'high_available',
}


jawan_df_one_hour = jawan_df[jawan_df['difference'] == 0]
n = len(jawan_df_one_hour)

jawan_df_one_hour['status'] = jawan_df_one_hour['avail_status'].apply(lambda x: mapping_avail_status.get(x))
jawan_df_one_hour['count'] = 1

jawan_df_one_hour_percentage = jawan_df_one_hour.groupby(['status', 'Date']).agg({'count': 'sum'}).reset_index()
n = jawan_df_one_hour_percentage['count'].sum()
jawan_df_one_hour_percentage['percentage'] = jawan_df_one_hour_percentage['count'].apply(lambda x: x*100//n)
jawan_df_one_hour_percentage



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,status,Date,count,percentage
0,high_available,2023-09-15,25,28
1,high_available,2023-09-16,25,28
2,low_availablity,2023-09-15,6,6
3,low_availablity,2023-09-16,6,6
4,medium_available,2023-09-15,8,9
5,medium_available,2023-09-16,8,9
6,sold,2023-09-15,5,5
7,sold,2023-09-16,5,5


In [252]:
jawan_df_one_hour_percentage

Unnamed: 0,status,Date,count,percentage
0,high_available,2023-09-15,25,28
1,high_available,2023-09-16,25,28
2,low_availablity,2023-09-15,6,6
3,low_availablity,2023-09-16,6,6
4,medium_available,2023-09-15,8,9
5,medium_available,2023-09-16,8,9
6,sold,2023-09-15,5,5
7,sold,2023-09-16,5,5


In [250]:
jawan_df_one_hour_percentage.pivot(index='Date', columns='status', values='percentage').reset_index()


status,Date,high_available,low_availablity,medium_available,sold
0,2023-09-15,28,6,9,5
1,2023-09-16,28,6,9,5


In [222]:
import plotly.express as px


In [226]:
# Sample data
data = {
    'Category': ['A', 'B', 'C'],
    'Value1': [10, 15, 12],
    'Value2': [18, 20, 16],
    'Value3': [25, 22, 24]
}

df = pd.DataFrame(data)

# Melt the DataFrame to long format for multiple bars per category
df_melted = pd.melt(df, id_vars='Category', value_vars=['Value1', 'Value2', 'Value3'],
                    var_name='Value', value_name='Count')

# Create a bar plot
fig = px.bar(df_melted, x='Category', y='Count', color='Value',
             barmode='group', # Change to 'stack' for stacked bars
             title='Multiple Bars per Category',
             labels={'Count': 'Value'})

# Show the plot
fig.show()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [228]:
df

Unnamed: 0,Category,Value1,Value2,Value3
0,A,10,18,25
1,B,15,20,22
2,C,12,16,24


In [227]:
df_melted

Unnamed: 0,Category,Value,Count
0,A,Value1,10
1,B,Value1,15
2,C,Value1,12
3,A,Value2,18
4,B,Value2,20
5,C,Value2,16
6,A,Value3,25
7,B,Value3,22
8,C,Value3,24


In [12]:
# # for holding the resultant list
# element_list = []
  
# for page in range(1, 3, 1):
    
#     page_url = "https://webscraper.io/test-sites/e-commerce/static/computers/laptops?page=" + str(page)
#     driver = webdriver.Chrome(ChromeDriverManager().install())
#     driver.get(page_url)
#     title = driver.find_elements_by_class_name("title")
#     price = driver.find_elements_by_class_name("price")
#     description = driver.find_elements_by_class_name("description")
#     rating = driver.find_elements_by_class_name("ratings")
  
#     for i in range(len(title)):
#         element_list.append([title[i].text, price[i].text, description[i].text, rating[i].text])
  
# print(element_list)
  
# #closing the driver
# driver.close()