# Importing Libraries

In [None]:
import pandas as pd
from datetime import datetime, timedelta

!pip install boxoffice_api
from boxoffice_api import BoxOffice
import time

# Create the shared BoxOffice client used by every data pull below.
# "DF" is the requested output format; the later cells handle each result
# as a DataFrame (column assignment, pd.concat).
box = BoxOffice(outputformat="DF")

Collecting boxoffice_api
  Downloading boxoffice_api-1.2.2-py3-none-any.whl.metadata (5.2 kB)
Collecting bs4 (from boxoffice_api)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading boxoffice_api-1.2.2-py3-none-any.whl (7.3 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4, boxoffice_api
Successfully installed boxoffice_api-1.2.2 bs4-0.0.2


# Pulling Box Office Data

We start from the year 2009 as it was the release year of the first Avatar movie.

In [None]:
# Pull the daily box-office table for every day of a single calendar year.
year = 2009
start_date = datetime(year, 1, 1)
end_date = datetime(year, 12, 31)

# One DataFrame per successfully fetched day.
all_days = []

# Inclusive day count, so that 31 December is requested as well.
total_days = (end_date - start_date).days + 1
for day_offset in range(total_days):
    day_label = (start_date + timedelta(days=day_offset)).strftime("%Y-%m-%d")
    try:
        daily_df = box.get_daily(date=day_label)
        daily_df["date"] = day_label  # tag every row with the day it was requested for
        all_days.append(daily_df)
        print(f" Pulled data for {day_label} ({len(daily_df)} records)")
    except Exception as err:
        # Best effort: report the failed day and move on to the next one.
        print(f"Skipped {day_label}: {err}")
    # Pause between requests to avoid rate limiting (adjust if needed).
    time.sleep(1.5)

 Pulled data for 2009-01-01 (51 records)
 Pulled data for 2009-01-02 (47 records)
 Pulled data for 2009-01-03 (47 records)
 Pulled data for 2009-01-04 (47 records)
 Pulled data for 2009-01-05 (46 records)
 Pulled data for 2009-01-06 (46 records)
 Pulled data for 2009-01-07 (47 records)
 Pulled data for 2009-01-08 (48 records)
 Pulled data for 2009-01-09 (47 records)
 Pulled data for 2009-01-10 (47 records)
 Pulled data for 2009-01-11 (47 records)
 Pulled data for 2009-01-12 (45 records)
 Pulled data for 2009-01-13 (45 records)
 Pulled data for 2009-01-14 (45 records)
 Pulled data for 2009-01-15 (45 records)
 Pulled data for 2009-01-16 (49 records)
 Pulled data for 2009-01-17 (49 records)
 Pulled data for 2009-01-18 (49 records)
 Pulled data for 2009-01-19 (54 records)
 Pulled data for 2009-01-20 (47 records)
 Pulled data for 2009-01-21 (47 records)
 Pulled data for 2009-01-22 (48 records)
 Pulled data for 2009-01-23 (48 records)
 Pulled data for 2009-01-24 (48 records)
 Pulled data for

In [None]:
# Combine the daily 2009 tables into one DataFrame and save it as a CSV file.
csv_2009_path = 'box_office2009_data.csv'
if all_days:
    combined_df = pd.concat(all_days, ignore_index=True)
    combined_df.to_csv(csv_2009_path, index=False)
    print(f'Data successfully saved to {csv_2009_path}')
else:
    # pd.concat raises ValueError on an empty list, which is what all_days
    # is when every daily request was skipped. Report that instead of
    # crashing, mirroring the empty-data handling of the multi-year pull.
    combined_df = pd.DataFrame()
    print('No data collected for 2009; nothing was saved.')

Data successfully saved to box_office2009_data.csv


Once the 2009 pull has been verified, we run the same code in a loop over the following years, 2010–2025.

In [None]:
# Repeat the daily pull for each year from 2010 through 2025.
all_years_data = []

for year in range(2010, 2026):
    print(f"\n=== Fetching data for {year} ===")

    start_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)

    # Daily DataFrames collected for the current year only.
    all_days = []

    # Walk every calendar day of the year, both endpoints included.
    for day_offset in range((end_date - start_date).days + 1):
        day_label = (start_date + timedelta(days=day_offset)).strftime("%Y-%m-%d")
        try:
            daily_df = box.get_daily(date=day_label)
            daily_df["date"] = day_label  # tag every row with the day it was requested for
            all_days.append(daily_df)
            print(f"Pulled data for {day_label} ({len(daily_df)} records)")
        except Exception as err:
            # Best effort: report the failed day and move on to the next one.
            print(f"Skipped {day_label}: {err}")

        # Pause between requests.
        time.sleep(1.5)

    # Nothing fetched for this year: report it and go on to the next year.
    if not all_days:
        print(f"No data collected for {year}")
        continue

    # Stack this year's daily tables into a single per-year DataFrame.
    year_df = pd.concat(all_days, ignore_index=True)
    all_years_data.append(year_df)
    print(f"Combined {len(year_df)} rows for {year}")

# Merge the per-year tables into the final dataset; fall back to an empty
# DataFrame when no year produced any data.
if not all_years_data:
    final_df = pd.DataFrame()
    print("\nNo data collected across all years.")
else:
    final_df = pd.concat(all_years_data, ignore_index=True)
    print(f"\n=== Final dataset shape: {final_df.shape} ===")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Pulled data for 2012-09-11 (51 records)
Pulled data for 2012-09-12 (51 records)
Pulled data for 2012-09-13 (51 records)
Pulled data for 2012-09-14 (56 records)
Pulled data for 2012-09-15 (56 records)
Pulled data for 2012-09-16 (56 records)
Pulled data for 2012-09-17 (52 records)
Pulled data for 2012-09-18 (52 records)
Pulled data for 2012-09-19 (52 records)
Pulled data for 2012-09-20 (51 records)
Pulled data for 2012-09-21 (57 records)
Pulled data for 2012-09-22 (57 records)
Pulled data for 2012-09-23 (57 records)
Pulled data for 2012-09-24 (53 records)
Pulled data for 2012-09-25 (53 records)
Pulled data for 2012-09-26 (54 records)
Pulled data for 2012-09-27 (55 records)
Pulled data for 2012-09-28 (59 records)
Pulled data for 2012-09-29 (57 records)
Pulled data for 2012-09-30 (56 records)
Pulled data for 2012-10-01 (51 records)
Pulled data for 2012-10-02 (52 records)
Pulled data for 2012-10-03 (53 records)
Pulled data for

In [None]:
# Write the combined multi-year dataset to a CSV file, without the index column.
all_years_csv = 'box_office_data.csv'
final_df.to_csv(all_years_csv, index=False)
print(f'Data successfully saved to {all_years_csv}')

Data successfully saved to box_office_data.csv


The two CSV files (2009 and 2010–2025) were later merged in Excel and uploaded to Google Drive for further analysis.