## Request Raw Data from TMDB API

### Description
This process requests the 40 currently most popular movies from the TMDB API. The response is then parsed and loaded into an Azure Data Lake Storage (ADLS) Gen2 account as raw data.

### Requirements
The following prerequisites have already been handled by my Terraform code:

- Libraries installed
- Mount point to the storage account
- TMDB API key stored as a Databricks secret

In [0]:
import datetime
import os

import pandas as pd
import requests

In [0]:
# Root folder of the raw zone on the mounted storage container.
base_path = "/mnt/movrec-container/raw"

# Date of this run; later cells use it for the year/month folder, the file
# name, and the `date` column of the saved data.
current_date = datetime.date.today()
# Same date rendered as DD-MM-YYYY.
fm_current_date = f"{current_date:%d-%m-%Y}"

In [0]:
def fetch_movie_data(page, timeout=30):
    """Fetch one page of TMDB's ``discover/movie`` listing, most popular first.

    Args:
        page: Page number of the result set to request.
        timeout: Seconds to wait for TMDB before giving up. ``requests`` has
            no default timeout, so without one a stalled connection would
            block the notebook indefinitely.

    Returns:
        The value stored under the ``results`` key of the JSON response.

    Raises:
        requests.HTTPError: If TMDB answers with an error status code.
        requests.Timeout: If TMDB does not respond within ``timeout`` seconds.
        KeyError: If the JSON response has no ``results`` key.
    """
    # The bearer token is read from the Databricks secret scope on each call
    # so it never appears in the notebook source.
    api_token = dbutils.secrets.get(scope="application", key="tmdb_api_key")
    url = "https://api.themoviedb.org/3/discover/movie"
    params = {
        "include_adult": "true",
        "include_video": "false",
        "language": "en-US",
        "page": page,
        "sort_by": "popularity.desc",
    }
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_token}",
    }

    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error payload as data.
    response.raise_for_status()

    return response.json()['results']

In [0]:
# Pages of the popularity-sorted listing to download.
pages_to_fetch = [1, 2]

# One DataFrame per downloaded page.
dataframes = [
    pd.DataFrame(fetch_movie_data(page_number))
    for page_number in pages_to_fetch
]

# Stack the pages under a fresh 0..n-1 index and stamp every row with the
# run date so each saved snapshot is self-describing.
merged_df = pd.concat(dataframes, ignore_index=True).assign(date=fm_current_date)

In [0]:
# Raw snapshots are stored under <base_path>/<year>/<month>
# (the month is not zero-padded).
path = os.path.join(base_path, str(current_date.year), str(current_date.month))

# mkdirs creates the directory together with any missing parents and does
# nothing when it already exists, so no existence probe (and no broad
# `except Exception`) is needed before calling it.
dbutils.fs.mkdirs(path)

### Notes on working with DBFS files
- The **/dbfs** prefix is required, and is the only accepted form, when specifying a WRITE path
- The **dbfs:** prefix is a valid format in Databricks when working with DBFS files, but it is optional

In [0]:
# The write path carries the /dbfs prefix in front of the mount path;
# the file is named after the run date.
output_file = f"/dbfs{path}/{fm_current_date}.parquet"
merged_df.to_parquet(output_file, index=False)

  if _pandas_api.is_sparse(col):
