## Install the Kaggle Python library

In [None]:
# Install the Kaggle API client into the notebook session (quiet output).
%pip install --q kaggle 


## Authenticate with the API
To authenticate with the API, you'll need to generate an API token (username and key) on the Settings page of your Kaggle account.

In [None]:
import os

# The credentials have to be in the environment BEFORE the kaggle package is
# imported: importing it runs the package __init__, which authenticates right
# away and fails when neither these variables nor a kaggle.json file exist.
# you will need to add your Kaggle Username and Kaggle key in the following lines:
os.environ['KAGGLE_USERNAME'] = ''
os.environ['KAGGLE_KEY'] = ''

# Deliberately imported after the environment is prepared (see note above).
from kaggle.api.kaggle_api_extended import KaggleApi  # noqa: E402

# Work from the lakehouse Files area so relative downloads are stored there.
os.chdir('/lakehouse/default/Files')

# Client object used by the following cells to download the dataset files.
api = KaggleApi()
api.authenticate()



## Get data from API, unzip, load into Spark DF and load into Lakehouse table

In [None]:
import zipfile

# Local (mounted) lakehouse folder that receives the downloaded archives and
# the extracted JSON files. Spark reads the same files through 'Files/...'.
FILES_DIR = '/lakehouse/default/Files'

# Spark DDL schema per Yelp dataset file; the key is both the suffix of the
# source file name and the name of the lakehouse table that gets written.
datasets = {
    'review': '''business_id STRING, cool INTEGER, date TIMESTAMP, funny INTEGER, review_id STRING,
            stars FLOAT, text STRING, useful INTEGER, user_id STRING''',
    'user': '''user_id string, name string, review_count integer, yelping_since string, useful integer,
            funny integer, cool integer, elite string, friends string''',
    # Fixes versus the previous schema:
    #   * 'longitude' was misspelled, so the column never matched the JSON field
    #   * business 'stars' holds half-star ratings (e.g. 3.5) -> float
    #   * 'is_open' is stored as 0/1 in the JSON -> integer
    'business': '''business_id string, name string, address string, city string, state string,
            postal_code string, latitude float, longitude float, stars float, review_count integer,
            is_open integer, attributes string, categories string, hours string''',
}

for dataset_name, schema in datasets.items():
    json_name = f'yelp_academic_dataset_{dataset_name}.json'

    # download file as zip; the target folder is passed explicitly so this cell
    # does not depend on the current working directory set elsewhere
    api.dataset_download_file('yelp-dataset/yelp-dataset', json_name, path=FILES_DIR)

    # unzip the file next to the archive
    with zipfile.ZipFile(f'{FILES_DIR}/{json_name}.zip', 'r') as zip_ref:
        zip_ref.extractall(FILES_DIR)

    # read file into spark df with schema
    df = spark.read.schema(schema).json(f'Files/{json_name}')

    # save spark df as lakehouse table
    # NOTE: no write mode is set, so re-running fails if the table already exists.
    df.write.format('delta').save(f'Tables/{dataset_name}')
