# Download datasets

## 0 Global settings

In [1]:
import sys
import pandas as pd
import requests
import os

# Report the interpreter and library versions in use, then confirm setup.
version_report = (
    f"System version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    f"requests version: {requests.__version__}",
    'Setup Complete',
)
print("\n".join(version_report))

System version: 3.11.5 (tags/v3.11.5:cce6ba9, Aug 24 2023, 14:38:34) [MSC v.1936 64 bit (AMD64)]
Pandas version: 2.2.2
requests version: 2.32.3
Setup Complete


## 1 Specify directories

In [2]:
# file_path: directory holding dataset_names.csv.
# output_path: directory the downloaded files are written to.
file_path, output_path = '../data/', '../data/raw/'

## 2 Load data types and names

In [3]:
# Read the table that lists each dataset's source type and file name.
dataset_names_csv = os.path.join(file_path, 'dataset_names.csv')
file_names = pd.read_csv(dataset_names_csv)
display(file_names)

Unnamed: 0,type,name
0,gutenberg,pg_catalog.csv
1,byGenre,goodreads_books_children.json.gz
2,byGenre,goodreads_interactions_children.json.gz
3,complete,goodreads_book_genres_initial.json.gz
4,complete,goodreads_book_authors.json.gz


## 3 Construct the URLs for downloading the files

In [4]:
def create_file_name_url_mapping(file_names) -> dict:
    """
    Creates a dictionary mapping file names to their download URLs based on type.

    The 'type' of each row selects a base URL and the row's 'name' is appended
    to it. When the same name occurs in several rows, the last row wins.

    Args:
        file_names (pandas.DataFrame): A DataFrame containing columns named 'name' and 'type'.

    Returns:
        dict: A dictionary mapping file names to their URLs.

    Raises:
        TypeError: If 'file_names' is not a pandas.DataFrame.
        ValueError: If the 'name' or 'type' column is missing, or if a file
            has a type other than 'gutenberg', 'complete' or 'byGenre'.
    """

    if not isinstance(file_names, pd.DataFrame):
        raise TypeError("Input 'file_names' must be a pandas.DataFrame")

    # Validate column names
    if not all(col in file_names.columns for col in ['name', 'type']):
        raise ValueError(
            "Input DataFrame must contain columns 'name' and 'type'")

    # Base URL for every supported dataset type.
    goodreads_root = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads"
    base_urls = {
        "gutenberg": "https://www.gutenberg.org/cache/epub/feeds",
        "complete": goodreads_root,
        "byGenre": f"{goodreads_root}/byGenre",
    }

    # Collapse duplicate names first so that only the last row of each name
    # is validated and mapped.
    file_name_type_mapping = dict(zip(file_names['name'].values, file_names['type'].values))
    file_name_url_mapping = {}

    for fname, ftype in file_name_type_mapping.items():
        # Non-string types (e.g. NaN from an empty CSV cell) are never supported.
        base_url = base_urls.get(ftype) if isinstance(ftype, str) else None
        if base_url is None:
            raise ValueError(f"Unsupported file type: {ftype}")
        file_name_url_mapping[fname] = f"{base_url}/{fname}"

    return file_name_url_mapping

In [5]:
# Build the file-name -> URL lookup for the listed datasets and show it.
file_name_url_mapping = create_file_name_url_mapping(file_names=file_names)
print(file_name_url_mapping)

{'pg_catalog.csv': 'https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv', 'goodreads_books_children.json.gz': 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/byGenre/goodreads_books_children.json.gz', 'goodreads_interactions_children.json.gz': 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/byGenre/goodreads_interactions_children.json.gz', 'goodreads_book_genres_initial.json.gz': 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/goodreads_book_genres_initial.json.gz', 'goodreads_book_authors.json.gz': 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/goodreads_book_authors.json.gz'}


## 4 Download datasets

In [6]:
def download_datasets(output_path: str):
    """
    Downloads datasets based on the filenames specified in a CSV file.

    Args:
        output_path (str): Directory where the downloaded files will be saved.
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for fname in file_name_url_mapping.keys():
        url = file_name_url_mapping[fname]
        local_filename = os.path.join(output_path, fname)
        try:
            with requests.get(url, stream=True) as r:
                r.raise_for_status()
                with open(local_filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            print(f'Dataset {fname} has been downloaded!')
        except Exception as e:
            print(f'Error downloading {fname}: {e}')


In [None]:
# Fetch every mapped dataset into the raw-data directory.
download_datasets(output_path=output_path)

## 5 References

- Mengting Wan, Julian McAuley, "[Item Recommendation on Monotonic Behavior Chains](https://github.com/MengtingWan/mengtingwan.github.io/raw/master/paper/recsys18_mwan.pdf)", in RecSys'18. [[bibtex](https://dblp.uni-trier.de/rec/bibtex/conf/recsys/WanM18)]
- Mengting Wan, Rishabh Misra, Ndapa Nakashole, Julian McAuley, "[Fine-Grained Spoiler Detection from Large-Scale Review Corpora](https://github.com/MengtingWan/mengtingwan.github.io/raw/master/paper/acl19_mwan.pdf)", in ACL'19. [[bibtex](https://dblp.uni-trier.de/rec/bibtex/conf/acl/WanMNM19)]
- Project Gutenberg. (n.d.). Project Gutenberg. https://www.gutenberg.org/ebooks/offline_catalogs.html