### Build The Dataset


*Exploratory work to familiarize myself with New York Times API requests and dataset format*

#### Set up the key, using a configparser to hide the key details

In [56]:
# Set up the key
import os
import configparser
# Use a parser for the configuration file
import pandas as pd

configs = configparser.ConfigParser()
# Directory component of the relative path "README.md". This is the empty
# string, so the config path below is resolved against the working directory.
currentDir = os.path.dirname("README.md")
# Get the path file to the config file
configDir = os.path.join(currentDir, "config/settings.cfg")
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it parsed, so fail early with a clear message if nothing was read
if not configs.read(configDir):
    raise FileNotFoundError(f"Could not read config file: {configDir}")
# Get the key
apiKey = configs.get("nytimes", "api_key")

#### Find the total number of articles on the topic from the number of hits

In [57]:
import requests
# Filter expression sent as the fq parameter of every search request
subject = "subject:Asian-Americans"
query = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={subject}&sort=newest&api-key={apiKey}'
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a KeyError on the parsed body
_hitsReply = requests.get(query, timeout=30)
_hitsReply.raise_for_status()
response = _hitsReply.json()
# Number of hits the API reports for the subject
numHits = response['response']['meta']['hits']

#### Structure the query and parse the file

In [58]:
# Methods to request every result page and parse the articles
# Page size used by getData to work out how many pages to request
numPerPages = 10
import time
import dateutil
import pandas as pd


def getData() -> pd.DataFrame:
    """
    Main method to send request and parse response with the subject

    Requests every result page for the module-level ``subject`` (the page
    count is derived from ``numHits`` and ``numPerPages``), parses each page
    with getRequest, writes the combined result to a csv file and returns it.

    :return: Dataset with all the articles parsed as panda dataframe
    :raises requests.HTTPError: if the API answers with an error status
    """
    columns = ['headline', 'date', 'doc_type', 'author', 'section', 'url',
               'news_desk', 'material_type', 'word_count', 'keywords']
    # Parsed pages are collected here and concatenated once at the end;
    # DataFrame.append was removed in pandas 2.0
    frames = []
    # Count number of articles
    total = 0

    # Ceiling division: an exact multiple of the page size must not trigger
    # a request for an extra page
    numPages = (numHits + numPerPages - 1) // numPerPages

    # Loop through the page
    for page in range(numPages):
        # Send request to the page gradually
        q = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={subject}&page={page}&sort=newest&api-key={apiKey}'
        reply = requests.get(q, timeout=30)
        # Report an HTTP error directly rather than as a KeyError on the body
        reply.raise_for_status()
        r = reply.json()
        # Based on preliminary parsing, get the list of article from the file
        articleList = r['response']['docs']
        # Parse the page into a dataframe
        frame = getRequest(articleList)
        # Keep only pages that produced rows
        if not frame.empty:
            frames.append(frame)
        # Count articles
        total += len(frame)
        # Sleep before new request (presumably to respect the API rate limit)
        time.sleep(6)
        # Print message to know finish with the page
        print("Finish with page", page)

    if frames:
        dataset = pd.concat(frames, ignore_index=True)
    else:
        # No article at all: return an empty dataset with the expected columns
        dataset = pd.DataFrame({name: [] for name in columns})

    # Print when done with the numbers of total articles
    print(f'Finished with all pages, total of {str(total)}')
    # Create csv file
    csv_path = "Asians American NYT Dataset.csv"
    dataset.to_csv(csv_path, index=False)
    return dataset


def isNotValid(article) -> bool:
    """
    Method to check if the article has a valid headline

    A headline is valid when it is a dict whose 'main' entry is not None.
    An article without a 'headline' entry is reported as not valid.

    :param article: The information of the article
    :return: True if not valid, False if valid
    """
    # .get() so that a missing 'headline' key means "not valid", not KeyError
    headline = article.get('headline')
    return not (isinstance(headline, dict) and headline.get('main') is not None)


def getRequest(articleList) -> pd.DataFrame:
    """
    Method to parse article and return as a data frame

    Articles rejected by isNotValid are skipped. Optional fields that are
    missing from an article are stored as None.

    :param articleList: list of article from response['response']['docs']
    :return: dataframe of the article after parsing, one row per valid article
    """
    # Local import: a bare "import dateutil" does not guarantee that the
    # "parser" submodule has been loaded
    import dateutil.parser

    frame = {'headline': [],
        'date': [],
        'doc_type': [],
        'author': [],
        'section': [],
        'url': [],
        'news_desk': [],
        'material_type': [],
        'word_count': [],
        'keywords': []}
    for article in articleList:
        # Check if article is valid
        if isNotValid(article):
            continue

        # Author parse: keep only people with both a first and a last name;
        # a missing or non-dict byline gives None
        byline = article.get('byline')
        if isinstance(byline, dict) and 'person' in byline:
            author = [person['firstname'] + " " + person['lastname']
                      for person in (byline['person'] or [])
                      if person.get('firstname') and person.get('lastname')]
        else:
            author = None

        # Keyword parse: (name, value) pairs; missing keywords give an empty list
        keywords = [(keyword['name'], keyword['value'])
                    for keyword in (article.get('keywords') or [])]

        # Build the whole row first so that a failure on one field can never
        # leave the columns with different lengths
        row = {
            'headline': article['headline']['main'],
            'date': dateutil.parser.parse(article['pub_date']).date(),
            'doc_type': article['document_type'],
            'author': author,
            # The value lives under 'section_name', so look up that key
            'section': article.get('section_name'),
            'url': article.get('web_url'),
            'news_desk': article.get('news_desk'),
            'material_type': article.get('type_of_material'),
            'word_count': article.get('word_count'),
            'keywords': keywords,
        }
        for column, value in row.items():
            frame[column].append(value)

    return pd.DataFrame(frame)

def getFinalPage(dataset, page=161):
    """
    Method to request one more result page and add its articles to the dataset

    :param dataset: dataframe of the articles collected so far
    :param page: number of the result page to request, 161 by default
    :return: dataframe with the parsed articles of the page added at the end;
        the same data is also written to the csv file
    :raises requests.HTTPError: if the API answers with an error status
    """
    # NOTE(review): unlike getData, this query has no sort=newest, so the page
    # may not line up with the pages fetched there — confirm this is intended
    q = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={subject}&page={page}&api-key={apiKey}'
    reply = requests.get(q, timeout=30)
    # Report an HTTP error directly rather than as a KeyError on the body
    reply.raise_for_status()
    r = reply.json()
    # Based on preliminary parsing, get the list of article from the file
    articleList = r['response']['docs']
    # Parse the page into a dataframe
    frame = getRequest(articleList)
    # Add to dataset (DataFrame.append was removed in pandas 2.0)
    dataset = pd.concat([dataset, frame], ignore_index=True)
    # Create csv file
    csv_path = "Asians American NYT Dataset.csv"
    dataset.to_csv(csv_path, index=False)
    return dataset

### TODO TASKS
*[7.20] Meeting with my supervisor to discuss the dataset so far and the next steps to be taken*
- Clean the data for duplicates and empty lines
- Reorganize keywords with one-hot encoding for specific subject (hate crime, discrimination, coronavirus, etc)
- Drop unnecessary columns; filter the date range to start on the same date as the NYT Covid dataset
- Explore the dataset with the news-desk count
- Aggregate by date (sum of each specific subject and the article count) before merging
- Download NYT Covid dataset with dates
- Finalize merged dataset with the date
- Distribution curve and general exploratory tasks

#### Tags to be considered
- 'subject', 'Assaults'
- 'subject', 'Minorities'
- 'subject', 'Workplace Hazards and Violations'
- 'subject', 'Coronavirus (2019-nCoV)'
- 'subject', 'Mass Shootings'
- 'subject', 'Quarantine (Life and Culture)'
- 'subject', 'Discrimination'
- 'subject', 'Demonstrations, Protests and Riots'
- 'subject', 'Murders, Attempted Murders and Homicides'
- 'subject', 'Race and Ethnicity'
- 'subject', 'Atlanta Spa Shootings (2021)'
- 'subject', 'Hate Crimes'
- 'subject', 'Apologies'