### Build The Dataset

#### Set up the key by using a parse

In [35]:
# Set up the key
import os
import configparser
# Use a parser for the configuration file
import pandas as pd

configs = configparser.ConfigParser()
# Get the current directory to the main file README.md
currentDir = os.path.dirname("README.md")
# Get the path file to the config file
configDir = os.path.join(currentDir, "config/settings.cfg")
configs.read(configDir)
# Get the key
apiKey = configs.get("nytimes", "api_key")

#### Find total numbers of articles in the topic

In [36]:
import requests
subject = "subject:Asian-Americans"
query = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={subject}&api-key={apiKey}'
response = requests.get(query).json()
numHits = response['response']['meta']['hits']

#### Structure the query and parse the file

In [37]:
# Method to set up and parse the
numPerPages = 10
import time
import dateutil
import pandas as pd


def getData() -> pd.DataFrame:
    """
    Main method to send request and parse response with the subject

    :return: Dataset with all the articles parsed as panda dataframe
    """
    # Result dataset
    dataset = {'headline': [],
        'date': [],
        'doc_type': [],
        'author': [],
        'section': [],
        'url': [],
        'news_desk': [],
        'material_type': [],
        'word_count': [],
        'keywords': []}
    dataset = pd.DataFrame(dataset)
    # Count number of articles
    total = 0
    # Loop through the page
    for page in range(numHits // numPerPages):
        # Send request to the page gradually
        q = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={subject}&page={page}&api-key={apiKey}'
        r = requests.get(q).json()
        # Based on preliminary parsing, get the list of article from the file
        articleList = r['response']['docs']
        # Return dataframe
        frame = getRequest(articleList)
        # Add to dataset
        dataset = dataset.append(frame, ignore_index=True)
        # Count pages
        total += len(frame)
        # Sleep before new request
        time.sleep(6)
        # Print message to know finish with the page
        print("Finish with page", page)
    # Print when done with the numbers of total articles
    print(f'Finished with all pages, total of {str(total)}')
    # Create csv file
    csv_path = "Asians American NYT Dataset.csv"
    dataset.to_csv(csv_path, index=False)
    return dataset


def isNotValid(article) -> bool:
    """
    Method to check if the article has a valid headline

    :param article: The information of the article
    :return: True if not valid, False if valid
    """
    if type(article['headline']) == dict and 'main' in article['headline'] and article['headline']['main'] is not None:
        return False
    return True


def getRequest(articleList) -> pd.DataFrame:
    """
    Method to parse article and return as a data frame

    :param articleList: list of article from response['response']['docs']
    :return: dataframe of the article after parsing
    """
    frame = {'headline': [],
        'date': [],
        'doc_type': [],
        'author': [],
        'section': [],
        'url': [],
        'news_desk': [],
        'material_type': [],
        'word_count': [],
        'keywords': []}
    for article in articleList:
        # Check if article is valid
        if isNotValid(article):
            continue
        # Date parse
        date = dateutil.parser.parse(article['pub_date']).date()
        frame['date'].append(date)
        # Headline parse
        frame['headline'].append(article['headline']['main'])
        # Document type parse
        frame['doc_type'].append(article['document_type'])
        # Author parse
        if 'person' in article['byline']:
            person = article['byline']['person']
            names = [name['firstname'] + " " + name['lastname'] for name in person if name['firstname'] and name['lastname']]
            frame['author'].append(names)
        else:
            frame['author'].append(None)
        # Section parse
        if 'section' in article:
            frame['section'].append(article['section_name'])
        else:
            frame['section'].append(None)
        # Link URL parse
        if 'web_url' in article:
            frame['url'].append(article['web_url'])
        else:
            frame['url'].append(None)
        # News Desk parse
        if 'news_desk' in article:
            frame['news_desk'].append(article['news_desk'])
        else:
            frame['news_desk'].append(article[None])
        # Material parse:
        if 'type_of_material' in article:
            frame['material_type'].append(article['type_of_material'])
        else:
            frame['material_type'].append(None)
        # Word count parse
        if 'word_count' in article:
            frame['word_count'].append(article['word_count'])
        else:
            frame['word_count'].append(article[None])
        # Keyword parse
        keywords = [(keyword['name'], keyword['value']) for keyword in article['keywords']]
        frame['keywords'].append(keywords)

    return pd.DataFrame(frame)


### Build the dataset into a csv file

In [38]:
data = getData()
data.head(10)

Finish with page 0
Finish with page 1
Finish with page 2
Finish with page 3
Finish with page 4
Finish with page 5
Finish with page 6
Finish with page 7
Finish with page 8
Finish with page 9
Finish with page 10
Finish with page 11
Finish with page 12
Finish with page 13
Finish with page 14
Finish with page 15
Finish with page 16
Finish with page 17
Finish with page 18
Finish with page 19
Finish with page 20
Finish with page 21
Finish with page 22
Finish with page 23
Finish with page 24
Finish with page 25
Finish with page 26
Finish with page 27
Finish with page 28
Finish with page 29
Finish with page 30
Finish with page 31
Finish with page 32
Finish with page 33
Finish with page 34
Finish with page 35
Finish with page 36
Finish with page 37
Finish with page 38
Finish with page 39
Finish with page 40
Finish with page 41
Finish with page 42
Finish with page 43
Finish with page 44
Finish with page 45
Finish with page 46
Finish with page 47
Finish with page 48
Finish with page 49
Finish wit

Unnamed: 0,headline,date,doc_type,author,section,url,news_desk,material_type,word_count,keywords
0,A Rising Star’s Career Was Cut Short. His Impa...,2021-07-19,article,[Andrew LaVallee],,https://www.nytimes.com/2021/07/19/books/antho...,Books,News,1516.0,"[(persons, So, Anthony Veasna (1992-2020)), (s..."
1,Shohei Ohtani Is Just the Star America’s Pasti...,2021-07-12,article,[Kurt Streeter],,https://www.nytimes.com/2021/07/12/sports/base...,Sports,News,1032.0,"[(subject, Baseball), (organizations, Los Ange..."
2,Anti-Asian Attacks Continue as City Reopens,2021-07-14,article,[Ashley Wong],,https://www.nytimes.com/2021/07/14/nyregion/at...,Metro,briefing,1114.0,"[(glocations, New York City), (subject, Hate C..."
3,"Fear, and Discord, Among Asian Americans Over ...",2021-07-18,article,[Thomas Fuller],,https://www.nytimes.com/2021/07/18/us/asian-at...,National,News,1387.0,"[(glocations, San Francisco (Calif)), (subject..."
4,Boston Overhauls Admissions to Exclusive Exam ...,2021-07-15,article,[Ellen Barry],,https://www.nytimes.com/2021/07/15/us/boston-s...,National,News,1480.0,"[(glocations, Boston (Mass)), (subject, Educat..."
5,‘No Vaccine for Racism’: Asian New Yorkers Sti...,2021-07-14,article,"[Ali Watkins, Jonah Bromwich]",,https://www.nytimes.com/2021/07/14/nyregion/as...,Metro,News,1348.0,"[(subject, Discrimination), (subject, Hate Cri..."
6,A ‘Rogue Ballerina’ Gives a Candid Account of ...,2021-07-14,article,[Gia Kourlas],,https://www.nytimes.com/2021/07/14/arts/dance/...,Arts&Leisure,News,1742.0,"[(persons, Pazcoguin, Georgina), (subject, Dan..."
7,Violinist Apologizes for ‘Culturally Insensiti...,2021-06-28,article,[Javier Hernández],,https://www.nytimes.com/2021/06/28/arts/music/...,Culture,News,577.0,"[(subject, Discrimination), (subject, Classica..."
8,"After Years of Sex and Lies, David Choe Is Rea...",2021-06-24,article,[Edmund Lee],,https://www.nytimes.com/2021/06/24/business/me...,Business,News,1510.0,"[(subject, Art), (subject, Television), (subje..."
9,Billie Eilish Apologizes for Lip-Syncing Anti-...,2021-06-22,article,[Anna Kambhampaty],,https://www.nytimes.com/2021/06/22/style/billi...,Styles,News,361.0,"[(persons, Eilish, Billie), (subject, Asian-Am..."


The dataset is built, but the last page is not properly incorporated, so I add the method to get the final page manually.
Changing the above code would mean rerun and wait for the dataset to repopulate, which is not something I want to do.
Some thins I notice looking at the dataset:

* Some lines in the dataset seem to only have a few words, then all blank
* Some articles are repeated: differ with some words on the headlines; most of the links are the same, only differ by the number.

In [None]:
def getFinalPage(dataset):
    # Send request to the page gradually
    q = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={subject}&page=161&api-key={apiKey}'
    r = requests.get(q).json()
    # Based on preliminary parsing, get the list of article from the file
    articleList = r['response']['docs']
    # Return dataframe
    frame = getRequest(articleList)
    # Add to dataset
    dataset = dataset.append(frame, ignore_index=True)
    # Create csv file
    csv_path = "Asians American NYT Dataset.csv"
    dataset.to_csv(csv_path, index=False)
    return dataset

getFinalPage(data)

### TODO TASKS
*[7.20] Meeting with my supervisor to discuss the dataset so far and the next steps to be taken*
- Clean the data for duplicates and empty lines
- Reorganize keywords with one-hot encoding for specific subject (hate crime, discrimination, coronavirus, etc)
- Drop unnecessary column, filter date range to start with the same date on NYT Covid dataset
- Explore the dataset with the news-desk count
- Aggregate by date with the sum of the specific subject, article count before merging
- Download NYT Covid dataset with dates
- Finalize merged dataset with the date
- Distribution curve and general exploratory tasks