## Build New Dataset

### TODO TASKS
*[7.20] Meeting with my supervisor to discuss the dataset so far and the next steps to be taken*
- Clean the data for duplicates and empty lines
- Reorganize keywords with one-hot encoding for specific subjects (hate crime, discrimination, coronavirus, etc.)
- Drop unnecessary columns; filter the date range to start on the same date as the NYT Covid dataset
- Explore the dataset with the news-desk count
- Aggregate by date (sum of each specific subject and the article count) before merging
- Download NYT Covid dataset with dates
- Finalize merged dataset with the date
- Distribution curve and general exploratory tasks

#### Tags to be considered
- 'subject', 'Assaults'
- 'subject', 'Minorities'
- 'subject', 'Workplace Hazards and Violations'
- 'subject', 'Coronavirus (2019-nCoV)'
- 'subject', 'Mass Shootings'
- 'subject', 'Quarantine (Life and Culture)'
- 'subject', 'Discrimination'
- 'subject', 'Demonstrations, Protests and Riots'
- 'subject', 'Murders, Attempted Murders and Homicides'
- 'subject', 'Race and Ethnicity'
- 'subject', 'Atlanta Spa Shootings (2021)'
- 'subject', 'Hate Crimes'
- 'subject', 'Apologies'


#### Set up the key and use a configparser to hide the key details

In [None]:
# Set up the key
import os
import configparser
# Use a parser for the configuration file
import numpy as np
import pandas as pd

configs = configparser.ConfigParser()
# NOTE: os.path.dirname("README.md") evaluates to "" because the name has no
# directory component, so the settings path below is resolved relative to the
# current working directory (expected to be the folder that holds README.md).
currentDir = os.path.dirname("README.md")
# Path to the configuration file that keeps the API key out of the notebook
configDir = os.path.join(currentDir, "config/settings.cfg")
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail early with a clear message instead of a
# confusing NoSectionError on the lookup below.
if not configs.read(configDir):
    raise FileNotFoundError(f"Could not read configuration file: {configDir!r}")
# Get the key
apiKey = configs.get("nytimes", "api_key")

#### Find the total number of articles on the topic from the number of hits

In [None]:
import requests
# Filter query: only articles tagged with the "Asian-Americans" subject
subject = "subject:Asian-Americans"
query = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={subject}&sort=newest&api-key={apiKey}'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (bad key, rate limit, ...) directly
# instead of as a KeyError on the missing 'response' field below.
hitsReply = requests.get(query, timeout=30)
hitsReply.raise_for_status()
response = hitsReply.json()
# Total number of matching articles reported by the API
numHits = response['response']['meta']['hits']

In [None]:
# Set up the cutoff date and page size, then define the methods that request and parse the articles
import time
import dateutil
import pandas as pd
from datetime import datetime
# Page size used to turn the hit count into a number of pages to request
# (assumes the API returns 10 articles per page -- TODO confirm)
numPerPages = 10
# Earliest publication date to keep; parsing stops at the first older article
# (presumably the first date of the NYT Covid dataset, see the TODO list -- confirm)
day = datetime(year=2020, month=1, day=21).date()
def isNotValid(article) -> bool:
    """
    Method to check if the article has a valid headline

    An article is valid only when it is a dict whose 'headline' entry is
    itself a dict holding a 'main' value that is not None. Anything else
    (missing 'headline', non-dict article, empty headline) is reported as
    not valid instead of raising.

    :param article: The information of the article
    :return: True if not valid, False if valid
    """
    if not isinstance(article, dict):
        return True
    headline = article.get('headline')
    hasMainHeadline = isinstance(headline, dict) and headline.get('main') is not None
    return not hasMainHeadline

def getDataSorted() -> pd.DataFrame:
    """
    Main method to send request and parse response with the subject

    Requests the search result pages one by one (newest first), parses each
    page with getRequestUpdate and stops at the first page that yields no
    rows. The combined result is written to
    "Updated Asians American NYT Dataset.csv" and returned.

    :return: Dataset with all the articles parsed as panda dataframe
    :raises requests.HTTPError: if the API answers with an error status
    """
    # Column layout of the result, used when no article at all is collected
    columns = ['headline',
        'date',
        'news_desk',
        'word_count',
        'Hate Crimes',
        'Discrimination',
        'Race and Ethnicity',
        'Atlanta Spa Shootings (2021)',
        'Murders, Attempted Murders and Homicides',
        'Demonstrations, Protests and Riots',
        'Mass Shootings',
        'Quarantine (Life and Culture)',
        'Assaults',
        'Minorities',
        'Workplace Hazards and Violations',
        'Coronavirus (2019-nCoV)',
        'url']

    breakCondition = False
    # Per-page frames are collected and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # dataset on every page.
    frames = []
    # Count number of articles
    total = 0
    # Loop through the page
    for page in range(numHits // numPerPages + 1):
        # Send request to the page gradually
        q = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={subject}&page={page}&sort=newest&api-key={apiKey}'
        reply = requests.get(q, timeout=30)
        # Surface HTTP errors (e.g. rate limiting) instead of a KeyError below
        reply.raise_for_status()
        r = reply.json()
        # Based on preliminary parsing, get the list of article from the file
        articleList = r['response']['docs']
        # Parsed rows of this page
        frame = getRequestUpdate(articleList, breakCondition)
        # An empty page means nothing newer than the cutoff is left
        if frame.empty:
            break
        frames.append(frame)
        # Count articles
        total += len(frame)
        # Sleep before new request
        time.sleep(6)
        # Print message to know finish with the page
        print("Finish with page", page)

    if frames:
        dataset = pd.concat(frames, ignore_index=True)
    else:
        dataset = pd.DataFrame({name: [] for name in columns})
    # Print when done with the numbers of total articles
    print(f'Finished with all pages, total of {total}')
    # Create csv file
    csv_path = "Updated Asians American NYT Dataset.csv"
    dataset.to_csv(csv_path, index=False)
    return dataset

# Subjects that get their own one-hot column in the dataset
tags = {'Hate Crimes',
        'Discrimination',
        'Race and Ethnicity',
        'Atlanta Spa Shootings (2021)',
        'Murders, Attempted Murders and Homicides',
        'Demonstrations, Protests and Riots',
        'Mass Shootings',
        'Quarantine (Life and Culture)',
        'Assaults',
        'Minorities',
        'Workplace Hazards and Violations',
        'Coronavirus (2019-nCoV)'}

def keywordCheck(frame, keywords, num):
    """
    Method to do one-hot encoding for the keywords

    For every tag in `tags`, row `num` of frame[tag] is set to 1 when the
    article carries that tag as a 'subject' keyword and to 0 otherwise.
    Columns shorter than num + 1 are padded with 0 first. The lists inside
    `frame` are modified in place.

    :param num: The article number on the list
    :param frame: the current dataframe
    :param keywords: the list of keywords
    :return: None
    """
    # Collect the matching subjects in a set so that a subject listed twice
    # on the same article still produces exactly one flag for that row.
    matched = set()
    for keyword in keywords:
        # Only 'subject' keywords are encoded
        if keyword.get('name') != 'subject':
            continue
        value = keyword.get('value')
        if isinstance(value, str) and value in tags:
            matched.add(value)

    for tag in tags:
        column = frame[tag]
        # Make sure row `num` exists, defaulting to "tag absent"
        while len(column) < num + 1:
            column.append(0)
        if tag in matched:
            column[num] = 1


def getRequestUpdate(articleList, breakCondition) -> pd.DataFrame:
    """
    Method to parse article and return as a data frame

    Articles without a valid headline are skipped. Parsing stops at the first
    article published before the module-level cutoff `day`.

    :param breakCondition: not used by this method; kept so existing calls keep working
    :param articleList: list of article from response['response']['docs']
    :return: dataframe of the article after parsing
    """
    # Imported explicitly: a bare `import dateutil` does not guarantee that
    # the `parser` submodule has been loaded.
    from dateutil import parser as dateParser

    frame = {'headline': [],
        'date': [],
        'news_desk': [],
        'word_count': [],
        'Hate Crimes': [],
        'Discrimination': [],
        'Race and Ethnicity': [],
        'Atlanta Spa Shootings (2021)': [],
        'Murders, Attempted Murders and Homicides': [],
        'Demonstrations, Protests and Riots': [],
        'Mass Shootings': [],
        'Quarantine (Life and Culture)': [],
        'Assaults': [],
        'Minorities': [],
        'Workplace Hazards and Violations': [],
        'Coronavirus (2019-nCoV)': [],
        'url': []}
    for article in articleList:
        # Check if article is valid
        if isNotValid(article):
            continue
        # Date parse
        date = dateParser.parse(article['pub_date']).date()
        # Stop once an article is older than the cutoff date
        if date < day:
            break
        # Row index of this article inside `frame`. It differs from the
        # position in articleList as soon as an invalid article was skipped,
        # so the list position must not be used for the one-hot columns.
        row = len(frame['date'])
        frame['date'].append(str(date))
        # Headline parse
        frame['headline'].append(article['headline']['main'])
        # Link URL parse
        frame['url'].append(article['web_url'])

        # News Desk parse (None when the field is missing)
        frame['news_desk'].append(article.get('news_desk'))
        # Word count parse (None when the field is missing)
        frame['word_count'].append(article.get('word_count'))

        # One-hot encode the subjects; an absent keyword list means no tags
        keywordCheck(frame, article.get('keywords') or [], row)

    return pd.DataFrame(frame)

# Run the full download (getDataSorted also writes the CSV file) and keep the result
updatedData = getDataSorted()
# Preview the first 10 rows
updatedData.head(10)
