# Crawling for African American names

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd 

# Define the URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/Lists_of_African_Americans"

# Send an HTTP GET request to the URL. requests applies no timeout by default,
# so an explicit one makes a stalled connection raise instead of blocking the
# kernel indefinitely.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Parallel lists: first_names[i] and last_names[i] come from the same <li>
    first_names = []
    last_names = []

    # Accept only list items whose text has no digits, square brackets or parentheses
    name_pattern = re.compile(r'^\s*([^0-9\[\]\(\)]+)\s*$')
    for item in soup.find_all("li"):
        match = name_pattern.match(item.text.strip())
        if not match:
            continue
        name_parts = match.group(1).split()
        if len(name_parts) < 2:
            # A single token cannot be split into a first and a last name
            continue
        first_name = name_parts[0]
        last_name = ' '.join(name_parts[1:])
        # Keep only the text before the first ',', ':' or ' - ' and drop double quotes
        for separator in (',', ':', ' - '):
            last_name = last_name.split(separator)[0].strip().replace('"', '')
        first_names.append(first_name)
        last_names.append(last_name)

    # Only the extracted entries at 0-based positions 60..2163 are written;
    # everything before and after that window is skipped.
    # NOTE(review): these offsets are presumably tuned to the page layout at the
    # time of writing -- confirm them whenever the page is crawled again.
    first_kept, end_kept = 60, 2164
    kept_names = list(zip(first_names, last_names))[first_kept:end_kept]

    # Create a CSV file and write the first names and last names with the "Category" column
    with open("African_American_Names.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["First Name", "Last Name", "Category"])
        for first_name, last_name in kept_names:
            # Every row of this file gets "black" in the "Category" column
            writer.writerow([first_name, last_name, "black"])

    print("CSV file 'African_American_Names.csv' has been created and modified.")

else:
    print("Failed to retrieve the Wikipedia page. Status code:", response.status_code)



# Path of the CSV to clean. This is the same relative path the crawling code
# above writes to, so the notebook cleans the file it just produced instead of
# depending on a copy at a machine-specific absolute location.
file_path = "African_American_Names.csv"
# Read every column as plain text. keep_default_na=False stops pandas from
# turning empty cells (and names such as "Null" or "NA") into NaN, which the
# str() conversion further down would turn into the literal text "nan" and
# thereby hide from the empty-row removal.
df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

# List of keywords to be removed, applied in the order given. The entries made
# of spaces remove runs of three and of two spaces.
# ('United States' and 'comedian' are two separate entries; without the comma
# between them Python joins them into the single string 'United Statescomedian'.)
keywords = ['actress', 'singer', 'rapper', 'and',"   ", 'pop', "  ", 'Pop', 'actor', 'Hip hop', 'R&B','the Creator', 'United States', 'comedian', 'Blues', 'Christian hip hop', "Hip Hop", "Hip Hop producer", "disc jockey", "The Rapper", "producer"]

def replace_keywords(text):
    """Remove every entry of ``keywords`` from ``text`` and return the result.

    Word-like keywords are removed only where they stand alone, i.e. where they
    are not directly adjacent to another letter, digit or underscore. Names that
    merely contain a keyword (for example "Sandra" contains "and") are therefore
    left intact. Keywords consisting only of whitespace are removed wherever
    they occur.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    for keyword in keywords:
        if keyword.strip():
            # Lookarounds instead of \b so keywords that start or end with a
            # non-word character (e.g. "R&B") are handled uniformly.
            pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
            text = re.sub(pattern, '', text)
        else:
            text = text.replace(keyword, '')
    return text

# Convert every cell to text and strip the unwanted keywords from it
for col in df.columns:
    df[col] = df[col].apply(lambda x: replace_keywords(str(x)))

# Drop every row in which any cell mentions one of these terms. The cell values
# are searched directly: str(row) is pandas' printed representation of the row,
# which shortens long cells and also contains the column labels.
excluded_terms = ["African-American", "African American", "Beyoncé"]
has_excluded_term = pd.Series(False, index=df.index)
for term in excluded_terms:
    for col in df.columns:
        # regex=False: the terms are literal text, not patterns
        has_excluded_term |= df[col].str.contains(term, regex=False)
df = df[~has_excluded_term]

def remove_after_dash(cell):
    """Return the text of ``cell`` that precedes its first separating dash.

    En and em dashes always act as separators. A plain hyphen separates only
    when it is not sitting between two word characters, so hyphenated names such
    as "Abdul-Jabbar" are kept whole while "Smith - actor" and
    "Smith -songwriter" are still cut back to "Smith". Whitespace around the
    separator is dropped.

    Args:
        cell: Any value; it is converted with ``str()`` first.

    Returns:
        The leading part of the string, or the whole string if it contains no
        separating dash.
    """
    return re.split(r'\s*(?:[–—]|(?<!\w)-|-(?!\w))\s*', str(cell))[0]

# Cut every cell back to the text before its first dash. Column-wise Series.map
# is used because DataFrame.applymap is deprecated in recent pandas releases.
df = df.apply(lambda column: column.map(remove_after_dash))

# Drop rows where any cell is empty. A cell counts as empty when nothing is left
# after stripping whitespace, so cells holding two or more spaces are caught as
# well as "" and " ".
is_blank = df.apply(lambda column: column.str.strip() == "")
df = df[~is_blank.any(axis=1)]

# Write the modified DataFrame back to the CSV file
df.to_csv(file_path, index=False)

print("Specified keywords have been removed from the file!")



CSV file 'African_American_Names.csv' has been created and modified.
Specified keywords have been removed from the file!


# Crawling for Asian American names

In [3]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd 

# Define the URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_Asian_Americans"

# Send an HTTP GET request to the URL. requests applies no timeout by default,
# so an explicit one makes a stalled connection raise instead of blocking the
# kernel indefinitely.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Parallel lists: first_names[i] and last_names[i] come from the same <li>
    first_names = []
    last_names = []

    # Accept only list items whose text has no digits, square brackets or parentheses
    name_pattern = re.compile(r'^\s*([^0-9\[\]\(\)]+)\s*$')
    for item in soup.find_all("li"):
        match = name_pattern.match(item.text.strip())
        if not match:
            continue
        name_parts = match.group(1).split()
        if len(name_parts) < 2:
            # A single token cannot be split into a first and a last name
            continue
        first_name = name_parts[0]
        last_name = ' '.join(name_parts[1:])
        # Keep only the text before the first ',', ':' or ' - ' and drop double quotes
        for separator in (',', ':', ' - '):
            last_name = last_name.split(separator)[0].strip().replace('"', '')
        first_names.append(first_name)
        last_names.append(last_name)

    # Only the extracted entries at 0-based positions 27..854 are written;
    # everything before and after that window is skipped.
    # NOTE(review): these offsets are presumably tuned to the page layout at the
    # time of writing -- confirm them whenever the page is crawled again.
    first_kept, end_kept = 27, 855
    kept_names = list(zip(first_names, last_names))[first_kept:end_kept]

    # Create a CSV file and write the first names and last names with the "Category" column
    with open("Asian_American_Names.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["First Name", "Last Name", "Category"])
        for first_name, last_name in kept_names:
            # Every row of this file gets "asian" in the "Category" column
            writer.writerow([first_name, last_name, "asian"])

    print("CSV file 'Asian_American_Names.csv' has been created and modified.")

else:
    print("Failed to retrieve the Wikipedia page. Status code:", response.status_code)



CSV file 'Asian_American_Names.csv' has been created and modified.


In [None]:


# Path of the CSV to clean. This is the same relative path the crawling code
# above writes to, so the notebook cleans the file it just produced instead of
# depending on a copy at a machine-specific absolute location.
file_path = "Asian_American_Names.csv"
# Read every column as plain text. keep_default_na=False stops pandas from
# turning empty cells (and names such as "Null" or "NA") into NaN, which the
# str() conversion further down would turn into the literal text "nan" and
# thereby hide from the empty-row removal.
df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

# List of keywords to be removed, applied in the order given. The entries made
# of spaces remove runs of three and of two spaces.
# ('United States' and 'comedian' are two separate entries; without the comma
# between them Python joins them into the single string 'United Statescomedian'.)
keywords = ['actress', 'singer', 'rapper', 'and',"   ", "  ", 'pop','Pop', 'actor', 'Hip hop', 'R&B','the Creator', 'United States', 'comedian', 'Blues', 'Christian hip hop', "Hip Hop", "Hip Hop producer", "disc jockey", "The Rapper", "producer"]

def replace_keywords(text):
    """Remove every entry of ``keywords`` from ``text`` and return the result.

    Word-like keywords are removed only where they stand alone, i.e. where they
    are not directly adjacent to another letter, digit or underscore. Names that
    merely contain a keyword (for example "Sandra" contains "and") are therefore
    left intact. Keywords consisting only of whitespace are removed wherever
    they occur.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    for keyword in keywords:
        if keyword.strip():
            # Lookarounds instead of \b so keywords that start or end with a
            # non-word character (e.g. "R&B") are handled uniformly.
            pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
            text = re.sub(pattern, '', text)
        else:
            text = text.replace(keyword, '')
    return text

# Convert every cell to text and strip the unwanted keywords from it
for col in df.columns:
    df[col] = df[col].apply(lambda x: replace_keywords(str(x)))

# Drop every row in which any cell mentions one of these terms. The cell values
# are searched directly: str(row) is pandas' printed representation of the row,
# which shortens long cells and also contains the column labels.
excluded_terms = ["Asian-American", "Asian American"]
has_excluded_term = pd.Series(False, index=df.index)
for term in excluded_terms:
    for col in df.columns:
        # regex=False: the terms are literal text, not patterns
        has_excluded_term |= df[col].str.contains(term, regex=False)
df = df[~has_excluded_term]


def remove_after_dash(cell):
    """Return the text of ``cell`` that precedes its first separating dash.

    En and em dashes always act as separators. A plain hyphen separates only
    when it is not sitting between two word characters, so hyphenated names such
    as "Abdul-Jabbar" are kept whole while "Smith - actor" and
    "Smith -songwriter" are still cut back to "Smith". Whitespace around the
    separator is dropped.

    Args:
        cell: Any value; it is converted with ``str()`` first.

    Returns:
        The leading part of the string, or the whole string if it contains no
        separating dash.
    """
    return re.split(r'\s*(?:[–—]|(?<!\w)-|-(?!\w))\s*', str(cell))[0]

# Cut every cell back to the text before its first dash. Column-wise Series.map
# is used because DataFrame.applymap is deprecated in recent pandas releases.
df = df.apply(lambda column: column.map(remove_after_dash))

# Drop rows where any cell is empty. A cell counts as empty when nothing is left
# after stripping whitespace, so cells holding two or more spaces are caught as
# well as "" and " ".
is_blank = df.apply(lambda column: column.str.strip() == "")
df = df[~is_blank.any(axis=1)]

# Write the modified DataFrame back to the CSV file
df.to_csv(file_path, index=False)

print("Specified keywords have been removed from the file!")



Specified keywords have been removed from the file!


# Crawling for Hispanic and Latino American names

In [4]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd 

# Define the URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_Hispanic_and_Latino_Americans"

# Send an HTTP GET request to the URL. requests applies no timeout by default,
# so an explicit one makes a stalled connection raise instead of blocking the
# kernel indefinitely.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Parallel lists: first_names[i] and last_names[i] come from the same <li>
    first_names = []
    last_names = []

    # Accept only list items whose text has no digits, square brackets or parentheses
    name_pattern = re.compile(r'^\s*([^0-9\[\]\(\)]+)\s*$')
    for item in soup.find_all("li"):
        match = name_pattern.match(item.text.strip())
        if not match:
            continue
        name_parts = match.group(1).split()
        if len(name_parts) < 2:
            # A single token cannot be split into a first and a last name
            continue
        first_name = name_parts[0]
        last_name = ' '.join(name_parts[1:])
        # Keep only the text before the first ',', ':', ' - ', ';' or '&' and
        # drop double quotes
        for separator in (',', ':', ' - ', ';', '&'):
            last_name = last_name.split(separator)[0].strip().replace('"', '')
        first_names.append(first_name)
        last_names.append(last_name)

    # Only the extracted entries at 0-based positions 103..528 are written;
    # everything before and after that window is skipped.
    # NOTE(review): these offsets are presumably tuned to the page layout at the
    # time of writing -- confirm them whenever the page is crawled again.
    first_kept, end_kept = 103, 529
    kept_names = list(zip(first_names, last_names))[first_kept:end_kept]

    # Create a CSV file and write the first names and last names with the "Category" column
    with open("Hispanic_Names.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["First Name", "Last Name", "Category"])
        for first_name, last_name in kept_names:
            # Every row of this file gets "hispanic" in the "Category" column
            writer.writerow([first_name, last_name, "hispanic"])

    print("CSV file 'Hispanic_Names.csv' has been created and modified.")

else:
    print("Failed to retrieve the Wikipedia page. Status code:", response.status_code)



CSV file 'Hispanic_Names.csv' has been created and modified.


In [None]:


# Path of the CSV to clean. This is the same relative path the crawling code
# above writes to, so the notebook cleans the file it just produced instead of
# depending on a copy at a machine-specific absolute location.
file_path = "Hispanic_Names.csv"
# Read every column as plain text. keep_default_na=False stops pandas from
# turning empty cells (and names such as "Null" or "NA") into NaN, which the
# str() conversion further down would turn into the literal text "nan" and
# thereby hide from the empty-row removal.
df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

# List of keywords to be removed, applied in the order given. The entries made
# of spaces remove runs of three and of two spaces.
# ('United States' and 'comedian' are two separate entries; without the comma
# between them Python joins them into the single string 'United Statescomedian'.)
keywords = ['Bachata','from Cypress Hill', 'Writermember of The Diplomats', 'Tejano group', 'pop', 'Grammy', 'from Brooklyn', 'Cuban American', 'Mexican', 'Mexican American', 'actress', 'singer', 'rapper', 'and',"   ", "  ", 'Pop', 'actor', 'Hip hop', 'R&B','the Creator', 'United States', 'comedian', 'Blues', 'Christian hip hop', "Hip Hop", "Hip Hop producer", "disc jockey", "The Rapper", "producer"]

def replace_keywords(text):
    """Remove every entry of ``keywords`` from ``text`` and return the result.

    Word-like keywords are removed only where they stand alone, i.e. where they
    are not directly adjacent to another letter, digit or underscore. Names that
    merely contain a keyword (for example "Hernandez" contains "and") are
    therefore left intact. Keywords consisting only of whitespace are removed
    wherever they occur.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    for keyword in keywords:
        if keyword.strip():
            # Lookarounds instead of \b so keywords that start or end with a
            # non-word character (e.g. "R&B") are handled uniformly.
            pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
            text = re.sub(pattern, '', text)
        else:
            text = text.replace(keyword, '')
    return text

# Run every column through replace_keywords, converting each value to text first
def _strip_keywords_from_cell(cell):
    """Return ``cell`` as text with the configured keywords removed."""
    return replace_keywords(str(cell))

for col in df.columns:
    df[col] = df[col].map(_strip_keywords_from_cell)



def remove_after_dash(cell):
    """Return the text of ``cell`` that precedes its first separating dash.

    En and em dashes always act as separators. A plain hyphen separates only
    when it is not sitting between two word characters, so hyphenated names such
    as "Abdul-Jabbar" are kept whole while "Smith - actor" and
    "Smith -songwriter" are still cut back to "Smith". Whitespace around the
    separator is dropped.

    Args:
        cell: Any value; it is converted with ``str()`` first.

    Returns:
        The leading part of the string, or the whole string if it contains no
        separating dash.
    """
    return re.split(r'\s*(?:[–—]|(?<!\w)-|-(?!\w))\s*', str(cell))[0]

# Cut every cell back to the text before its first dash. Column-wise Series.map
# is used because DataFrame.applymap is deprecated in recent pandas releases.
df = df.apply(lambda column: column.map(remove_after_dash))

# Drop rows where any cell is empty. A cell counts as empty when nothing is left
# after stripping whitespace, so cells holding two or more spaces are caught as
# well as "" and " ".
is_blank = df.apply(lambda column: column.str.strip() == "")
df = df[~is_blank.any(axis=1)]

# Write the modified DataFrame back to the CSV file
df.to_csv(file_path, index=False)

print("Specified keywords have been removed from the file!")



Specified keywords have been removed from the file!
