# Data Collection and Preprocessing

In [35]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import re

## Requesting webpage

In [3]:
#For sending request and obtainting a response
response = requests.get("https://en.wikipedia.org/wiki/Demonym")

#To check the status
response.status_code 

200

In [4]:
%%capture
dir(response)

In [5]:
#Header and metadata
response.headers #Returns a dictionary
date = response.headers["date"]
content_type = response.headers["content-type"]

print(f"Url     : {response.url}")
print(f"Date    : {date}")
print(f"Content : {content_type}")

Url     : https://en.wikipedia.org/wiki/Demonym
Date    : Fri, 01 Sep 2023 02:52:21 GMT
Content : text/html; charset=UTF-8


## Using BeautifulSoup

In [6]:
%%capture
soup = BeautifulSoup(response.content, "lxml")
print(soup.prettify())

In [18]:
div_cols = soup.find_all("div", class_="div-col", style="column-width: 22em;")

In [22]:
# Extract desired information
locations = []
demonyms = []

# Loop over each div element
for div_col in div_cols:
    for li in div_col.find_all('li'):
        # Check if ' → ' exists in the text
        if ' → ' in li.text:
            location, demonym = li.text.split(' → ')
            locations.append(location)
            demonyms.append(demonym)
        else:
            print(li)

<li><a href="/wiki/Cape_Town" title="Cape Town">Cape Town</a> - Capetonians</li>
<li><a href="/wiki/Preston,_Lancashire" title="Preston, Lancashire">Preston, Lancashire</a> →Prestonians</li>
<li><a class="mw-redirect" href="/wiki/Caguas" title="Caguas">Caguas</a> - Caguenos</li>
<li><a href="/wiki/Andhra_Pradesh" title="Andhra Pradesh">Andhra</a> - Andhrites</li>
<li><a href="/wiki/Telangana" title="Telangana">Telangana</a>→ Telanganites</li>
<li><a class="mw-redirect" href="/wiki/Nanking" title="Nanking">Nanjing</a> (Nanking/Nankin)→ Nankinese</li>
<li><a class="mw-redirect" href="/wiki/Poway,_CA" title="Poway, CA">Poway, CA</a>→ Powegians</li>
<li><a href="/wiki/Tasmania" title="Tasmania">Tasmania</a>→ Taswegians</li>
<li><a href="/wiki/Tatarstan" title="Tatarstan">Tatarstan</a> →Tatars</li>


In [24]:
# Store the extracted data into a pandas DataFrame
df = pd.DataFrame({
    'Location': locations,
    'Demonym': demonyms
})

In [26]:
df.head()

Unnamed: 0,Location,Demonym
0,Africa,Africans
1,Antarctica,Antarcticans
2,Asia,Asians
3,Australia,Australians
4,Europe,Europeans


In [28]:
df.shape

(991, 2)

In [27]:
# Save the DataFrame to a CSV file
path = '../data/loaction_demonym.csv'
df.to_csv(path, index=False)

## Cleaning the data

In [54]:
file_path = '../data/loaction_demonym.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

In [55]:
# Convert columns to lowercase
df['Location'] = df['Location'].str.lower()
df['Demonym'] = df['Demonym'].str.lower()

### Demonym Cleaning

In [56]:
# Extract the first word in the Demonyn column
def clean_demonym(s):
    # Extract first word inside quotes
    match = re.search(r'"([^"]+)"', s)
    if match:
        return match.group(1).split(",")[0].strip()
    
    # If not inside quotes, extract the first word before any special characters
    return re.split("[,;(\[][0-9]*", s)[0].strip()

df['Demonym'] = df['Demonym'].apply(clean_demonym)

### Location Cleaning

In [57]:
def clean_location(s):
    # Remove surrounding quotes if present
    s = s.strip('"')
    
    # Split by comma or parenthesis and keep the first part
    return re.split("[,(]", s)[0].strip()

df['Location'] = df['Location'].apply(clean_location)

In [58]:
# Replace '-' by space
df['Location'] = df['Location'].str.replace('-', ' ')
df['Demonym'] = df['Demonym'].str.replace('-', ' ')

In [59]:
# Save the DataFrame to a CSV file
path = '../data/loaction_demonym_clean.csv'
df.to_csv(path, index=False)