## Importing libs

In [None]:
import re
from geopy import geocoders
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import pandas as pd
import shutil
import glob, os, time
import sys  
cwd = os.getcwd()
from Script_utils import *

## Get all songs from U.S. and list separate articles

In [None]:
# Site root; the relative article paths used throughout the notebook are appended to it.
base_URL = 'https://en.wikipedia.org'
# Overview article that the next cells scrape for the United States section.
all_songs_URL = f'{base_URL}/wiki/List_of_songs_about_cities'

In [None]:
# Fetch and parse the overview article (load_page is presumably provided by Script_utils).
page_soup = load_page(all_songs_URL)

# The "United States" headline is a <span> nested in an <h2>; keep the <h2>
# itself so that the elements following it can be walked as siblings.
us_headline_spans = page_soup.select('h2 > span.mw-headline#United_States')
us_h2 = us_headline_spans[0].parent

In [None]:
# Parallel lists: index i of each list describes one (city, state, song) row.
cities = []
states = []
songs = []

# Walk the elements that follow the "United States" <h2> until the next
# top-level section begins.
for element in us_h2.next_siblings:

    if element.name == 'h2':
        break
    if element.name != 'h3':
        continue

    # Each <h3> names one city; the helper's result is indexed below as
    # [0] = city and [1] = state.
    city_state = get_city_and_state(element.getText())

    for list_element in element.next_siblings:

        # Stop at the next city heading (<h3>) or at the next section (<h2>).
        # Without the <h2> check, the last U.S. city would keep collecting
        # lists that belong to whatever section follows.
        if list_element.name in ('h2', 'h3'):
            break

        if list_element.name == 'ul':
            for song_element in list_element.select('li'):
                songs.append(get_song_name(song_element.getText()))
                cities.append(city_state[0])
                states.append(city_state[1])

        elif list_element.name == 'div':
            # A <div> under a city heading is treated as a pointer to a
            # dedicated per-city article. Only links that actually carry an
            # href are considered, and a <div> without any is skipped
            # instead of raising IndexError.
            article_links = list_element.select('a[href]')
            if article_links:
                single_aricle_URL = article_links[0]['href']
                print(single_aricle_URL)

- /wiki/List_of_songs_about_Atlanta ✅
- /wiki/List_of_songs_about_Birmingham,_Alabama 🚧
- /wiki/List_of_songs_about_Boston ✅
- /wiki/List_of_songs_about_Chicago ✅
- /wiki/List_of_songs_about_Detroit ✅
- /wiki/List_of_songs_about_Los_Angeles 🚧
- /wiki/List_of_songs_about_Miami ✅
- /wiki/List_of_songs_about_Nashville ✅
- /wiki/List_of_songs_about_New_Orleans ✅
- /wiki/List_of_songs_about_New_York_City ✅
- /wiki/List_of_songs_about_Portland,_Oregon ✅
- /wiki/List_of_songs_about_Seattle 🚧

✅ - csv is generated <br/>
🚧 - work in progress

In [None]:
# Sanity check: the three lists are filled together, so their lengths should be equal.
print(f'Songs {len(songs)} | Cities: {len(cities)} | States: {len(states)}')

In [None]:
# One row per scraped song, built from the three parallel lists.
export_data = pd.DataFrame(
    data={
        'City': cities,
        'State': states,
        'Song': songs,
    }
)

In [None]:
# Column dtypes and non-null counts; a non-null count below the row count signals missing values.
export_data.info()

In [None]:
# Flag every row that has a missing value in at least one column, then list them.
has_missing_value = export_data.isnull().any(axis=1)
rows_with_nan = export_data[has_missing_value]
print(rows_with_nan)
print(f'# of rows with NaN: {len(rows_with_nan)}')

There were 18 rows with NaN values, caused by the issues below:

1. Missing closing quote
- Details: I noticed that some of the songs in the Wikipedia article don't have a closing quote (!), which makes my regex fail.
- Solution: I could write a more complex regex to handle this situation, but instead I edited the Wikipedia article. There are not many such rows, and it will help future scrapers :)

2. Different quote types
- Details: I found out that there are many different types of quotation marks in Unicode.
- Solution: I need to sanitize the song names a bit.

In [None]:
# to_csv does not create missing directories, so make sure the output folder
# exists before the first dataset is written.
os.makedirs(cwd + '/datasets', exist_ok=True)

# Persist the main dataset without the DataFrame index, then show a summary.
export_data.to_csv(cwd + '/datasets/Data_main.csv', index=False)
export_data.info()

## Scraping data for each big city

Unfortunately, every article has a slightly different structure so I had to scrape them separately.

In [None]:
def scrape_single_aricle(URL, start_tag,start_id, stop_tag, city_name, state_name):
    """Collect the song names listed in one dedicated per-city article.

    The page at ``base_URL + URL`` is loaded, the element ``<start_tag id=start_id>``
    is located, and the siblings that follow its parent are scanned. Every
    ``<li>`` inside a ``<ul>`` sibling contributes one song name. Scanning stops
    at the first sibling whose tag name equals ``stop_tag``.

    Args:
        URL: Article path relative to ``base_URL``.
        start_tag: Tag name of the anchor element marking where the lists begin.
        start_id: ``id`` attribute of that anchor element.
        stop_tag: Tag name of the sibling at which scanning ends.
        city_name: Value repeated in the ``'cities'`` list for every song.
        state_name: Value repeated in the ``'states'`` list for every song.

    Returns:
        dict with the keys ``'songs'``, ``'cities'`` and ``'states'``; the three
        lists have equal length and line up index by index.

    Raises:
        ValueError: If the anchor element is not present on the page (for
            example because the article layout changed).
    """
    page_soup = load_page(base_URL + URL)

    anchor = page_soup.find(start_tag, id=start_id)
    if anchor is None:
        # Fail with a message that names the page and the missing anchor
        # instead of an AttributeError on None.
        raise ValueError(
            f'No <{start_tag} id="{start_id}"> element found on {base_URL + URL}'
        )
    start = anchor.parent

    songs = []
    for element in start.next_siblings:
        if element.name == stop_tag:
            break
        if element.name == 'ul':
            songs.extend(get_song_name(li.getText()) for li in element.select('li'))

    # City and state are constant for the whole article.
    return {
        'songs': songs,
        'cities': [city_name] * len(songs),
        'states': [state_name] * len(songs),
    }

### Detroit

In [None]:
# Detroit: scan from the parent of <span id="0-9"> up to the first <div> sibling.
detroit_results = scrape_single_aricle(
    URL='/wiki/List_of_songs_about_Detroit',
    start_tag='span',
    start_id='0-9',
    stop_tag='div',
    city_name='Detroit',
    state_name='Michigan',
)

In [None]:
# Turn the Detroit scrape result into a table, save it, and show a summary.
export_data_detroit = pd.DataFrame(
    data={
        'City': detroit_results['cities'],
        'State': detroit_results['states'],
        'Song': detroit_results['songs'],
    }
)
export_data_detroit.to_csv(f'{cwd}/datasets/Data_detroit.csv', index=False)
export_data_detroit.info()

### Miami

In [None]:
# Miami: scan from the parent of <span id="Songs_about_Miami"> up to the next <h2>.
miami_results = scrape_single_aricle(
    URL='/wiki/List_of_songs_about_Miami',
    start_tag='span',
    start_id='Songs_about_Miami',
    stop_tag='h2',
    city_name='Miami',
    state_name='Florida',
)

In [None]:
# Turn the Miami scrape result into a table, save it, and show a summary.
export_data_miami = pd.DataFrame(
    data={
        'City': miami_results['cities'],
        'State': miami_results['states'],
        'Song': miami_results['songs'],
    }
)
export_data_miami.to_csv(f'{cwd}/datasets/Data_miami.csv', index=False)
export_data_miami.info()

### New Orleans

In [None]:
# New Orleans: scan from the parent of <span id="0-9"> up to the first <div> sibling.
new_orleans_results = scrape_single_aricle(
    URL='/wiki/List_of_songs_about_New_Orleans',
    start_tag='span',
    start_id='0-9',
    stop_tag='div',
    city_name='New Orleans',
    state_name='Louisiana',
)

In [None]:
# Turn the New Orleans scrape result into a table, save it, and show a summary.
export_data_new_orleans = pd.DataFrame(
    data={
        'City': new_orleans_results['cities'],
        'State': new_orleans_results['states'],
        'Song': new_orleans_results['songs'],
    }
)
export_data_new_orleans.to_csv(f'{cwd}/datasets/Data_new_orleans.csv', index=False)
export_data_new_orleans.info()

### Los Angeles

In [None]:
# Los Angeles: scan from the parent of the start anchor up to the next <h2>.
# NOTE(review): the start id begins with '#' and contains an en dash — confirm
# it matches the element id on the live page.
los_angeles_results = scrape_single_aricle(
    URL='/wiki/List_of_songs_about_Los_Angeles',
    start_tag='span',
    start_id='#s–A',
    stop_tag='h2',
    city_name='Los Angeles',
    state_name='California',
)

In [None]:
# Turn the Los Angeles scrape result into a table, save it, and show a summary.
export_data_los_angeles = pd.DataFrame(
    data={
        'City': los_angeles_results['cities'],
        'State': los_angeles_results['states'],
        'Song': los_angeles_results['songs'],
    }
)
export_data_los_angeles.to_csv(f'{cwd}/datasets/Data_los_angeles.csv', index=False)
export_data_los_angeles.info()

### Chicago

In [None]:
# Chicago: scan from the parent of <span id="0–9"> (en dash) up to the first <p> sibling.
chicago_results = scrape_single_aricle(
    URL='/wiki/List_of_songs_about_Chicago',
    start_tag='span',
    start_id='0–9',
    stop_tag='p',
    city_name='Chicago',
    state_name='Illinois',
)

In [None]:
# Turn the Chicago scrape result into a table, save it, and show a summary.
export_data_chicago = pd.DataFrame(
    data={
        'City': chicago_results['cities'],
        'State': chicago_results['states'],
        'Song': chicago_results['songs'],
    }
)
export_data_chicago.to_csv(f'{cwd}/datasets/Data_chicago.csv', index=False)
export_data_chicago.info()

### New York City

In [None]:
# New York City: on this page the song lists are wrapped in <div> elements,
# so the generic scrape_single_aricle helper (which looks for <ul> siblings)
# is not used here.
page_soup = load_page(base_URL + '/wiki/List_of_songs_about_New_York_City')

# Scanning starts after the parent of <span id="0–9"> (en dash in the id).
start = page_soup.find("span", id="0–9").parent

songs = []
cities = []
states = []

for element in start.next_siblings:
    # The first <p> sibling marks the end of the song lists.
    if element.name == 'p':
        break
    if element.name == 'div':
        # Only the first <ul> inside each <div> is read. A <div> without any
        # list is skipped instead of raising IndexError.
        el = element.find('ul')
        if el is None:
            continue
        for li in el.select('li'):
            song_name = get_song_name(li.getText())
            songs.append(song_name)
            cities.append('New York City')
            states.append('New York')

In [None]:
# Turn the New York City lists into a table, save it, and show a summary.
export_data_new_york_city = pd.DataFrame(
    data={
        'City': cities,
        'State': states,
        'Song': songs,
    }
)
export_data_new_york_city.to_csv(f'{cwd}/datasets/Data_new_york_city.csv', index=False)
export_data_new_york_city.info()

### Nashville

In [None]:
# Nashville: the stop condition needs a specific <h2> (the one holding
# span#References), so the generic helper is not used for this page.
page_soup = load_page(base_URL + '/wiki/List_of_songs_about_Nashville,_Tennessee')

# Scanning starts after the parent of <span id="C">.
start = page_soup.find("span", id="C").parent

songs = []

for element in start.next_siblings:
    # Other <h2> headings are passed over; only the "References" one ends the scan.
    if element.name == 'h2' and element.select('span#References'):
        break
    if element.name != 'ul':
        continue
    for li in element.select('li'):
        songs.append(get_song_name(li.getText()))

# City and state are the same for every song on this page.
cities = ['Nashville'] * len(songs)
states = ['Tennessee'] * len(songs)

In [None]:
# Turn the Nashville lists into a table, save it, and show a summary.
export_data_nashville = pd.DataFrame(
    data={
        'City': cities,
        'State': states,
        'Song': songs,
    }
)
export_data_nashville.to_csv(f'{cwd}/datasets/Data_nashville.csv', index=False)
export_data_nashville.info()

### Atlanta

In [None]:
# Atlanta: the songs are read from the first <ul> found in the document.
# NOTE(review): assumes that first <ul> is the song list — confirm against the page.
page_soup = load_page(base_URL + '/wiki/List_of_songs_about_Atlanta')

songs_list = page_soup.find("ul")

# One entry per <li>; city and state are constant for the whole page.
songs = [get_song_name(li.getText()) for li in songs_list.select('li')]
cities = ['Atlanta'] * len(songs)
states = ['Georgia'] * len(songs)

In [None]:
# Turn the Atlanta lists into a table, save it, and show a summary.
export_data_atlanta = pd.DataFrame(
    data={
        'City': cities,
        'State': states,
        'Song': songs,
    }
)
export_data_atlanta.to_csv(f'{cwd}/datasets/Data_atlanta.csv', index=False)
export_data_atlanta.info()

### Boston

In [None]:
# Boston: the songs are read from the first <ul> found in the document.
# NOTE(review): assumes that first <ul> is the song list — confirm against the page.
page_soup = load_page(base_URL + '/wiki/List_of_songs_about_Boston')

songs_list = page_soup.find("ul")

# One entry per <li>; city and state are constant for the whole page.
songs = [get_song_name(li.getText()) for li in songs_list.select('li')]
cities = ['Boston'] * len(songs)
states = ['Massachusetts'] * len(songs)

In [None]:
# Turn the Boston lists into a table, save it, and show a summary.
export_data_boston = pd.DataFrame(
    data={
        'City': cities,
        'State': states,
        'Song': songs,
    }
)
export_data_boston.to_csv(f'{cwd}/datasets/Data_boston.csv', index=False)
export_data_boston.info()

### Portland

In [None]:
# Portland: the songs are read from the first <ul> found in the document.
# NOTE(review): assumes that first <ul> is the song list — confirm against the page.
page_soup = load_page(base_URL + '/wiki/List_of_songs_about_Portland,_Oregon')

songs_list = page_soup.find("ul")

# One entry per <li>; city and state are constant for the whole page.
songs = [get_song_name(li.getText()) for li in songs_list.select('li')]
cities = ['Portland'] * len(songs)
states = ['Oregon'] * len(songs)

In [None]:
# Turn the Portland lists into a table, save it, and show a summary.
export_data_portland = pd.DataFrame(
    data={
        'City': cities,
        'State': states,
        'Song': songs,
    }
)
export_data_portland.to_csv(f'{cwd}/datasets/Data_portland.csv', index=False)
export_data_portland.info()

## Getting lat and long from GeoPy

In [None]:
# Shared Nominatim geocoder used by all geocoding cells below.
# NOTE(review): "my_app" is a generic user agent — consider a value that identifies this project.
geolocator = Nominatim(user_agent="my_app")

In [None]:
# Cities whose songs were scraped from a dedicated article and saved to their
# own CSV file above. Each entry is a dict with the keys 'city' and 'state'.
separate_cities = [
    {'city': city, 'state': state}
    for city, state in (
        ('Los Angeles', 'California'),
        ('Atlanta', 'Georgia'),
        ('Boston', 'Massachusetts'),
        ('Chicago', 'Illinois'),
        ('Detroit', 'Michigan'),
        ('Miami', 'Florida'),
        ('Nashville', 'Tennessee'),
        ('New Orleans', 'Louisiana'),
        ('New York City', 'New York'),
        ('Portland', 'Oregon'),
    )
]

In [None]:
# Geocode each separately scraped city once and stamp the coordinates onto
# every row of that city's CSV file.
#
# A failed request is retried a bounded number of times with a pause in
# between. The earlier version retried the same city forever, without any
# delay, whenever geocode() raised.
MAX_GEOCODE_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 2

for city_entry in separate_cities:
    city_name = city_entry["city"]
    query = f'{city_name},{city_entry["state"]} United States'

    loc = None
    for attempt in range(1, MAX_GEOCODE_ATTEMPTS + 1):
        try:
            loc = geolocator.geocode(query)
            break
        except Exception as error:
            print(f'🛑 Can\'t generate coordinates for {city_name} '
                  f'(attempt {attempt}/{MAX_GEOCODE_ATTEMPTS}): {error}')
            if attempt < MAX_GEOCODE_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)

    # geocode() yields None when nothing matches; loc is also None when every
    # attempt raised. Either way, leave this city's file untouched.
    if loc is None:
        print(f'🛑 No coordinates available for {city_name}, skipping')
        continue

    dataset_to_change_PATH = cwd + f'/datasets/Data_{normalized_city_name(city_name)}.csv'

    # The first CSV column is read as the index and written back as the index,
    # so the existing columns keep their order and lat/long are appended.
    data_to_change = pd.read_csv(dataset_to_change_PATH, index_col=[0])
    data_to_change['lat'] = loc.latitude
    data_to_change['long'] = loc.longitude
    data_to_change.to_csv(dataset_to_change_PATH)
    print(f'✅ Coordinates for {city_name} generated!')



## Get lat and long for main dataset

*Note: GeoPy allows for 2500 requests per day*

In [None]:
# Reload the main dataset from disk and work on a copy so the imported frame stays untouched.
main_dataset_import = pd.read_csv(f'{cwd}/datasets/Data_main.csv')
main_dataset = main_dataset_import.copy()
main_dataset

In [None]:
def _location_name(row):
    """Build the geocoder query string for one row from its City and State."""
    return f'{row["City"]}, {row["State"]} United States'


# Row-wise apply (axis=1): one query string per row.
main_dataset['Location Name'] = main_dataset.apply(_location_name, axis=1)
main_dataset

In [None]:
# Throttled geocoder: consecutive calls are spaced at least two seconds apart.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=2)

# Many rows share a location (one row per song), so each distinct query is
# sent once and the result is reused for every matching row. This avoids one
# request, and one two-second wait, per row.
unique_location_names = main_dataset['Location Name'].drop_duplicates()
location_lookup = {name: geocode(name) for name in unique_location_names}

main_dataset['Geo Location'] = main_dataset['Location Name'].apply(location_lookup.get)

# A failed lookup leaves None in 'Geo Location'; keep lat/long empty for those rows.
main_dataset['lat'] = main_dataset['Geo Location'].apply(lambda loc: loc.latitude if loc else None)
main_dataset['long'] = main_dataset['Geo Location'].apply(lambda loc: loc.longitude if loc else None)

In [None]:
# Drop the helper columns that were only needed for geocoding.
# NOTE(review): re-running this cell alone raises, because the columns are already gone.
main_dataset = main_dataset.drop(columns=['Location Name','Geo Location'])

In [None]:
# Overwrite Data_main.csv with the version that now carries the lat/long columns.
main_dataset.to_csv(cwd + '/datasets/Data_main.csv', index=False)

Removing unused columns for easier merge

In [None]:
# Summary of the remaining columns after the helper columns were dropped.
main_dataset.info()

## Merge all datasets

In [None]:
# Collect every CSV in the datasets folder. glob returns paths in arbitrary
# order, so they are sorted to make the row order of the merged file
# reproducible. The pattern is anchored at cwd like every other path here.
dataset_paths = sorted(glob.glob(cwd + '/datasets/*.csv'))

# Stack all datasets into one frame with a fresh 0..n-1 index.
merged_data = pd.concat(map(pd.read_csv, dataset_paths), ignore_index=True)
merged_data.to_csv(cwd + '/Data_merged.csv')
merged_data

In [None]:
# Column dtypes and non-null counts of the merged dataset.
merged_data.info()

In [None]:
# Copy the merged file to the sibling data_viz location.
# NOTE(review): if ../data_viz is not an existing directory, copy2 creates a file with that name — confirm the folder exists.
shutil.copy2(cwd + '/Data_merged.csv', cwd + '/../data_viz')