## Scraping using BeautifulSoup and selenium

If Selenium is not installed, run `pip install selenium`. <br>
Download the latest ChromeDriver from http://chromedriver.chromium.org/downloads 

In [1]:
import numpy as np
import pandas as pd
import requests
import selenium.webdriver.support.ui as ui
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
# Download the Wikipedia city list and turn the first sortable wikitable into
# a raw DataFrame: one row per <tr>, one column per <th>/<td> cell, followed by
# the href of the first link found in that row.
url = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"
res = requests.get(url, timeout=30)
res.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
soup = BeautifulSoup(res.text, "lxml")

table = soup.find(class_="wikitable sortable")
if table is None:
    raise ValueError("No 'wikitable sortable' table found at " + url)

rows = []
for tr in table.find_all("tr"):
    data = [item.get_text(strip=True) for item in tr.find_all(["th", "td"])]
    # Rows without any link get None instead of raising IndexError.
    first_link = tr.find("a")
    data.append(first_link.get("href") if first_link is not None else None)
    rows.append(data)

df = pd.DataFrame(rows)

In [3]:
# Preview the first five scraped rows; row 0 still holds the table's header cells.
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,2018rank,City,State[c],2018estimate,2010Census,Change,2016 land area,2016 population density,Location,#cite_note-5,,
1,1,New York City[d],New York,8398748,8175133,+2.74%,301.5 sq mi,780.9 km2,"28,317/sq mi","10,933/km2",40°39′49″N73°56′19″W﻿ / ﻿40.6635°N 73.9387°W﻿ ...,/wiki/New_York_City
2,2,Los Angeles,California,3990456,3792621,+5.22%,468.7 sq mi,"1,213.9 km2","8,484/sq mi","3,276/km2",34°01′10″N118°24′39″W﻿ / ﻿34.0194°N 118.4108°W...,/wiki/Los_Angeles
3,3,Chicago,Illinois,2705994,2695598,+0.39%,227.3 sq mi,588.7 km2,"11,900/sq mi","4,600/km2",41°50′15″N87°40′54″W﻿ / ﻿41.8376°N 87.6818°W﻿ ...,/wiki/Chicago
4,4,Houston[3],Texas,2325502,2100263,+10.72%,637.5 sq mi,"1,651.1 km2","3,613/sq mi","1,395/km2",29°47′12″N95°23′27″W﻿ / ﻿29.7866°N 95.3909°W﻿ ...,/wiki/Houston


## Cleaning the data

### Adding headings

In [4]:
# Drop the scraped header row (row 0) and give the columns readable names.
column_names = ['2018 Rank', 'City', 'State', '2018 estimate', '2010 census', 'Change',
                'Land area sq mi', 'Land area sq km',
                'Population density /sq mi', 'Population density /sq km',
                'Location', "Link"]

# The raw scraped frame still has positional (RangeIndex) column labels.
# Guarding on that keeps the cell idempotent: re-running it does not drop
# another city row once the columns have been named.
if isinstance(df.columns, pd.RangeIndex):
    # `inplace` was removed from set_axis in pandas 2.0; the default already
    # returns a new frame, so the keyword is simply omitted.
    df = df[1:].set_axis(column_names, axis='columns')

### Analysing the dataset
#### The City column contains footnote markers inside square brackets, which are not required, so we remove them with a regular expression. <br> We also remove the % sign from the Change column.

In [5]:
# Strip bracketed footnote markers such as "[d]" or "[3]" from the city names.
# regex=True is required: on pandas >= 2.0 the default is a literal match,
# which would leave the brackets untouched.
df['City'] = df['City'].str.replace(r"\[.*\]", "", regex=True)

# Rename the column to reflect its content, then drop the '%' sign.
df = df.rename(columns={'Change': 'Percentage of change'})
df['Percentage of change'] = df['Percentage of change'].str.strip('%')

#### Removing the units (sq mi and km2) that follow the values in the Land area columns.

In [6]:
# Land area in square miles: remove the "sq" and "mi" unit text and the
# thousands separators, then trim the whitespace left at the end.
land_area_mi = df['Land area sq mi']
for token in ('sq', 'mi', ','):
    land_area_mi = land_area_mi.str.replace(token, '')
df['Land area sq mi'] = land_area_mi.str.rstrip()

# Land area in square kilometres: same treatment for the "km2" unit.
land_area_km = df['Land area sq km']
for token in ('km2', ','):
    land_area_km = land_area_km.str.replace(token, '')
df['Land area sq km'] = land_area_km.str.rstrip()

#### Removing the commas from the numeric values

In [7]:
# Remove the thousands separators from both population count columns.
for population_column in ('2018 estimate', '2010 census'):
    df[population_column] = df[population_column].str.replace(',', '')

#### Removing commas and stripping the units for population density

In [8]:
# Density per square mile: drop commas, then the "/sq" and "mi" unit text,
# and finally trim the trailing whitespace.
density_per_sq_mi = (
    df['Population density /sq mi']
    .str.replace(',', '')
    .str.replace('/sq', '')
    .str.replace('mi', '')
)
df['Population density /sq mi'] = density_per_sq_mi.str.rstrip()

# Density per square kilometre: drop commas and the "/km2" unit text.
df['Population density /sq km'] = (
    df['Population density /sq km']
    .str.replace(',', '')
    .str.replace('/km2', '')
)

#### Removing the unwanted information enclosed in round brackets from the Location column

In [9]:
# Remove any parenthesised text from the Location column.
# regex=True is required: on pandas >= 2.0 the default is a literal match,
# so the pattern would otherwise never match anything.
df['Location'] = df['Location'].str.replace(r"\(.*\)", "", regex=True)

### Cleaned dataframe

In [10]:
# Display the cleaned DataFrame.
df

Unnamed: 0,2018 Rank,City,State,2018 estimate,2010 census,Percentage of change,Land area sq mi,Land area sq km,Population density /sq mi,Population density /sq km,Location,Link
1,1,New York City,New York,8398748,8175133,+2.74,301.5,780.9,28317,10933,40°39′49″N73°56′19″W﻿ / ﻿40.6635°N 73.9387°W﻿ ...,/wiki/New_York_City
2,2,Los Angeles,California,3990456,3792621,+5.22,468.7,1213.9,8484,3276,34°01′10″N118°24′39″W﻿ / ﻿34.0194°N 118.4108°W...,/wiki/Los_Angeles
3,3,Chicago,Illinois,2705994,2695598,+0.39,227.3,588.7,11900,4600,41°50′15″N87°40′54″W﻿ / ﻿41.8376°N 87.6818°W﻿ ...,/wiki/Chicago
4,4,Houston,Texas,2325502,2100263,+10.72,637.5,1651.1,3613,1395,29°47′12″N95°23′27″W﻿ / ﻿29.7866°N 95.3909°W﻿ ...,/wiki/Houston
5,5,Phoenix,Arizona,1660272,1445632,+14.85,517.6,1340.6,3120,1200,33°34′20″N112°05′24″W﻿ / ﻿33.5722°N 112.0901°W...,"/wiki/Phoenix,_Arizona"
6,6,Philadelphia,Pennsylvania,1584138,1526006,+3.81,134.2,347.6,11683,4511,40°00′34″N75°08′00″W﻿ / ﻿40.0094°N 75.1333°W﻿ ...,/wiki/Philadelphia
7,7,San Antonio,Texas,1532233,1327407,+15.43,461.0,1194.0,3238,1250,29°28′21″N98°31′30″W﻿ / ﻿29.4724°N 98.5251°W﻿ ...,/wiki/San_Antonio
8,8,San Diego,California,1425976,1307402,+9.07,325.2,842.3,4325,1670,32°48′55″N117°08′06″W﻿ / ﻿32.8153°N 117.1350°W...,/wiki/San_Diego
9,9,Dallas,Texas,1345047,1197816,+12.29,340.9,882.9,3866,1493,32°47′36″N96°45′59″W﻿ / ﻿32.7933°N 96.7665°W﻿ ...,/wiki/Dallas
10,10,San Jose,California,1030119,945942,+8.90,177.5,459.7,5777,2231,37°17′48″N121°49′08″W﻿ / ﻿37.2967°N 121.8189°W...,"/wiki/San_Jose,_California"


#### Adding columns to store the long description (LD) and short description (SD) of each city

In [11]:
# Add empty-string placeholder columns for the long (LD) and short (SD)
# descriptions.
df = df.assign(LD="", SD="")

## Scraping data using selenium from "https://www.lonelyplanet.com/search" to get Long Description and Short Description of every city above

In [12]:
# Look up every city on Lonely Planet and store its long intro text (LD) and
# short strapline (SD) in df. The scrape is best-effort: a city whose search or
# page lookup fails keeps its current LD/SD values and is listed at the end.
url1 = "https://www.lonelyplanet.com/search"
driver = webdriver.Chrome()
wait = ui.WebDriverWait(driver, 3)
skipped_cities = []
try:
    for city in df.City:

        driver.get(url1)
        try:
            # find_element(s)(By..., ...) replaces the find_element(s)_by_*
            # helpers, which were removed in Selenium 4.
            search_box = driver.find_elements(By.CSS_SELECTOR, '.styles__searchInput___1rY7_.styles__searchInput___2eyJw.js-gtm-search-input')[0]
            search_box.send_keys(city)
            search_box.send_keys(Keys.ENTER)
            # Wait (up to the WebDriverWait timeout) for a result heading to appear.
            wait.until(lambda d: d.find_element(By.XPATH, '//h3[@class="styles__heading___vDzID"]'))
            # Open the first result whose heading text contains the city name.
            k = driver.find_element(By.XPATH, '//h3[@class="styles__heading___vDzID" and contains(text(), "'+city+'")]')
            k.click()
            short = driver.find_elements(By.CSS_SELECTOR, '.masthead__strapline.masthead__strapline--visible')
            try:
                long = driver.find_element(By.XPATH, '//span[@class="js-intro-narrative"]/p')
            except NoSuchElementException:
                # No <p> inside the intro span: fall back to the span itself.
                long = driver.find_element(By.XPATH, '//span[@class="js-intro-narrative"]')

            try:
                df.loc[df['City']==city, 'LD'] = long.text
            except WebDriverException:
                # The element could not be read; store a blank description.
                df.loc[df['City']==city, 'LD'] = " "

            try:
                df.loc[df['City']==city, 'SD'] = short[0].text
            except (IndexError, WebDriverException):
                # No strapline on the page, or it could not be read.
                df.loc[df['City']==city, 'SD'] = " "
            print(city)
        except Exception:
            # Deliberately best-effort: remember the city and carry on.
            # Unlike a bare `except:`, this lets KeyboardInterrupt stop the run.
            skipped_cities.append(city)
finally:
    # Always close the browser, even if the loop is interrupted.
    driver.quit()

if skipped_cities:
    print("Skipped (lookup failed): " + ", ".join(map(str, skipped_cities)))

New York City
Los Angeles
Chicago
Houston
Phoenix
Philadelphia
San Antonio
San Diego
Dallas
San Jose
Austin
Jacksonville
Fort Worth
San Francisco
Charlotte
Indianapolis
Seattle
Denver
Boston
Detroit
Nashville
Portland
Memphis
Oklahoma City
Las Vegas
Louisville
Baltimore
Milwaukee
Albuquerque
Tucson
Fresno
Sacramento
Atlanta
Kansas City
Colorado Springs
Miami
Raleigh
Omaha
Long Beach
Virginia Beach
Oakland
Minneapolis
Tulsa
Arlington
Tampa
New Orleans
Wichita
Bakersfield
Anaheim
Honolulu
Santa Ana
Lexington
Henderson
Cincinnati
Pittsburgh
Anchorage
Lincoln
Orlando
Irvine
Newark
Durham
Laredo
Madison
Lubbock
Scottsdale
Reno
Boise
Richmond
Baton Rouge
Spokane
Des Moines
San Bernardino
Modesto
Birmingham
Fayetteville
Rochester
Huntington Beach
Grand Rapids
Montgomery
Huntsville
Augusta
Tallahassee
Tempe
McKinney
Mobile
Shreveport
Frisco
Knoxville
Brownsville
Vancouver
Fort Lauderdale
Sioux Falls
Ontario
Chattanooga
Providence
Newport News
Salem
Eugene
Fort Collins
Jackson
Lancaster
Salinas

In [13]:
# Display the DataFrame again, now including the scraped LD and SD columns.
df

Unnamed: 0,2018 Rank,City,State,2018 estimate,2010 census,Percentage of change,Land area sq mi,Land area sq km,Population density /sq mi,Population density /sq km,Location,Link,LD,SD
1,1,New York City,New York,8398748,8175133,+2.74,301.5,780.9,28317,10933,40°39′49″N73°56′19″W﻿ / ﻿40.6635°N 73.9387°W﻿ ...,/wiki/New_York_City,Epicenter of the arts. Architectural darling. ...,is looking skyward in awe
2,2,Los Angeles,California,3990456,3792621,+5.22,468.7,1213.9,8484,3276,34°01′10″N118°24′39″W﻿ / ﻿34.0194°N 118.4108°W...,/wiki/Los_Angeles,"Ruggedly good looking, deeply creative, with a...",is cinematic cityscapes
3,3,Chicago,Illinois,2705994,2695598,+0.39,227.3,588.7,11900,4600,41°50′15″N87°40′54″W﻿ / ﻿41.8376°N 87.6818°W﻿ ...,/wiki/Chicago,"Steely skyscrapers, top chefs, rocking festiva...",is full of architectural wonders
4,4,Houston,Texas,2325502,2100263,+10.72,637.5,1651.1,3613,1395,29°47′12″N95°23′27″W﻿ / ﻿29.7866°N 95.3909°W﻿ ...,/wiki/Houston,"Think laid-back, pick-up truck and boot-scooti...",is the heart of Texas
5,5,Phoenix,Arizona,1660272,1445632,+14.85,517.6,1340.6,3120,1200,33°34′20″N112°05′24″W﻿ / ﻿33.5722°N 112.0901°W...,"/wiki/Phoenix,_Arizona",Phoenix is Arizona's indubitable cultural and ...,
6,6,Philadelphia,Pennsylvania,1584138,1526006,+3.81,134.2,347.6,11683,4511,40°00′34″N75°08′00″W﻿ / ﻿40.0094°N 75.1333°W﻿ ...,/wiki/Philadelphia,Blessed with the glamour and culture of a big ...,is history come to life
7,7,San Antonio,Texas,1532233,1327407,+15.43,461.0,1194.0,3238,1250,29°28′21″N98°31′30″W﻿ / ﻿29.4724°N 98.5251°W﻿ ...,/wiki/San_Antonio,Tourism has been good to San Antonio and the s...,
8,8,San Diego,California,1425976,1307402,+9.07,325.2,842.3,4325,1670,32°48′55″N117°08′06″W﻿ / ﻿32.8153°N 117.1350°W...,/wiki/San_Diego,"New York has its cabbie, Chicago its bluesman ...","is surf, suds and sunsets"
9,9,Dallas,Texas,1345047,1197816,+12.29,340.9,882.9,3866,1493,32°47′36″N96°45′59″W﻿ / ﻿32.7933°N 96.7665°W﻿ ...,/wiki/Dallas,Dallas and Fort Worth may be next-door neighbo...,
10,10,San Jose,California,1030119,945942,+8.90,177.5,459.7,5777,2231,37°17′48″N121°49′08″W﻿ / ﻿37.2967°N 121.8189°W...,"/wiki/San_Jose,_California","Though culturally diverse and historical, San ...",


In [14]:
# Write the final DataFrame (index included) to Cleaned_data.csv, a path relative to the working directory.
df.to_csv("Cleaned_data.csv")