In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

In [2]:
## Opening the CSV file from the first DataFrame
df = pd.read_csv('events_data.csv')

In [4]:
## Checking the unique values from 'city' column
df['city'].unique()

array(['seattle', 'houston', 'newyork', 'detroit', 'losangeles',
       'orlando', 'sanfrancisco', 'sandiego'], dtype=object)

In [5]:
## cleanning data from 'city' column for better Web Scrapping later
df['city'] = df['city'].replace({'newyork':'new york',
                                 'losangeles':'los angeles',
                                 'sanfrancisco':'san francisco',
                                 'sandiego':'san diego'})

In [6]:
df['city'].unique()

array(['seattle', 'houston', 'new york', 'detroit', 'los angeles',
       'orlando', 'san francisco', 'san diego'], dtype=object)

In [10]:
## Creating a new DataFrame with only a few columns from the original DataFrame
## Assigning new column names
new_df = pd.DataFrame()
new_df[['event_name', 'date', 'description', 'line_up', 'event_link', 'place', 'city', 'event_coordinates']] = df[['event_name', 'date', 'description', 'lineup', 'link', 'place', 'city', 'coordinates']]

In [17]:
## Openning the second DataFrame
df_parks = pd.read_csv('parks_data.csv')

In [18]:
df_parks.head(2)

Unnamed: 0.1,Unnamed: 0,address,id,name,isBookable,isCampground,location,latitude,longitude,url,imgUrl,numCampsites
0,0,"140 Stewart Ave, 11237 New York, New York",115196040.0,Camp Gateway- Brooklyn Ny,True,True,"East New York, New York",40.5958,-73.8858,https://www.rei.com/campgrounds/camp/115196040...,https://cdn.recreation.gov/public/images/69035...,45.0
1,1,"140 Stewart Ave, 11237 New York, New York",115197387.0,Camp Gateway - Staten Island,True,True,"Bensonhurst, New York",40.6036,-74.0588,https://www.rei.com/campgrounds/camp/115197387...,https://cdn.recreation.gov/public/2018/08/16/2...,7.0


In [19]:
## Adding some columns from the second DataFrame to our new DataFrame
new_df[['park_name', 'park_link', 'park_latitude', 'park_longitude']] = df_parks[['name', 'url', 'latitude', 'longitude']]

In [20]:
new_df.head(2)

Unnamed: 0,event_name,date,description,line_up,event_link,place,city,event_coordinates,park_name,park_link,park_latitude,park_longitude
0,The Breakfast Club - New Years Day 2020 with J...,1/1/2020,We're back! What started at ETG has been carry...,"John Tejada, Tara Brooks, Jay Tripwire, Mary D...",https://www.residentadvisor.net/events/1358885,The Monkey Loft,seattle,"('47.577129', '-122.334469')",Camp Gateway- Brooklyn Ny,https://www.rei.com/campgrounds/camp/115196040...,40.5958,-73.8858
1,Chillography and Re-bar present Dub Techno All...,1/3/2020,The perfect way to chill after the end of 2019...,"Summit Dub, Alex Flores, Waste Management, Dan...",https://www.residentadvisor.net/events/1365599,Re-Bar,seattle,"('47.6166458', '-122.3310731')",Camp Gateway - Staten Island,https://www.rei.com/campgrounds/camp/115197387...,40.6036,-74.0588


## Web Scrapping TripAdvisor for attractions in each event city

In [25]:
## Function to insert Name of a city and return the TripAdvisor´s page for that city´s attractions url

def find_attraction(cities):
    """
    Function that receives a string with the name of a city 
    and returns a url from TripAdvisor´s Attractions pages.
    
    The function uses BeautifulSoup and Selenium to retrieve the specific
    page of attractions of a city from TripAdvisor
    
    """

cities = new_df['city'].unique()
cities_links = []
for city in cities:
    url = 'https://www.tripadvisor.com.br/Attractions'
    trip = BeautifulSoup(requests.get(url).content)
    driver = webdriver.Chrome()
    driver.get(url)
    element = driver.find_element_by_class_name('typeahead_input')
    element.send_keys(city)
    element.send_keys(Keys.RETURN)
    time.sleep(5)
    cities_links.append(str(driver.current_url))
    print (driver.current_url)
    driver.close()

https://www.tripadvisor.com.br/Attractions-g60878-Activities-Seattle_Washington.html
https://www.tripadvisor.com.br/Attractions-g56003-Activities-Houston_Texas.html
https://www.tripadvisor.com.br/Attractions-g60763-Activities-New_York_City_New_York.html
https://www.tripadvisor.com.br/Attractions-g42139-Activities-Detroit_Michigan.html
https://www.tripadvisor.com.br/Attractions-g32655-Activities-Los_Angeles_California.html
https://www.tripadvisor.com.br/Attractions-g34515-Activities-Orlando_Florida.html
https://www.tripadvisor.com.br/Attractions-g60713-Activities-San_Francisco_California.html
https://www.tripadvisor.com.br/Attractions-g60750-Activities-San_Diego_California.html


In [39]:
## Function to add new column with the city attraction links

def add_att_links(row):
    """
    This function insert the values from the 'cities_links' list,
    to a new column in the DataFrame by condition from the 'city' column
    
    """
    if row['city'] == 'seattle':
        return cities_links[0]
    elif row['city'] == 'houston':
        return cities_links[1]
    elif row['city'] == 'new york':
        return cities_links[2]
    elif row['city'] == 'detroit':
        return cities_links[3]
    elif row['city'] == 'los angeles':
        return cities_links[4]
    elif row['city'] == 'orlando':
        return cities_links[5]
    elif row['city'] == 'san francisco':
        return cities_links[6]
    elif row['city'] == 'san diego':
        return cities_links[7]

In [40]:
## Applying the function and creating a new column

new_df['attracion_link'] = new_df.apply(lambda row: add_att_links(row), axis=1)

In [42]:
new_df.sample(10)

Unnamed: 0,event_name,date,description,line_up,event_link,place,city,event_coordinates,park_name,park_link,park_latitude,park_longitude,attracion_link
884,Synthicide,2/13/2020,DANCEABLE SYNTH-DRIVEN SETS BY / / / / NGHT...,"NGHTCRWLR, Octonomy, Confines, Andi",https://www.residentadvisor.net/events/1371174,Bossa Nova Civic Club,new york,"('40.69797655', '-73.9279600167073')",Wildwood Picnic Site,https://www.rei.com/campgrounds/camp/115201894...,34.2944,-118.2407,https://www.tripadvisor.com.br/Attractions-g60...
1856,Lane 8 - Brightest Lights Tour (Friday),6/5/2020,$46.001st release **SOLD OUT**,Lane 8,https://www.residentadvisor.net/events/1328537,Brooklyn Mirage,new york,"('40.7105692', '-73.9263982')",Fremont Campground,https://www.rei.com/campgrounds/camp/115196800...,34.5431,-119.8203,https://www.tripadvisor.com.br/Attractions-g60...
2674,Technometrik After Hours Still Going,3/13/2020,$15.00Advance ticket,BERLIN TECHNO,https://www.residentadvisor.net/events/1393841,TBA - Downtown LA,los angeles,,,,,,https://www.tripadvisor.com.br/Attractions-g32...
1932,Newly Centered,1/4/2020,Newly centered is a culmination of Detroit art...,"CoveLove, Jams With Dave, Trey Priest, Tammy L...",https://www.residentadvisor.net/events/1365290,Tangent Gallery,detroit,"('42.3724111616162', '-83.0638446969697')",Wildwood Picnic Site,https://www.rei.com/campgrounds/camp/115201894...,34.2944,-118.2407,https://www.tripadvisor.com.br/Attractions-g42...
455,Mike Dunn / David Kiss b2b Willy Soul in Paradise,1/18/2020,Accomplished Chicago house DJ / producer Mike ...,"Mike Dunn, David Kiss, Willy Soul",https://www.residentadvisor.net/events/1373777,Paradise Club,new york,"('40.75923265', '-73.984077268162')",Theodore Roosevelt Home,https://www.rei.com/campgrounds/camp/115208172...,40.8855,-73.5012,https://www.tripadvisor.com.br/Attractions-g60...
3237,DJ Gigola,2/29/2020,"Thrilled to have our friend DJ Gigola, who is ...","DJ Gigola , Other Artists - TBA",https://www.residentadvisor.net/events/1394472,RS94109,san francisco,"('37.7854574807692', '-122.417853961538')",,,,,https://www.tripadvisor.com.br/Attractions-g60...
757,Make Mistakes Release Party with Russo / Kels ...,2/6/2020,"celebrating the release of ""Belly of the Whale...","Derek Russo, Kels Davidson, POSI-TRACK",https://www.residentadvisor.net/events/1378782,Flowers For All Occasions,new york,"('40.6937226', '-73.9298373')",Falcon Group,https://www.rei.com/campgrounds/camp/115194711...,33.6558,-117.4603,https://www.tripadvisor.com.br/Attractions-g60...
2791,Technometrik After Hours,3/28/2020,$15.00Advance ticket,BERLIN TECHNO,https://www.residentadvisor.net/events/1400230,Downtown Loc,los angeles,,,,,,https://www.tripadvisor.com.br/Attractions-g32...
1099,13-Side: Brooklyn,2/22/2020,13-Side makes there first official NYC debut a...,"13-Side, , L&L&L Record Club, , Kyle Ryvn",https://www.residentadvisor.net/events/1393063,The Paper Box,new york,"('40.71088525', '-73.9366188656301')",Paul Little Picnic Site,https://www.rei.com/campgrounds/camp/115201859...,34.2296,-118.1769,https://www.tripadvisor.com.br/Attractions-g60...
664,Deep Root Sessions At Public Arts with Dean Mi...,1/31/2020,This coming January 31st Deep Root Sessions we...,Dean Mickoski,https://www.residentadvisor.net/events/1374969,Public Arts,new york,"('40.723034', '-73.9920355')",Camp Gateway - Sandy Hook,https://www.rei.com/campgrounds/camp/115197386...,40.4492,-73.9961,https://www.tripadvisor.com.br/Attractions-g60...


In [35]:
## Saving the new DataFrame to a CSV file

new_df.to_csv('new_df.csv')

In [16]:
new_df = pd.read_csv('new_df.csv')

In [27]:
## Removing parenthesis from coordinates column
new_df['event_coordinates'] = new_df['event_coordinates'].str.strip('(')
new_df['event_coordinates'] = new_df['event_coordinates'].str.strip(')')
new_df.head(1)

Unnamed: 0.1,Unnamed: 0,event_name,date,description,line_up,event_link,place,city,event_coordinates,park_name,park_link,park_latitude,park_longitude,attracion_link
0,0,The Breakfast Club - New Years Day 2020 with J...,1/1/2020,We're back! What started at ETG has been carry...,"John Tejada, Tara Brooks, Jay Tripwire, Mary D...",https://www.residentadvisor.net/events/1358885,The Monkey Loft,seattle,"'47.577129', '-122.334469'",Camp Gateway- Brooklyn Ny,https://www.rei.com/campgrounds/camp/115196040...,40.5958,-73.8858,https://www.tripadvisor.com.br/Attractions-g60...


In [28]:
## Spliting the 'event_coordinates' into two new columns 'event_latitude' and 'event_longitude'
new_df[['event_latitude', 'event_longitude']] = new_df.event_coordinates.str.split(',', expand=True)

In [29]:
## Deleting the 'event_coordinates' column
del new_df['event_coordinates']

In [33]:
del new_df['Unnamed: 0']

In [34]:
new_df.head(1)

Unnamed: 0,event_name,date,description,line_up,event_link,place,city,park_name,park_link,park_latitude,park_longitude,attracion_link,event_latitude,event_longitude
0,The Breakfast Club - New Years Day 2020 with J...,1/1/2020,We're back! What started at ETG has been carry...,"John Tejada, Tara Brooks, Jay Tripwire, Mary D...",https://www.residentadvisor.net/events/1358885,The Monkey Loft,seattle,Camp Gateway- Brooklyn Ny,https://www.rei.com/campgrounds/camp/115196040...,40.5958,-73.8858,https://www.tripadvisor.com.br/Attractions-g60...,'47.577129','-122.334469'
