Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
SLACE93 committed Jul 6, 2016
2 parents 4ab5253 + b5e88a6 commit 392183a
Show file tree
Hide file tree
Showing 8 changed files with 177 additions and 48 deletions.
21 changes: 21 additions & 0 deletions sourceCode/actividades_pandas.py
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-

__author__ = 'josanvel'

import pandas as pd
import numpy as np
from pandas import Series

def add_category_actividades(
        input_path="../data/TripAdvisor/nuevo_data_actividades.csv",
        output_path='../data/TripAdvisor/data_actividades_categorizado.csv'):
    """Add a ``category`` column to the activities CSV based on ``rating``.

    The input CSV is read with pandas, its ``rating`` column is converted to
    float, and every row is labelled:

    * ``'Excelente'`` when ``4 < rating <= 5``
    * ``'Bueno'`` when ``2 < rating <= 4``
    * ``'Regular'`` for everything else (including missing ratings, since
      every comparison against NaN is false)

    The resulting dataframe is written to ``output_path``. The dataframe
    index is written as the first (unnamed) column, as ``to_csv`` does by
    default.

    Args:
        input_path: CSV file to read; must contain a ``rating`` column.
        output_path: CSV file to write the categorized data to.
    """
    def categorize(rating):
        # Chained comparisons keep the same open/closed bounds as before.
        if 4 < rating <= 5:
            return 'Excelente'
        if 2 < rating <= 4:
            return 'Bueno'
        return 'Regular'

    actividades = pd.read_csv(input_path)

    # Ratings must be numeric before they can be compared to the thresholds.
    actividades['rating'] = actividades['rating'].astype(float)
    actividades['category'] = actividades['rating'].map(categorize)

    # Save the categorized activities dataframe to a new CSV file.
    actividades.to_csv(output_path)


if __name__ == '__main__':
    add_category_actividades()
46 changes: 46 additions & 0 deletions sourceCode/convert_json_actividad.py
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-

__author__ = 'josanvel'

import json
import csv
import string

def get_fields_json(
        json_path='scrapy_hotel/scrapy_hotel/spiders/data_actividades_tag.json',
        csv_path='../data/TripAdvisor/nuevo_data_actividades.csv'):
    """Flatten the scraped activities JSON into a CSV file.

    Each JSON element is expected to provide the keys ``location`` (a
    two-item sequence: longitude, latitude), ``name``, ``rating``,
    ``reviews`` and ``tag``. One CSV row is written per element with the
    columns ``name,longitude,latitude,rating,NoReviews,tags``.

    Elements are skipped when both coordinates are ``None`` or when the
    rating is empty/missing. Commas are removed from the name so it cannot
    break the comma-separated row. Tags are serialized in a single column
    with every tag prefixed by ``$`` (e.g. ``$aaaaa$ggggg$ooooo``).

    Args:
        json_path: Path of the JSON file to read.
        csv_path: Path of the CSV file to create (overwritten if present).
    """
    # Load the activities JSON file.
    with open(json_path) as data_file:
        data_json = json.load(data_file)

    # The context manager guarantees the CSV is flushed and closed, even if
    # an element is malformed and raises halfway through.
    with open(csv_path, 'w') as csv_file:
        # CSV header.
        csv_file.write('name,longitude,latitude,rating,NoReviews,tags\n')

        for element in data_json:
            # Activity position as coordinates.
            location = element['location']
            lng = location[0]
            lat = location[1]
            if lng is None and lat is None:
                continue

            # Only rated activities are exported; check before doing any
            # further work on the element.
            rating = element['rating']
            if not rating:
                continue

            # str.replace works on both Python 2 and 3, unlike the
            # Python 2-only string.replace() function.
            name = element['name'].replace(',', '')
            no_reviews = len(element['reviews'])
            # Every tag is prefixed with '$': $aaaaa$ggggg$ooooo
            line_tags = ''.join('$' + tag for tag in element['tag'])

            fields = [str(name), str(lng), str(lat), str(rating),
                      str(no_reviews), line_tags]
            csv_file.write(','.join(fields) + '\n')


if __name__ == '__main__':
    get_fields_json()
41 changes: 41 additions & 0 deletions sourceCode/convert_json_hotel.py
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-

__author__ = 'josanvel'

import json
import csv
import string

def get_fields_json(
        json_path='scrapy_hotel/scrapy_hotel/spiders/data_hotel_completo.json',
        csv_path='../data/TripAdvisor/nuevo_data_hotel.csv'):
    """Flatten the scraped hotels JSON into a CSV file.

    Each JSON element is expected to provide the keys ``location`` (a
    two-item sequence: longitude, latitude), ``name``, ``rating`` and
    ``reviews``. One CSV row is written per element with the columns
    ``name,longitude,latitude,rating,NoReviews``.

    Elements are skipped when both coordinates are ``None`` or when the
    rating is empty/missing. Commas are removed from the name so it cannot
    break the comma-separated row.

    Args:
        json_path: Path of the JSON file to read.
        csv_path: Path of the CSV file to create (overwritten if present).
    """
    # Load the hotels JSON file.
    with open(json_path) as data_file:
        data_json = json.load(data_file)

    # The context manager guarantees the CSV is flushed and closed, even if
    # an element is malformed and raises halfway through.
    with open(csv_path, 'w') as csv_file:
        # CSV header.
        csv_file.write('name,longitude,latitude,rating,NoReviews\n')

        for element in data_json:
            # Hotel position as coordinates.
            location = element['location']
            lng = location[0]
            lat = location[1]
            if lng is None and lat is None:
                continue

            # Only rated hotels are exported; check before doing any
            # further work on the element.
            rating = element['rating']
            if not rating:
                continue

            # str.replace works on both Python 2 and 3, unlike the
            # Python 2-only string.replace() function.
            name = element['name'].replace(',', '')
            no_reviews = len(element['reviews'])

            fields = [str(name), str(lng), str(lat), str(rating),
                      str(no_reviews)]
            csv_file.write(','.join(fields) + '\n')


if __name__ == '__main__':
    get_fields_json()
21 changes: 21 additions & 0 deletions sourceCode/hoteles_pandas.py
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-

__author__ = 'josanvel'

import pandas as pd
import numpy as np
from pandas import Series

def add_category_hotels(
        input_path="../data/TripAdvisor/nuevo_data_hotel.csv",
        output_path='../data/TripAdvisor/data_hotel_categorizado.csv'):
    """Add a ``category`` column to the hotels CSV based on ``rating``.

    The input CSV is read with pandas, its ``rating`` column is converted to
    float, and every row is labelled:

    * ``'Excelente'`` when ``4 < rating <= 5``
    * ``'Bueno'`` when ``2 < rating <= 4``
    * ``'Regular'`` for everything else (including missing ratings, since
      every comparison against NaN is false)

    The resulting dataframe is written to ``output_path``. The dataframe
    index is written as the first (unnamed) column, as ``to_csv`` does by
    default.

    Args:
        input_path: CSV file to read; must contain a ``rating`` column.
        output_path: CSV file to write the categorized data to.
    """
    def categorize(rating):
        # Chained comparisons keep the same open/closed bounds as before.
        if 4 < rating <= 5:
            return 'Excelente'
        if 2 < rating <= 4:
            return 'Bueno'
        return 'Regular'

    hotels = pd.read_csv(input_path)

    # Ratings must be numeric before they can be compared to the thresholds.
    hotels['rating'] = hotels['rating'].astype(float)
    hotels['category'] = hotels['rating'].map(categorize)

    # Save the categorized hotels dataframe to a new CSV file.
    hotels.to_csv(output_path)


if __name__ == '__main__':
    add_category_hotels()
26 changes: 7 additions & 19 deletions sourceCode/scrapy_hotel/scrapy_hotel/items.py
@@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-

__author__ = 'josanvel'

# Define here the models for your scraped items
#
# See documentation in:
Expand All @@ -11,25 +13,9 @@
import unidecode
import string


class ScrapyItem(Item):
class TripAdvisorReviewItem(Item):
# define the fields for your item here like:
# name = scrapy.Field()

name = scrapy.Field(
input_processor = MapCompose(unicode.strip, lambda x: unidecode.unidecode(x)),
output_processor = Join(),
)
rating = Field(
input_processor = MapCompose(unicode.strip, lambda x: x.replace(' de 5 estrellas','')),
output_processor = Join(),
)
location = Field()
review = Field()


class TripAdvisorReviewItem(Item):

date = Field()
title = Field(
input_processor = MapCompose(unicode.strip,
Expand All @@ -47,7 +33,8 @@ class TripAdvisorReviewItem(Item):
)

class TripAdvisorItem(Item):

# define the fields for your item here like:
# name = scrapy.Field()
url = Field()
name = Field(
input_processor = MapCompose(unicode.strip, lambda x: unidecode.unidecode(x)),
Expand All @@ -58,4 +45,5 @@ class TripAdvisorItem(Item):
output_processor = Join(),
)
location = Field()
reviews = Field()
reviews = Field()
tag = Field()
2 changes: 2 additions & 0 deletions sourceCode/scrapy_hotel/scrapy_hotel/spiders/crawlerhelper.py
Expand Up @@ -29,6 +29,7 @@ def get_parsed_string(selector, xpath):
raw_string = unidecode.unidecode(raw_string)
raw_string = string.replace(raw_string, '\n', '')
raw_string = string.replace(raw_string, '\"', '')
raw_string = string.replace(raw_string, ',', ' ')
return_string = htmlparser.unescape(raw_string)
return return_string

Expand All @@ -43,6 +44,7 @@ def get_parsed_raiting(selector, xpath):
raw_string = unidecode.unidecode(raw_string)
raw_string = string.replace(raw_string, '\n', '')
raw_string = string.replace(raw_string, '\"', '')
raw_string = string.replace(raw_string, ',', ' ')
raw_string = string.replace(raw_string, ' de 5 estrellas', '')
return_string = htmlparser.unescape(raw_string)
return return_string
Expand Down
@@ -1,5 +1,7 @@
# -*- encoding: utf-8 -*-
# author: Jose Velez

__author__ = 'josanvel'

import re
import time
from scrapy.spider import BaseSpider
Expand All @@ -15,7 +17,7 @@
# Constants.
# Max reviews pages to crawl.
# Reviews collected are around: 5 * MAX_REVIEWS_PAGES
MAX_REVIEWS_PAGES = 50
MAX_REVIEWS_PAGES = 500

class tripAdvisorScrapper(BaseSpider):
name = "tripadvisor_hotel"
Expand All @@ -32,7 +34,7 @@ def parse(self, response):
sel = Selector(response)
#Selector de todos las Actividades
snode_cosas_que_hacers = sel.xpath('//div[@id="FILTERED_LIST"]//div[starts-with(@class, "entry")]')

# Iteracion de cada actividad en la pagina semilla
for snode_cosas_que_hacer in snode_cosas_que_hacers:
#========Instanciar el item Actividades
Expand All @@ -41,22 +43,19 @@ def parse(self, response):
url_name = clean_parsed_string(get_parsed_string(snode_cosas_que_hacer, './/div[contains(@class, "property_title")]/a/@href'))
#========Obtener el nombre del Actividades
tripadvisor_item['name'] = clean_parsed_string(get_parsed_string(snode_cosas_que_hacer, './/div[contains(@class, "property_title")]/a/text()'))

if url_name:
#========Concatenar la URL del Actividades
url_name = self.base_uri + url_name
yield Request(url=url_name, meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_search_page)

#Obtener la URL de la pagina siguiente (PAGINACION)
next_page_actividades = clean_parsed_string(get_parsed_string(sel, '//a[starts-with(@class, "nav next rndBtn ui_button primary taLnk")]/@href'))

if next_page_actividades and len(next_page_actividades) > 0:
#========Concatenar la URL de la paginacion de Actividades
url_actividades = self.base_uri + next_page_actividades
yield Request(url=url_actividades, meta={'tripadvisor_item': tripadvisor_item, 'counter_page_actividades' : 0}, callback=self.parse_pagination)
# Limites de numero de paginacion
else:
yield tripadvisor_item


#Funcion que obtiene los elementos del review
Expand All @@ -72,7 +71,6 @@ def parse_pagination(self, response):

#Selector de todas las Actividades
snode_cosas_que_hacers = sel.xpath('//div[@id="FILTERED_LIST"]//div[starts-with(@class, "entry")]')

# Iteracion de cada Actividades en la pagina semilla
for snode_cosas_que_hacer in snode_cosas_que_hacers:
#========Instanciar el item Actividades
Expand All @@ -81,23 +79,19 @@ def parse_pagination(self, response):
url_name = clean_parsed_string(get_parsed_string(snode_cosas_que_hacer, './/div[contains(@class, "property_title")]/a/@href'))
#========Obtener el nombre del Actividades
tripadvisor_item['name'] = clean_parsed_string(get_parsed_string(snode_cosas_que_hacer, './/div[contains(@class, "property_title")]/a/text()'))

if url_name:
#========Concatenar la URL del Actividades
url_name = self.base_uri + url_name
yield Request(url=url_name, meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_search_page)

#Obtener la URL de la pagina siguiente (PAGINACION)
next_page_actividades = clean_parsed_string(get_parsed_string(sel, '//a[starts-with(@class, "nav next rndBtn ui_button primary taLnk")]/@href'))

if next_page_actividades and len(next_page_actividades) > 0:
#========Concatenar la URL de la paginacion de Actividades
url_actividades = self.base_uri + next_page_actividades
yield Request(url=url_actividades, meta={'tripadvisor_item': tripadvisor_item, 'counter_page_actividades' : counter_page_actividades}, callback=self.parse_pagination)
# Limites de numero de paginacion
else:
yield tripadvisor_item



# Find the rating, the location, and the links to the reviews.
Expand All @@ -111,16 +105,32 @@ def parse_search_page(self, response):
tripadvisor_item['rating'] = clean_parsed_string(get_parsed_raiting(sel, '//*[@id="HEADING_GROUP"]/div/div[2]/div[1]/div/span/img/@alt'))
lng = clean_parsed_string(get_parsed_string(sel, '//*[@id="NEARBY_TAB"]/div/div[1]/div[3]/@data-lng'))
lat = clean_parsed_string(get_parsed_string(sel, '//*[@id="NEARBY_TAB"]/div/div[1]/div[3]/@data-lat'))
pos = [str(lng), str(lat)] #Almacena la Longitud y la Latitud del Actividades y lo guarda en una lista
if lng is None and lat is None:
pos = [lng, lat]
else:
pos = [str(lng), str(lat)] #Almacena la Longitud y la Latitud del Actividades y lo guarda en una lista
#========Obtener la ubicacion del Actividades
tripadvisor_item['location'] = pos

#========Obtener el tag del Actividades
list_tags = []
#========Obtengo el selector de tags
tags = sel.xpath('//*[@id="HEADING_GROUP"]/div/div[3]/div/div[@class="detail"]/a')
for tag in tags:
#===============Obtengo string del tag
elem = clean_parsed_string(get_parsed_string(tag, './text()'))
#===============Almaceno el tag en una lista de tags
list_tags.append(elem)

#========Obtener los tags de Actividades
tripadvisor_item['tag'] = list_tags

expanded_review_url = clean_parsed_string(get_parsed_string(sel, '//div[contains(@class, "basic_review")]//a/@href'))
if expanded_review_url:
#========Concatenar la URL del titulo del review
url_review = self.base_uri + expanded_review_url
yield Request(url=url_review, meta={'tripadvisor_item': tripadvisor_item, 'counter_page_review' : 0}, callback=self.parse_fetch_review)
#Aunque no tenga Review aun asi guarda registro del Hotel
#Aunque no tenga Review aun asi se guarda registro de la Actividad
else:
yield tripadvisor_item

Expand All @@ -147,6 +157,7 @@ def parse_fetch_review(self, response):
tripadvisor_review_item['title'] = clean_parsed_string(get_parsed_string(snode_review, 'div[@class="quote"]/text()'))
if tripadvisor_review_item['title'] is None:
tripadvisor_review_item['title'] = clean_parsed_string(get_parsed_string(snode_review, 'div[@class="quote"]/a/span/text()'))

#========Obtener la descripcion del review del Actividades
tripadvisor_review_item['description'] = get_parsed_string_multiple(snode_review, 'div[@class="entry"]/p/text()')
#========Guardar el titulo y la descripcion del review del Actividades
Expand Down

0 comments on commit 392183a

Please sign in to comment.