Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
SLACE93 committed Jul 9, 2016
2 parents ab12cf4 + fdde915 commit b2b1eea
Show file tree
Hide file tree
Showing 18 changed files with 727 additions and 134 deletions.
86 changes: 86 additions & 0 deletions ProcesamientoTwitterTimestamp.py
@@ -0,0 +1,86 @@
import pandas as pd
import dateutil.parser as dateparser
import datetime
import time


def procesamiento_Timestamp():
    """Add time-derived columns to the tweet table and save the result.

    Reads ``tweets_depurated_noText.csv``, runs the ``timestamstamp_ms``
    column through each helper listed below (one new column per helper)
    and writes the enlarged table to ``ArchivoCategorizadoTimestamp.csv``.
    """
    tweets = pd.read_csv('tweets_depurated_noText.csv')
    marcas = tweets['timestamstamp_ms']

    # (new column, helper) pairs, in the order the columns are appended.
    derivadas = (
        ('weekday', valorWeekDay),
        ('horas', valorHoras),
        ('day', valorDay),
        ('Sol_Weekend', valorDayWeekend),
        ('Fecha', Fecha),
        ('diaSemana', DiaSemana),
        ('HoraTweet', ZonaHoraria),
    )
    for columna, funcion in derivadas:
        tweets[columna] = marcas.apply(funcion)

    tweets.to_csv('ArchivoCategorizadoTimestamp.csv')

def ZonaHoraria(x):
    """Return the local time of day of ``x`` (epoch milliseconds) as ``HH:MM:SS``."""
    segundos = x / 1000.0
    return datetime.datetime.fromtimestamp(segundos).strftime("%H:%M:%S")

def DiaSemana(x):
    """Return the full weekday name (``%A``) of ``x`` (epoch milliseconds) in local time."""
    momento = datetime.datetime.fromtimestamp(x / 1000.0)
    nombre_dia = momento.strftime('%A')
    return nombre_dia

def Fecha(x):
    """Return the local calendar date (``datetime.date``) of ``x`` (epoch milliseconds)."""
    return datetime.datetime.fromtimestamp(x / 1000.0).date()


def valorWeekDay(x):
    """Return the local weekday index of ``x`` (epoch milliseconds); Monday is 0, Sunday is 6."""
    return datetime.datetime.fromtimestamp(x / 1000.0).weekday()


def valorHoras(x):
    """Return the local hour of day (0-23) of ``x`` (epoch milliseconds)."""
    momento = datetime.datetime.fromtimestamp(x / 1000.0)
    return momento.hour


def valorDay(x):
    """Return 1 when the local hour of ``x`` is within 7..18 inclusive, otherwise 0."""
    hora = valorHoras(x)
    return 1 if 7 <= hora <= 18 else 0



def valorDayWeekend(x):
    """Flag timestamps that fall in daytime on a weekend day.

    ``x`` is an epoch timestamp in milliseconds, interpreted in the local
    timezone. Returns 1 when the hour is within 7..18 inclusive AND the
    weekday index is 4, 5 or 6 (Friday, Saturday or Sunday); returns 0
    otherwise.

    The debug ``print`` statements that used to run for every row were
    removed: they flooded stdout when this function is applied to a whole
    column and were Python 2-only syntax.
    """
    # Convert once and read both the hour and the weekday from it.
    momento = datetime.datetime.fromtimestamp(x / 1000.0)
    es_dia = 7 <= momento.hour <= 18
    # Friday (4) is deliberately kept in the "weekend" set, as before.
    es_weekend = momento.weekday() in (4, 5, 6)
    return 1 if es_dia and es_weekend else 0


# Run the timestamp processing only when this file is executed as a script.
if __name__ == '__main__':
    procesamiento_Timestamp()








21 changes: 21 additions & 0 deletions Tareas del Proyecto.txt
@@ -0,0 +1,21 @@
Voice - Setear el número de tweets por voice

Cuántos usuarios hay en el dataset

Filtrar tipos de usuario -> futuro

Sacar el ranking por voice y número de tweets

Hacer un top 5

Sacar el degree promedio del grafo

Colocarle pesos a los arcos

Filtrar stopwords en inglés

Filtrar lugares que son de personas locales

Filtrar lugares que son de personas internacionales

Top lugares de mayor helpful comment
Binary file added sourceCode/.DS_Store
Binary file not shown.
Binary file modified sourceCode/scrapyAlquileres/.DS_Store
Binary file not shown.
23 changes: 23 additions & 0 deletions sourceCode/scrapyAlquileres/ProcesamientoAlquiler.py
@@ -0,0 +1,23 @@
import pandas as pd
import string


def _categoria_precio(precio):
    """Map one price to its category label.

    Bands are contiguous: above 300 is 'Expensive', above 200 up to 300 is
    'Normal', above 100 up to 200 is 'economics', anything else (including
    NaN, for which every comparison is False) is 'unexpensive'.
    """
    if precio > 300:
        return 'Expensive'
    if precio > 200:
        return 'Normal'
    if precio > 100:
        return 'economics'
    return 'unexpensive'


def categoriasAlquileres():
    """Categorise rental prices and save the annotated table.

    Reads ``Alquileres.csv`` (first column as index), casts ``precio`` to
    float, adds a ``categorias`` column with the label for each price and
    writes the result to ``alquileres_categorias.csv``.

    Prices of exactly 300 or 200 used to match none of the strict ranges
    and were labelled 'unexpensive'; they now fall in 'Normal' and
    'economics' respectively.
    """
    datos = pd.read_csv('Alquileres.csv', index_col=0)
    datos['precio'] = datos['precio'].astype(float)
    datos['categorias'] = datos['precio'].map(_categoria_precio)
    datos.to_csv('alquileres_categorias.csv')


# Run the price categorisation only when this file is executed as a script.
if __name__ == '__main__':
    categoriasAlquileres()


24 changes: 24 additions & 0 deletions sourceCode/scrapyAlquileres/TransformarDatosAlquilerCsv.py
@@ -0,0 +1,24 @@
import json
import csv
import string
def getDataJson():
    """Convert the scraped rentals JSON into a CSV file.

    Reads the list of items in ``salidaAlquileres.json`` and writes
    ``Alquileres.csv`` with the columns ``name,longitude,latitude,precio``.
    For each item, ``posicion[0]`` goes to the longitude column and
    ``posicion[1]`` to the latitude column; commas are removed from
    ``tituloLugar`` so the name cannot break the column layout.

    Fixes over the previous version:
    * the header line now ends with a newline, so the first data row no
      longer fuses with it;
    * the output file is closed deterministically via ``with``;
    * an item is skipped when EITHER coordinate is None (before, only when
      both were None, which let the text "None" into the file).
    """
    with open('salidaAlquileres.json') as dataFile:
        dataJson = json.load(dataFile)

    with open('Alquileres.csv', 'w') as csvAlquileres:
        csvAlquileres.write('name,longitude,latitude,precio\n')

        for element in dataJson:
            location = element['posicion']
            lng = location[0]
            lat = location[1]
            if lng is None or lat is None:
                continue

            name = element['tituloLugar'].replace(',', '')
            campos = (name, lng, lat, element['precio'])
            csvAlquileres.write(','.join(str(campo) for campo in campos) + '\n')

# Run the JSON-to-CSV conversion only when this file is executed as a script.
if __name__ == '__main__':
    getDataJson()
66 changes: 66 additions & 0 deletions sourceCode/scrapyAlquileres/crawlerhelper.py
@@ -0,0 +1,66 @@
__author__ = 'cesar17'
import HTMLParser
import unicodedata
import unidecode
import string

# Shared parser instance; the get_parsed_* helpers call its unescape() on the cleaned text.
htmlparser = HTMLParser.HTMLParser()

def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128."""
    for caracter in s:
        if ord(caracter) >= 128:
            return False
    return True

def clean_parsed_string(string):
    """Return ``string`` as an ASCII-only ``str``, or None when it is empty.

    Text that is already ASCII is passed through ``str()`` unchanged.
    Otherwise it is NFKD-normalised and encoded to ASCII with the
    ``'ignore'`` error handler, which drops whatever cannot be encoded.
    """
    if len(string) == 0:
        return None
    if is_ascii(string):
        return str(string)
    normalizado = unicodedata.normalize('NFKD', string)
    return str(normalizado.encode('ascii', 'ignore'))

def get_parsed_string(selector, xpath):
    """Extract the first match of ``xpath`` from ``selector`` as cleaned text.

    The first extracted value is stripped, transliterated with
    ``unidecode``, has newlines and double quotes removed and commas
    replaced by spaces, and is finally passed through
    ``htmlparser.unescape``. Returns '' when the xpath matches nothing.
    """
    encontrados = selector.xpath(xpath).extract()
    if len(encontrados) == 0:
        return ''

    texto = unidecode.unidecode(encontrados[0].strip())
    # (old, new) substitutions, applied in this order.
    for viejo, nuevo in (('\n', ''), ('\"', ''), (',', ' ')):
        texto = texto.replace(viejo, nuevo)
    return htmlparser.unescape(texto)

def get_parsed_rating(selector, xpath):
    """Extract the first match of ``xpath`` as a cleaned rating string.

    Applies the same cleaning as for ordinary text (strip, ``unidecode``,
    remove newlines and double quotes, commas to spaces) and additionally
    removes the ' de 5 estrellas' suffix before ``htmlparser.unescape``.
    Returns '' when the xpath matches nothing.
    """
    encontrados = selector.xpath(xpath).extract()
    if len(encontrados) == 0:
        return ''

    valor = unidecode.unidecode(encontrados[0].strip())
    valor = (
        valor.replace('\n', '')
        .replace('\"', '')
        .replace(',', ' ')
        .replace(' de 5 estrellas', '')
    )
    return htmlparser.unescape(valor)

def get_parsed_string_multiple(selector, xpath):
    """Extract every match of ``xpath`` from ``selector`` as a list of cleaned strings.

    Falsy extracted values are skipped. Each remaining value is stripped,
    transliterated with ``unidecode`` and has newlines and double quotes
    removed. Commas are kept here.
    """
    extraidos = selector.xpath(xpath).extract()
    return [
        unidecode.unidecode(fragmento.strip()).replace('\n', '').replace('\"', '')
        for fragmento in extraidos
        if fragmento
    ]
22 changes: 7 additions & 15 deletions sourceCode/scrapyAlquileres/itemsAlquiler.py
Expand Up @@ -13,28 +13,20 @@ class itemsAlquiler(sc.Item):
out_processor = Join(),

)
tituloComentario = sc.Field(
input_processor = MapCompose(unicode.strip,
lambda x:unidecode.unidecode(x),
lambda y:string.replace(y,'\n','')),
out_processor = Join(),
)
latitud = sc.Field()
longitud = sc.Field()
posicion = sc.Field()
estrellas = sc.Field(
input_processor = MapCompose(unicode.strip,
lambda x:x.replace('de 5 estrellas','')),
out_processor = Join(),
)
comentarios = sc.Field(
input_processor = MapCompose(unicode.strip,
lambda x:unidecode.unidecode(x),
lambda y:string.replace(y,'\n','')),
out_processor = Join(),

)

precio = sc.Field(
input_processor = MapCompose(unicode.strip,
lambda x:unidecode.unidecode(x)),
out_processor = Join(),
)
categoria = sc.Field()
itemsReviews = sc.Field()
class itemsReviews(sc.Item):
    """Item declaring two plain fields with no processors attached.

    NOTE(review): judging by the names, these hold a review's title and
    its comment text - confirm against the spider that fills them.
    """
    tituloComentario = sc.Field()  # presumably the review title
    comentarios = sc.Field()  # presumably the review body text

0 comments on commit b2b1eea

Please sign in to comment.