Merge branch 'master' of https://github.com/KevinOrtiz/An-lisis-Exploratorio-de-datos
KevinOrtiz · Jul 9, 2016 · b2b1eea · b2b1eea
2 parents ab12cf4 + fdde915
commit b2b1eea
Show file tree

Hide file tree

Showing 18 changed files with 727 additions and 134 deletions.
diff --git a/ProcesamientoTwitterTimestamp.py b/ProcesamientoTwitterTimestamp.py
@@ -0,0 +1,86 @@
+import pandas as pd
+import dateutil.parser as dateparser
+import datetime
+import time
+
+
+def procesamiento_Timestamp():
+	"""Add time-derived columns to the tweet table and save it.
+
+	Reads 'tweets_depurated_noText.csv', builds one new column per helper
+	by applying it to the 'timestamstamp_ms' column, and writes the
+	augmented table to 'ArchivoCategorizadoTimestamp.csv'.
+	"""
+	# NOTE(review): the source column is spelled 'timestamstamp_ms' here;
+	# confirm that this matches the header of the input CSV.
+	origen = 'timestamstamp_ms'
+	# (new column, helper) pairs, applied in exactly this order.
+	derivadas = [
+		('weekday', valorWeekDay),
+		('horas', valorHoras),
+		('day', valorDay),
+		('Sol_Weekend', valorDayWeekend),
+		('Fecha', Fecha),
+		('diaSemana', DiaSemana),
+		('HoraTweet', ZonaHoraria),
+	]
+
+	tabla = pd.read_csv('tweets_depurated_noText.csv')
+	for columna, funcion in derivadas:
+		tabla[columna] = tabla[origen].apply(funcion)
+
+	tabla.to_csv('ArchivoCategorizadoTimestamp.csv')
+
+def ZonaHoraria(x):
+	"""Return the clock time of a millisecond timestamp as 'HH:MM:SS'.
+
+	x is divided by 1000.0 and handed to datetime.fromtimestamp without a
+	tz argument, so the time is expressed in the machine's local timezone.
+	"""
+	return datetime.datetime.fromtimestamp(x / 1000.0).strftime("%H:%M:%S")
+
+def DiaSemana(x):
+	"""Return the weekday name ('%A') of a millisecond timestamp.
+
+	The timestamp is converted with datetime.fromtimestamp (local time);
+	the name is whatever strftime('%A') yields under the active locale.
+	"""
+	return datetime.datetime.fromtimestamp(x / 1000.0).strftime('%A')
+
+def Fecha(x):
+	"""Return the local calendar date (datetime.date) of a millisecond timestamp."""
+	return datetime.datetime.fromtimestamp(x / 1000.0).date()
+
+
+def valorWeekDay(x):
+	"""Return the weekday number of a millisecond timestamp.
+
+	Uses datetime.weekday() on the local time, so Monday is 0 and
+	Sunday is 6.
+	"""
+	return datetime.datetime.fromtimestamp(x / 1000.0).weekday()
+
+
+def valorHoras(x):
+	"""Return the local hour of day (0-23) of a millisecond timestamp."""
+	return datetime.datetime.fromtimestamp(x / 1000.0).hour
+
+
+def valorDay(x):
+	"""Return 1 when the timestamp's hour is within 7..18 inclusive, else 0.
+
+	The hour comes from valorHoras, i.e. the local hour of the
+	millisecond timestamp x.
+	"""
+	return 1 if 7 <= valorHoras(x) <= 18 else 0
+
+
+
+def valorDayWeekend(x):
+	"""Return 1 for a daytime timestamp on Friday, Saturday or Sunday, else 0.
+
+	"Daytime" is the same 7..18 inclusive hour window that valorDay
+	encodes, so that helper is reused instead of repeating the comparison.
+	The debug print statements that previously ran once per row were
+	removed: they flooded stdout when this function is applied to a whole
+	column and carried no information for callers.
+
+	x is a timestamp in milliseconds; the return value is the int 0 or 1.
+	"""
+	# datetime.weekday() numbers Monday as 0, so 4, 5 and 6 are Friday,
+	# Saturday and Sunday. Friday is deliberately kept in the set, exactly
+	# as in the original condition.
+	esWeekend = valorWeekDay(x) in (4, 5, 6)
+
+	if esWeekend and valorDay(x):
+		return 1
+	return 0
+
+
+# Script entry point: build the categorized timestamp CSV when run directly.
+if __name__ == '__main__':
+	procesamiento_Timestamp()
+
+
+
+
+
+
+
+
diff --git a/Tareas del Proyecto.txt b/Tareas del Proyecto.txt
@@ -0,0 +1,21 @@
+Voice - Setear el número de tweets por voice
+
+¿Cuántos usuarios hay en el dataset?
+
+Filtrar tipos de usuario -> futuro
+
+Sacar el ranking por voice y numero de tweet
+
+Hacer un top 5
+
+Sacar el degree promedio del grafo
+
+Colocarle pesos a los arcos
+
+Filtrar stopwords en inglés
+
+Filtrar lugares que son de personas locales
+
+Filtrar lugares que son de personas internacionales
+
+Top lugares de mayor helpful comment
diff --git a/sourceCode/.DS_Store b/sourceCode/.DS_Store
diff --git a/sourceCode/scrapyAlquileres/.DS_Store b/sourceCode/scrapyAlquileres/.DS_Store
diff --git a/sourceCode/scrapyAlquileres/ProcesamientoAlquiler.py b/sourceCode/scrapyAlquileres/ProcesamientoAlquiler.py
@@ -0,0 +1,23 @@
+import pandas as pd
+import string
+
+
+def _categoria_precio(precio):
+	"""Map a numeric price to its category label.
+
+	Thresholds are checked from highest to lowest with a single strict
+	comparison each, so every price falls in exactly one band:
+	> 300 'Expensive', > 200 'Normal', > 100 'economics', otherwise
+	'unexpensive'. A NaN price fails every comparison and therefore ends
+	up as 'unexpensive', as it did before.
+	"""
+	if precio > 300:
+		return 'Expensive'
+	if precio > 200:
+		return 'Normal'
+	if precio > 100:
+		return 'economics'
+	return 'unexpensive'
+
+
+def categoriasAlquileres():
+	"""Label every rental in 'Alquileres.csv' with a price category.
+
+	Reads 'Alquileres.csv' (first column as index), casts 'precio' to
+	float, adds a 'categorias' column and writes the table to
+	'alquileres_categorias.csv'.
+
+	The previous chained conditions used strict inequalities on both
+	sides of each band (x<300 and x>200, ...), so a price of exactly 300
+	or 200 matched no band and was labelled 'unexpensive'. Those boundary
+	prices now belong to the band directly below the threshold
+	(300 -> 'Normal', 200 -> 'economics').
+	"""
+	datos = pd.read_csv('Alquileres.csv', index_col=0)
+
+	datos['precio'] = datos['precio'].astype(float)
+
+	datos['categorias'] = datos['precio'].map(_categoria_precio)
+
+	datos.to_csv('alquileres_categorias.csv')
+
+
+# Script entry point: categorize the rentals CSV when run directly.
+if __name__ == '__main__':
+	categoriasAlquileres()
+
+
diff --git a/sourceCode/scrapyAlquileres/TransformarDatosAlquilerCsv.py b/sourceCode/scrapyAlquileres/TransformarDatosAlquilerCsv.py
@@ -0,0 +1,24 @@
+import json
+import csv
+import string
+def getDataJson():
+	"""Convert 'salidaAlquileres.json' into the 'Alquileres.csv' table.
+
+	Each JSON element contributes one 'name,longitude,latitude,precio'
+	row built from its 'tituloLugar', 'posicion' (index 0 = longitude,
+	index 1 = latitude) and 'precio' entries. Commas are stripped from the
+	name so it cannot break the column layout.
+
+	Fixes over the previous version:
+	- the header row now ends with a newline; before, the first data row
+	  was appended to the header line;
+	- the output file is opened in a 'with' block so it is always flushed
+	  and closed;
+	- an element is skipped when EITHER coordinate is missing; before, it
+	  was skipped only when both were None, so a half-missing position
+	  was written out as the text 'None'.
+	"""
+	with open('salidaAlquileres.json') as dataFile:
+		dataJson = json.load(dataFile)
+
+	with open('Alquileres.csv', 'w') as csvAlquileres:
+		csvAlquileres.write('name,longitude,latitude,precio\n')
+
+		for element in dataJson:
+			location = element['posicion']
+			lng = location[0]
+			lat = location[1]
+			if lng is None or lat is None:
+				continue
+
+			name = element['tituloLugar'].replace(',', '')
+			precio = element['precio']
+			# NOTE(review): under Python 2, str() on a unicode title with
+			# non-ASCII characters raises UnicodeEncodeError - confirm the
+			# scraped titles are already ASCII.
+			campos = [str(name), str(lng), str(lat), str(precio)]
+			csvAlquileres.write(','.join(campos) + '\n')
+
+# Script entry point: regenerate Alquileres.csv when run directly.
+if __name__ == '__main__':
+	getDataJson()
diff --git a/sourceCode/scrapyAlquileres/crawlerhelper.py b/sourceCode/scrapyAlquileres/crawlerhelper.py
@@ -0,0 +1,66 @@
+__author__ = 'cesar17'
+import HTMLParser
+import unicodedata
+import unidecode
+import string
+
+# Shared parser instance; get_parsed_string and get_parsed_rating use only
+# its unescape() method.
+htmlparser = HTMLParser.HTMLParser()
+
+def is_ascii(s):
+	"""Return True when every character of s has a code point below 128.
+
+	An empty s yields True.
+	"""
+	for caracter in s:
+		if ord(caracter) >= 128:
+			return False
+	return True
+
+def clean_parsed_string(string):
+	"""Return string as a str reduced to ASCII, or None when it is empty.
+
+	Text that is_ascii already accepts is passed to str() unchanged.
+	Anything else is NFKD-normalized and encoded to ASCII with
+	errors='ignore' (characters with no ASCII form are dropped) before
+	the str() conversion.
+	"""
+	if len(string) == 0:
+		return None
+	if is_ascii(string):
+		return str(string)
+	reducida = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore')
+	return str(reducida)
+
+def get_parsed_string(selector, xpath):
+	"""Return the first xpath match as cleaned text, or '' when none match.
+
+	The first value of selector.xpath(xpath).extract() is stripped,
+	transliterated with unidecode, has newlines and double quotes removed
+	and commas replaced by spaces, and is finally passed through
+	htmlparser.unescape.
+	"""
+	encontrados = selector.xpath(xpath).extract()
+	if len(encontrados) == 0:
+		return ''
+
+	texto = unidecode.unidecode(encontrados[0].strip())
+	# (search, replacement) pairs, applied in this order.
+	for buscado, reemplazo in (('\n', ''), ('\"', ''), (',', ' ')):
+		texto = texto.replace(buscado, reemplazo)
+	return htmlparser.unescape(texto)
+
+def get_parsed_rating(selector, xpath):
+	"""Return the first xpath match as a cleaned rating, or '' when none match.
+
+	The first value of selector.xpath(xpath).extract() is stripped,
+	transliterated with unidecode, has newlines and double quotes removed,
+	commas replaced by spaces and the ' de 5 estrellas' suffix text
+	removed, and is finally passed through htmlparser.unescape.
+	"""
+	encontrados = selector.xpath(xpath).extract()
+	if len(encontrados) == 0:
+		return ''
+
+	texto = unidecode.unidecode(encontrados[0].strip())
+	# (search, replacement) pairs, applied in this order.
+	sustituciones = (
+		('\n', ''),
+		('\"', ''),
+		(',', ' '),
+		(' de 5 estrellas', ''),
+	)
+	for buscado, reemplazo in sustituciones:
+		texto = texto.replace(buscado, reemplazo)
+	return htmlparser.unescape(texto)
+
+def get_parsed_string_multiple(selector, xpath):
+	"""Return every non-empty xpath match as a list of cleaned strings.
+
+	Each truthy value of selector.xpath(xpath).extract() is stripped,
+	transliterated with unidecode and has newlines and double quotes
+	removed. Commas and HTML entities are left untouched here. The
+	result is an empty list when nothing matches.
+	"""
+	limpias = []
+	for fragmento in selector.xpath(xpath).extract():
+		if not fragmento:
+			continue
+		texto = unidecode.unidecode(fragmento.strip())
+		limpias.append(texto.replace('\n', '').replace('\"', ''))
+	return limpias
diff --git a/sourceCode/scrapyAlquileres/itemsAlquiler.py b/sourceCode/scrapyAlquileres/itemsAlquiler.py
@@ -13,28 +13,20 @@ class itemsAlquiler(sc.Item):
         			out_processor = Join(),
 
     )
-    tituloComentario = sc.Field(
-        			input_processor = MapCompose(unicode.strip,
-        				lambda x:unidecode.unidecode(x),
-        				lambda y:string.replace(y,'\n','')),
-        			out_processor = Join(),
-   )
-    latitud = sc.Field()
-    longitud = sc.Field()
+    posicion = sc.Field()
     estrellas = sc.Field(
         			input_processor = MapCompose(unicode.strip,
         			lambda x:x.replace('de 5 estrellas','')),
         			out_processor = Join(),
    )
-    comentarios = sc.Field(
-        			input_processor = MapCompose(unicode.strip,
-        				lambda x:unidecode.unidecode(x),
-        				lambda y:string.replace(y,'\n','')),
-        			out_processor = Join(),
-
-   )
+
     precio = sc.Field(
         			input_processor = MapCompose(unicode.strip,
         			lambda x:unidecode.unidecode(x)),
         			out_processor = Join(),
     )
+    categoria = sc.Field()
+    itemsReviews = sc.Field()
+class itemsReviews(sc.Item):
+    # Item with two plain fields; neither declares input/output processors.
+    # NOTE(review): the names suggest a review's title and body - confirm
+    # against the spider that fills them.
+    tituloComentario = sc.Field()
+    comentarios = sc.Field()