In [1]:
import csv
import pandas as pd
import os.path

In [2]:
# Every entity (state) listed on metroscubicos.com, in display form.
# The lowercase/dashed variants used in file names are derived from this list.
entidades = [
    'Aguascalientes', 'Baja California', 'Baja California Sur', 'Campeche',
    'Chiapas', 'Chihuahua', 'Coahuila', 'Colima',
    'Distrito Federal', 'Durango', 'Estado De México', 'Guanajuato',
    'Guerrero', 'Hidalgo', 'Jalisco', 'Michoacán',
    'Morelos', 'Nayarit', 'Nuevo León', 'Oaxaca',
    'Puebla', 'Querétaro', 'Quintana Roo', 'San Luis Potosí',
    'Sinaloa', 'Sonora', 'Tabasco', 'Tamaulipas',
    'Tlaxcala', 'Veracruz', 'Yucatán', 'Zacatecas',
]

In [3]:
# Slug form of each entity: lowercase with spaces replaced by dashes.
# These slugs are the base names of the per-entity CSV files.
entidadesLower = [
    nombre.lower().replace(' ', '-')
    for nombre in entidades
]

In [2]:
# Directory prefix (with trailing slash) prepended to every CSV file name below
path = "./Resources/"

### Combine the per-entity CSV files of links

In [5]:
# One link CSV per entity: <path><entity-slug>.csv
inputs = [f"{path}{slug}.csv" for slug in entidadesLower]

In [6]:
# Destination of the merged link table
links_output = f"{path}combinedLinks.csv"

In [7]:
# Union of the header names found in the link CSVs, in first-seen order
# (the files are not guaranteed to share the exact same columns).
columnNames = []
for csv_path in inputs:
    with open(csv_path, "r", newline="", encoding="utf8") as src:
        header_row = next(csv.reader(src))  # only the first row is needed
    for name in header_row:
        if name in columnNames:
            continue
        columnNames.append(name)

columnNames

['ID', 'entidad', 'tipo', 'link']

In [8]:
# Stack every per-entity link table into a single DataFrame
combined_csv = pd.concat(pd.read_csv(csv_path) for csv_path in inputs)

In [9]:
# Preview the first rows of the merged links table.
# Indexing the frame with itself (`combined_csv[combined_csv]`) treats the
# frame as a boolean mask and fails on non-boolean data; `.head()` yields the
# five-row preview this cell is meant to show.
combined_csv.head()

Unnamed: 0,ID,entidad,tipo,link
0,1425548080,aguascalientes,venta,https://casa.metroscubicos.com/MLM-1425548080-...
1,1397663987,aguascalientes,casa,https://casa.metroscubicos.com/MLM-1397663987-...
2,1398422703,aguascalientes,casa,https://casa.metroscubicos.com/MLM-1398422703-...
3,1413775652,aguascalientes,bonita,https://casa.metroscubicos.com/MLM-1413775652-...
4,1411133552,aguascalientes,casa,https://casa.metroscubicos.com/MLM-1411133552-...


In [13]:
# Re-read every per-entity link file and stack them into one table
combined_csv = pd.concat(map(pd.read_csv, inputs))
# Persist the merged table; the positional index is not written out
combined_csv.to_csv(links_output, index=False)

### Process for webscraped files

In [5]:
# Expected web-scrape result file for every entity ...
inputsW = [f"{path}{slug}-webscrapeResults.csv" for slug in entidadesLower]
# ... narrowed down to the ones that are actually present on disk
inputsExist = list(filter(os.path.exists, inputsW))

In [6]:
# Destination of the merged web-scrape results
results_output = f"{path}combinedResults.csv"

In [7]:
# Collect the union of header names across the existing web-scrape result
# files, preserving first-seen order so the combined file has stable columns.
columnNames = []
for filename in inputsExist:
    with open(filename, "r", newline="", encoding="utf8") as f_in:
        reader = csv.reader(f_in)
        # An empty file has no header row: next() without a default would raise
        # StopIteration and abort the whole merge, even though the copy step
        # (csv.DictReader) copes with empty files. Treat it as "no columns".
        headers = next(reader, [])
    for h in headers:
        if h not in columnNames:
            columnNames.append(h)

In [8]:
# Append the rows of every existing result file to the combined CSV.
# Rows are read as dicts keyed by each file's own header, so columns a file
# lacks are left empty under the shared `columnNames` header.
with open(results_output, "w", newline="", encoding="utf8") as merged:
    writer = csv.DictWriter(merged, fieldnames=columnNames)
    writer.writeheader()
    for source_path in inputsExist:
        with open(source_path, "r", newline="", encoding="utf8") as source:
            writer.writerows(csv.DictReader(source))

### Combine the FinalData CSV files

In [5]:
# Property categories; each one has its own <category>FinalData.csv file
types = ['casas', 'depas', 'terrenos']
inputs = [f"{path}{kind}FinalData.csv" for kind in types]
inputs

['./Resources/casasFinalData.csv',
 './Resources/depasFinalData.csv',
 './Resources/terrenosFinalData.csv']

In [7]:
# Union of the header names of the FinalData files, in first-seen order.
# "utf-8-sig" drops a leading byte-order mark so it cannot end up inside the
# first column name.
columnNames = []
for source_csv in inputs:
    with open(source_csv, "r", newline="", encoding="utf-8-sig") as handle:
        first_row = next(csv.reader(handle))
    for column in first_row:
        if column not in columnNames:
            columnNames.append(column)

columnNames

['tipo',
 'precio',
 'predPrices',
 'diffPrices',
 'm2Terreno',
 'm2Construccion',
 'estacionamientos',
 'Banos',
 'antiguedad',
 'colonia',
 'municipio_x',
 'entidad',
 'codigo_postal',
 'latitud',
 'longitud',
 'percentDiff']

In [8]:
# Destination of the merged FinalData table
data_output = f"{path}inmueblesFinalData.csv"

In [11]:
# Merge the FinalData files row by row under the shared `columnNames` header.
# Each source is parsed with its own header, so missing columns stay blank.
with open(data_output, "w", newline="", encoding="utf-8-sig") as merged:
    writer = csv.DictWriter(merged, fieldnames=columnNames)
    writer.writeheader()
    for source_csv in inputs:
        with open(source_csv, "r", newline="", encoding="utf-8-sig") as source:
            for record in csv.DictReader(source):
                writer.writerow(record)

In [12]:
# Load the merged FinalData file back into pandas and preview it
data_df = pd.read_csv(f"{path}inmueblesFinalData.csv")
data_df.head()

Unnamed: 0,tipo,precio,predPrices,diffPrices,m2Terreno,m2Construccion,estacionamientos,Banos,antiguedad,colonia,municipio_x,entidad,codigo_postal,latitud,longitud,percentDiff
0,casas,6500000.0,5470000.0,-1030000.0,320.0,200.0,2.0,4.0,2.0,Los Pocitos,Aguascalientes,Aguascalientes,20328.0,21.923098,-102.343269,
1,casas,1990000.0,2050000.0,60000.0,119.0,112.0,2.0,3.0,0.0,Puesta del Sol,Aguascalientes,Aguascalientes,20326.0,21.877844,-102.332253,
2,casas,2750000.0,3100000.0,350000.0,178.0,163.0,2.0,3.0,0.0,La Cantera,Aguascalientes,Aguascalientes,,,,
3,casas,1280000.0,1320000.0,40000.0,75.0,135.0,1.0,2.0,6.0,Real De Haciendas,Aguascalientes,Aguascalientes,,,,
4,casas,810000.0,1440000.0,630000.0,200.0,200.0,2.0,2.0,1.0,Jesús Teran,Aguascalientes,Aguascalientes,,,,


In [14]:
import json

In [21]:
# DataFrame.to_json already returns serialized JSON text (a list of row
# objects, with missing values emitted as null). Passing that text through
# json.dumps again would wrap it in quotes and escape it, so the file would
# hold a single JSON string instead of an array of records.
data = data_df.to_json(orient="records")
json_string = data

In [22]:
# Write the pre-serialized JSON text to the static assets folder
with open('./static/js/data.json', 'w') as json_file:
    json_file.write(json_string)

In [23]:
# Row-wise records for JSON export. Missing cells come out of to_dict() as
# float NaN, which json.dump would write as the bare token `NaN` -- not valid
# JSON, so strict parsers reject the file. Map missing values to None so they
# serialize as null.
data_dict = data = [
    {key: (None if pd.isna(value) else value) for key, value in record.items()}
    for record in data_df.to_dict("records")
]

In [25]:
# Directly from dictionary
with open('./static/js/data.json', 'w') as outfile:
    json.dump(data_dict, outfile)