In [1]:
import requests
import pandas as pd
from opnieuw import retry
from requests.exceptions import HTTPError, ProxyError
from tinydb import TinyDB, Query
import concurrent.futures
import os

In [3]:
# Get Wallapop Used Cars
@retry(
    retry_on_exceptions=(ConnectionError, HTTPError, ProxyError),
    max_calls_total=4,
    retry_window_after_first_call_in_seconds=60,
)
def get_listings(step=0, offset=0, start=0, category_ids=100, latitude='40.428207', longitude='-3.679739', search_id='095355bf-701b-40b6-b37a-1740ef9aca5a'):
    url = f"https://api.wallapop.com/api/v3/cars/search?offset={offset}&experiment=not_logged_experiment&filters_source=default_filters&latitude={latitude}&start={start}&time_filter=lastMonth&order_by=most_relevance&step={step}&category_ids={category_ids}&longitude={longitude}&search_id={search_id}"
    payload = {}
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'es,en-GB;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'DeviceOS': '0',
        'MPID': '8966409109343412630',
        'Origin': 'https://es.wallapop.com',
        'Pragma': 'no-cache',
        'Referer': 'https://es.wallapop.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        'X-AppVersion': '82350',
        'X-DeviceID': 'fa090a97-4fb2-4423-ae97-bdd5688c75b6',
        'X-DeviceOS': '0',
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"'
    }
    response = requests.request("GET", url, headers=headers, data=payload)
    if response.status_code == 200:
        return response.json()
    else:
        print(f'Status Code {response.status_code}')


In [4]:
# Get Wallapop User Details
@retry(
  retry_on_exceptions=(ConnectionError, HTTPError, ProxyError),
  max_calls_total=4,
  retry_window_after_first_call_in_seconds=60,
)
def get_user(user_id):
  url = f"https://api.wallapop.com/api/v3/users/{user_id}"
  payload = {}
  headers = {
    'Accept': 'application/json, text/plain, */*',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Origin': 'https://es.wallapop.com',
    'Pragma': 'no-cache',
    'Referer': 'https://es.wallapop.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'accept-language': 'es,en-GB;q=0.9,en-US;q=0.8,en;q=0.7',
    'deviceos': '0',
    'mpid': '8966409109343412630',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'x-appversion': '82350',
    'x-deviceid': 'fa090a97-4fb2-4423-ae97-bdd5688c75b6',
    'x-deviceos': '0'
  }
  response = requests.request("GET", url, headers=headers, data=payload)
  if response.status_code == 200:
    return response.json()
  else:
    print(f'Status Code {response.status_code}')


In [6]:
# Get Wallapop Listing details (HMTL)
@retry(
    retry_on_exceptions=(ConnectionError, HTTPError, ProxyError),
    max_calls_total=4,
    retry_window_after_first_call_in_seconds=60,
)
def get_listing_details(listing_slug):
    url = f"https://es.wallapop.com/item/{listing_slug}"
    payload = {}
    headers = {}
    response = requests.request("GET", url, headers=headers, data=payload)
    if response.status_code == 200:
        return response.content

In [7]:
def iterate_listings(coordinates):
    db=TinyDB(f'D:/Proyecto ironhack/files/datos_coordenadas/listings -{coordinates["latitude"]} - {coordinates["longitude"]}.json')
    table=db.table("listings")
    Serie=Query()
    offset = start = 0
    i = 0
    while True:
        i+=1
        print(f'Iteracion en pagina {i} - Offset {offset}')
        response = get_listings(step=0, offset=offset, start=start, category_ids=100, latitude=coordinates['latitude'], longitude=coordinates['longitude'], search_id='095355bf-701b-40b6-b37a-1740ef9aca5a')
        offset += 40
        start += 40
        if response.get('search_objects', []):
            for listing in response['search_objects']:
                car_listing = {
                    'id': listing['id'],
                    'type': listing['type'],
                    'content_id': listing['content']['id'],
                    'title': listing['content']['title'],
                    'storytelling': listing['content']['storytelling'],
                    'distance': listing['content']['distance'],
                    'user_id': listing['content']['user']['id'],
                    'kind': listing['content']['user']['kind'],
                    'flag_pending': listing['content']['flags']['pending'],
                    'flag_sold': listing['content']['flags']['sold'],
                    'flag_reserved': listing['content']['flags']['reserved'],
                    'flag_banned': listing['content']['flags']['banned'],
                    'flag_expired': listing['content']['flags']['expired'],
                    'flag_onhold': listing['content']['flags']['onhold'],
                    'flag_bumped': listing['content']['visibility_flags']['bumped'],
                    'flag_highlighted': listing['content']['visibility_flags']['highlighted'],
                    'flag_urgent': listing['content']['visibility_flags']['urgent'],
                    'flag_country_bumped': listing['content']['visibility_flags']['country_bumped'],
                    'flag_boosted': listing['content']['visibility_flags']['boosted'],
                    'price': listing['content'].get('price', None),
                    'currency': listing['content'].get('currency', None),
                    'web_slug': listing['content'].get('web_slug', None),
                    'category_id': listing['content'].get('category_id', None),
                    'brand': listing['content'].get('brand', None),
                    'model': listing['content'].get('model', None),
                    'year': listing['content'].get('year', None),
                    'version': listing['content'].get('version', None),
                    'km': listing['content'].get('km', None),
                    'engine': listing['content'].get('engine', None),
                    'gearbox': listing['content'].get('gearbox', None),
                    'horsepower': listing['content'].get('horsepower', None),
                    'favorited': listing['content'].get('favorited', None),
                    'creation_date': listing['content'].get('creation_date', None),
                    'modification_date': listing['content'].get('modification_date', None),
                    'location': listing['content'].get('location', None),
                    'supports_shipping': listing['content'].get('supports_shipping', None),
                }
                table.upsert(car_listing, Serie.id==car_listing.get('content_id', 'N/A'))
        else:
            break
    return f'Ended loop for lat: {coordinates["latitude"]}, long: {coordinates["longitude"]}'

In [8]:
def iterate_users(users):
    db=TinyDB("D:/Proyecto ironhack/files/users.json")
    table=db.table("users")
    Serie=Query()
    i=0
    for user_id in users:
        i+=1
        print(f'Obteniendo usuario {i}/{len(users)} - ID: {user_id}')
        # time.sleep(random.random()*2)
        response_user = get_user(user_id)
        table.upsert(response_user, Serie.id==response_user.get('id', 'N/A'))

In [9]:
coordenadas = pd.read_csv('D:/Proyecto ironhack/files/spain_coordenates_mesh.csv')
coordinates_list = coordenadas.to_dict('records')

In [10]:
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_row = {executor.submit(iterate_listings, coordinates): coordinates for coordinates in coordinates_list}
    for future in concurrent.futures.as_completed(future_to_row):
        print(future.result())

Iteracion en pagina 1 - Offset 0Iteracion en pagina 1 - Offset 0

Iteracion en pagina 1 - Offset 0
Iteracion en pagina 1 - Offset 0
Iteracion en pagina 1 - Offset 0
Iteracion en pagina 1 - Offset 0
Iteracion en pagina 1 - Offset 0
Iteracion en pagina 1 - Offset 0
Iteracion en pagina 1 - Offset 0
Iteracion en pagina 1 - Offset 0
Iteracion en pagina 1 - Offset 0
Iteracion en pagina 1 - Offset 0
Ended loop for lat: 37.52724783798975, long: -8.592725468277985
Iteracion en pagina 1 - Offset 0
Ended loop for lat: 37.92574052737508, long: -8.7272935006179
Ended loop for lat: 39.11852854496106, long: -9.147287195662392
Iteracion en pagina 1 - Offset 0
Iteracion en pagina 1 - Offset 0
Ended loop for lat: 38.43201659916498, long: -8.358176642713174
Ended loop for lat: 38.83116089215262, long: -8.495891013115404
Ended loop for lat: 39.22985784832469, long: -8.636361135472594
Ended loop for lat: 37.63242786876472, long: -8.09079378069508
Ended loop for lat: 38.03243550815533, long: -8.223162279304

In [11]:
# Directorio donde están los archivos JSON
directorio = 'D:/Proyecto ironhack/files/datos_coordenadas'

# Obtener una lista de todos los archivos JSON en el directorio
archivos = [os.path.join(directorio, f) for f in os.listdir(directorio) if f.endswith('.json')]

# Lista para almacenar los DataFrames
dataframes = []

# Leer cada archivo JSON y añadir el DataFrame a la lista
for archivo in archivos:
    try:
        df = pd.read_json(archivo)
        dataframes.append(df)
    except: 
        pass
# Concatenar todos los DataFrames en uno solo
df_combinado = pd.concat(dataframes, ignore_index=True)

# Guardar el DataFrame combinado en un nuevo archivo (opcional)
df_combinado.to_csv('D:/Proyecto ironhack/files/spain_dataset.csv', index=False)

In [12]:
df = pd.json_normalize(df_combinado['listings']).drop_duplicates()
user = list(set(df['user_id'].values))

In [14]:
user

['ejkxnrx8yr6x',
 'nz0mymee37jo',
 '3zlg0o1ov8jx',
 '4w67xvrl0wjx',
 'lqzmv09yllzv',
 'evjr19550w6k',
 'e65ypoqevpjo',
 'p8j3mw0oe869',
 '9nz0eg1v76ok',
 'p61o5eg9wdj5',
 'p61vwrng2g65',
 'k3zl19nxypjx',
 '0j2ynl083vzy',
 '4z4vqg7e9wzy',
 '4z4vqy4g0lzy',
 'qjwy2egrmgzo',
 'w67dyodee56x',
 '9nz04pl42rjo',
 'xzoln5wv25z9',
 'qzm47o3pdxzv',
 'owzyk99x7xj5',
 '8x6qmrx1xyjy',
 'qzm47gxq8lzv',
 'v4z4eooe58jy',
 '8x6qyk5g8ojy',
 'mxzovm1947j9',
 'wzvyn9ox5rzl',
 'e65ymdygrkjo',
 'pj9ydld5y26e',
 'ke65rmpkmmzo',
 'w6734dp8x8jx',
 'mznl3mnn72jn',
 'p8j3mw4k7869',
 'p8j382vkp7z9',
 '8x6q3yp28qzy',
 '8ejkw098lrjx',
 'qzm94xy84o6v',
 '9jdx2oevlnzk',
 'npj95yvm5oze',
 '9wzvnpqwmqjl',
 '36ewxw928q6d',
 '9nz0gkp5grjo',
 'kmznvn145k6n',
 'xzolnn30o4z9',
 '3zlgg8k0vpjx',
 'lqzme7qqgyjv',
 'mznge8rrv06n',
 '4z4ydd4qplzy',
 '08z8g4ygmr63',
 'w67dvxxkd86x',
 'pj9ykry7ov6e',
 'vjrkgn4q1lzk',
 'owzynewqe5j5',
 '9nz04pp7mdjo',
 'nzx5e8pe9p62',
 '9wzvp0l4lrzl',
 '0j2y84mxydzy',
 'mzng9e9vr26n',
 'v9jd209g0n6k

In [15]:
iterate_users(user)

Obteniendo usuario 1/20444 - ID: ejkxnrx8yr6x
Obteniendo usuario 2/20444 - ID: nz0mymee37jo
Obteniendo usuario 3/20444 - ID: 3zlg0o1ov8jx
Obteniendo usuario 4/20444 - ID: 4w67xvrl0wjx
Obteniendo usuario 5/20444 - ID: lqzmv09yllzv
Obteniendo usuario 6/20444 - ID: evjr19550w6k
Obteniendo usuario 7/20444 - ID: e65ypoqevpjo
Obteniendo usuario 8/20444 - ID: p8j3mw0oe869
Obteniendo usuario 9/20444 - ID: 9nz0eg1v76ok
Obteniendo usuario 10/20444 - ID: p61o5eg9wdj5
Obteniendo usuario 11/20444 - ID: p61vwrng2g65
Obteniendo usuario 12/20444 - ID: k3zl19nxypjx
Obteniendo usuario 13/20444 - ID: 0j2ynl083vzy
Obteniendo usuario 14/20444 - ID: 4z4vqg7e9wzy
Obteniendo usuario 15/20444 - ID: 4z4vqy4g0lzy
Obteniendo usuario 16/20444 - ID: qjwy2egrmgzo
Obteniendo usuario 17/20444 - ID: w67dyodee56x
Obteniendo usuario 18/20444 - ID: 9nz04pl42rjo
Obteniendo usuario 19/20444 - ID: xzoln5wv25z9
Obteniendo usuario 20/20444 - ID: qzm47o3pdxzv
Obteniendo usuario 21/20444 - ID: owzyk99x7xj5
Obteniendo usuario 22/

AttributeError: 'NoneType' object has no attribute 'get'