### Libraries

In [1]:
# data collection and preprocessing
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import time 

# for data visualisation and statistical analysis
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set_style("white")
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from pylab import rcParams


%matplotlib inline

## Download URLs

In [3]:
def get_ads_urls(timeout=30):
    """Crawl the MercadoLibre Ecuador car listings and save the ad URLs to CSV.

    Ten result pages are requested (offsets 1, 51, ..., 451; the first page
    is requested without a ``_Desde_`` offset). For every
    ``ui-search-result__image`` block the first link is collected. The head
    of the resulting frame is printed and the full list is written to
    ``data/ads_urls_mercado.csv``.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout
        ``requests`` waits forever on a stalled connection.

    Returns
    -------
    None
    """
    # define the basic url to crawl on
    basic_url = "https://autos.mercadolibre.com.ec/autos-camionetas/"
    supr_link = "_DisplayType_LF"
    pref_link = "_Desde_"

    # First page carries no offset; the following ones start at 51, 101, ...
    page_suffixes = [supr_link] + [
        pref_link + str(offset) + supr_link for offset in range(51, 500, 50)
    ]

    urls_list = []
    for suffix in page_suffixes:
        r = requests.get(basic_url + suffix, timeout=timeout)
        # transform it to bs object
        soup = BeautifulSoup(r.text, "lxml")
        # loop over page links
        for div in soup.findAll('div', {'class': 'ui-search-result__image'}):
            # A block without an anchor (or an anchor without href) is skipped
            # instead of raising IndexError / storing an empty URL.
            a = div.find('a')
            href = a.get('href') if a is not None else None
            if href:
                urls_list.append(href)
        # Pause between pages to avoid hammering the site.
        time.sleep(5)

    df = pd.DataFrame(data={"url": urls_list})
    print(df.head())
    df.to_csv("data/ads_urls_mercado.csv", sep=',', index=False)

In [4]:
# Run the crawl: prints the head of the collected URLs and writes them to data/ads_urls_mercado.csv.
get_ads_urls()

                                                 url
0  https://auto.mercadolibre.com.ec/MEC-426943093...
1  https://auto.mercadolibre.com.ec/MEC-427299436...
2  https://auto.mercadolibre.com.ec/MEC-427277173...
3  https://auto.mercadolibre.com.ec/MEC-427454101...
4  https://auto.mercadolibre.com.ec/MEC-427438910...


## Download data from the URLs

In [5]:
def Filter(string, substr):
    """Return the items of ``string`` that contain at least one entry of ``substr``.

    ``string`` is an iterable of text items and ``substr`` an iterable of
    keywords. Matching items are returned in their original order.
    """
    matches = []
    for item in string:
        for keyword in substr:
            if keyword in item:
                matches.append(item)
                break  # one hit is enough, move on to the next item
    return matches

In [6]:
def scrap_ad_data(ad_url, timeout=30):
    """Download one ad page and return its labelled text fragments.

    The page is searched for ``li``/``span``/``div`` elements whose class is
    ``specs-wrapper`` or ``price-tag-motors``. Their text is flattened,
    split on double spaces, and only the fragments containing one of the
    labels ``Marca``, ``Modelo``, ``Kilómetros``, ``Año`` or ``U$S`` are kept.

    Parameters
    ----------
    ad_url : str
        URL of the ad page.
    timeout : float, optional
        Seconds to wait for the HTTP response. Without a timeout
        ``requests`` waits forever on a stalled connection.

    Returns
    -------
    list of str
        The matching fragments in page order. A fragment is kept whenever it
        contains a label, so the list is not guaranteed to have five entries.
    """
    r = requests.get(ad_url, timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")
    target_component = soup.findAll(["li", "span", "div"], {"class": ["specs-wrapper", "price-tag-motors"]})

    # Flatten every matched element to a single line of text.
    results = [''.join(node.findAll(text=True)).replace('\n', ' ') for node in target_component]

    # Fields are separated by runs of spaces: split on double spaces and
    # drop the empty pieces that the split leaves behind.
    info = [piece for text in results for piece in text.split("  ") if len(piece) > 0]

    return Filter(info, ['Marca', "Modelo", 'Kilómetros', "Año", "U$S"])


In [7]:
def scrap_ad_list(ad_url, timeout=30):
    """Download one ad page that uses the ``attribute-list`` layout and return its labelled fragments.

    The page is searched for ``li``/``span``/``div``/``ul`` elements whose
    class is ``attribute-list`` or ``price-tag-motors``. Their text is
    flattened (tab/newline separators removed), split on double spaces, and
    only the fragments containing one of the labels ``Marca``, ``Modelo``,
    ``Kilómetros``, ``Año`` or ``U$S`` are kept.

    Parameters
    ----------
    ad_url : str
        URL of the ad page.
    timeout : float, optional
        Seconds to wait for the HTTP response. Without a timeout
        ``requests`` waits forever on a stalled connection.

    Returns
    -------
    list of str
        The matching fragments in page order. A fragment is kept whenever it
        contains a label, so the list is not guaranteed to have five entries.
    """
    r = requests.get(ad_url, timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")
    target_component_list = soup.findAll(["li", "span", "div", "ul"], {"class": ["attribute-list", "price-tag-motors"]})

    # Flatten every matched element to one line; the label and its value are
    # separated by "\n\t" / "\t:" in the markup, which is removed here.
    results = [
        ''.join(node.findAll(text=True)).replace('\n\t', '').replace('\t:', '').replace('\n', ' ')
        for node in target_component_list
    ]

    # Split on double spaces and drop the empty pieces the split leaves behind.
    info = [piece for text in results for piece in text.split("  ") if len(piece) > 0]

    return Filter(info, ['Marca', "Modelo", 'Kilómetros', "Año", "U$S"])


In [8]:
def write_data_to_csv(data):
    """Write ``data`` (an iterable of rows) to ``./data/output_Mercado.csv``.

    Any existing file is overwritten. The ``data`` directory must already
    exist.
    """
    # newline="" is required by the csv module; without it every row is
    # followed by a blank line on platforms that translate "\n" to "\r\n".
    with open("./data/output_Mercado.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(data)

In [9]:
def load_dataframe_data():
    """Scrape every ad listed in ``./data/ads_urls_mercado.csv`` with ``scrap_ad_data``.

    Returns a list holding one scrape result per URL, in file order.
    """
    ad_urls = pd.read_csv("./data/ads_urls_mercado.csv")
    scraped_rows = [scrap_ad_data(ad_url) for ad_url in ad_urls['url']]
    print('Scrapping data finished')
    return scraped_rows

In [10]:
def load_dataframe_list():
    """Scrape every ad listed in ``./data/ads_urls_mercado.csv`` with ``scrap_ad_list``.

    Returns a list holding one scrape result per URL, in file order.
    """
    ad_urls = pd.read_csv("./data/ads_urls_mercado.csv")
    scraped_rows = [scrap_ad_list(ad_url) for ad_url in ad_urls['url']]
    print('Scrapping list finished')

    return scraped_rows

## Processing Data

In [27]:
# Scrape every stored URL with the "specs-wrapper" scraper (one HTTP request per ad).
# The raw list gets its own name so that scrap_data always refers to a DataFrame.
raw_ad_rows = load_dataframe_data()
df_scrap_data = pd.DataFrame(raw_ad_rows, columns=['carBrand', 'mileage', "carModel", "year", "price"])

# Keep only the ads for which all five fields were found.
scrap_data = df_scrap_data.dropna().copy()

display(scrap_data.head(10), scrap_data.shape)

Unnamed: 0,carBrand,mileage,carModel,year,price
1,Marca Ford,Kilómetros 129600 km,Modelo Ecosport,Año 2008,U$S 12.500
2,Marca Kia,Kilómetros 112000 km,Modelo Rio r,Año 2014,U$S 12.000
4,Marca Kia,Kilómetros 56000 km,Modelo Sorento,Año 2005,U$S 15.950
7,Marca JAC,Kilómetros 45000 km,Modelo S3,Año 2019,U$S 16.850
8,Marca Volkswagen,Kilómetros 63000 km,Modelo Amarok,Año 2017,U$S 27.000
14,Marca Ford,Kilómetros 225000 km,Modelo FIESTA POWER,Año 2004,U$S 4.900
15,Marca Ford,Kilómetros 111000 km,Modelo ESCAPE HIBRIDO,Año 2010,U$S 13.500
17,Marca Kia,Kilómetros 143000 km,Modelo GRAND CARIVAL,Año 2013,U$S 15.500
18,Marca Renautl,Kilómetros 153000 km,Modelo Sandero,Año 2010,U$S 8.590
19,Marca Volkswagen,Kilómetros 10000 km,Modelo CRAFTER AMAROK DIESEL,Año 2013,U$S 700


(178, 5)

In [34]:
# Count the missing values per column of scrap_list.
# NOTE(review): in this file scrap_list is only assigned in the next cell (In [32]),
# so this cell fails on a fresh top-to-bottom run — confirm the intended cell order.
scrap_list.isnull().sum()

carBrand    0
carModel    0
year        0
mileage     0
price       0
dtype: int64

In [32]:
# NOTE(review): the scrape call is commented out, so this cell relies on a
# scrap_list already present in the kernel (presumably the raw output of
# load_dataframe_list — confirm). It raises NameError on a fresh kernel.
#scrap_list=load_dataframe_list()
df_scrap_list = pd.DataFrame(scrap_list,columns=['carBrand','carModel',"year","mileage","price","description"])

# Drop the sixth column and rebind scrap_list to the resulting DataFrame.
scrap_list=df_scrap_list.drop(['description'], axis='columns')

display(scrap_list.head(10),scrap_list.shape)

Unnamed: 0,carBrand,carModel,year,mileage,price
0,Marca Nissan murano,Modelo Murano 2007,Año 2007,Kilómetros 198.000 km,U$S 13.000
3,Marca 2001,Modelo Gol,Año 2001,Kilómetros 239.000 km,U$S 6.500
5,Marca BMW,Modelo BMW SERIE 550i,Año 2006,Kilómetros 160.000 km,U$S 21.000
6,Marca Ford,Modelo Explorer,Año 2007,Kilómetros 212.000 km,U$S 14.800
9,Marca Volkswagen,Modelo Jetta,Año 2009,Kilómetros 145.000 km,U$S 12.900
10,Marca Chevrolet,Modelo Sail,Año 2018,Kilómetros 33.800 km,U$S 16.500
11,Marca Mazda,Modelo CX3 Entry 2.0,Año 2020,Kilómetros 4.800 km,U$S 27.000
12,Marca Suzuki,Modelo Vitara SZ,Año 2010,Kilómetros 151.000 km,U$S 13.200
13,Marca Honda,Modelo CR-V,Año 2008,Kilómetros 284.000 km,U$S 14.300
16,Marca Suzuki,Modelo S-Cross,Año 2018,Kilómetros 50.000 km,U$S 17.500


(320, 5)

In [35]:
# Stack the rows of both scrapers into one frame; the original index labels are kept.
df_info=pd.concat([scrap_data,scrap_list])
def dataProcessing(frame=None):
    """Clean the scraped text columns and convert the numeric ones, in place.

    * ``carBrand`` / ``carModel``: the leading ``Marca`` / ``Modelo`` label
      and surrounding whitespace are removed.
    * ``mileage`` / ``year`` / ``price``: thousands separators (``.``) are
      removed, the first run of digits is extracted and converted to a
      number. Values without digits become NaN.

    Columns that are already numeric are left untouched, so calling the
    function again on a processed frame does not corrupt it.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to process. Defaults to the notebook-level ``df_info``.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was processed.
    """
    target = df_info if frame is None else frame

    # Strip the label by pattern rather than by a fixed slice: scraped values
    # can carry leading whitespace (e.g. " Marca BMW"), which a [6:] slice
    # would turn into " BMW".
    for column, label in (("carBrand", "Marca"), ("carModel", "Modelo")):
        target[column] = (
            target[column]
            .fillna("")
            .astype(str)
            .str.replace(r"^\s*" + label + r"\b\s*", "", regex=True)
            .str.strip()
        )

    for column in ("mileage", "year", "price"):
        if pd.api.types.is_numeric_dtype(target[column]):
            continue  # already converted by a previous run
        digits = (
            target[column]
            .astype(str)
            # "." is a thousands separator ("198.000 km", "U$S 12.500");
            # regex=False makes sure it is treated as a literal dot.
            .str.replace(".", "", regex=False)
            .str.extract(r"(\d+)", expand=False)
        )
        target[column] = pd.to_numeric(digits)

    return target

In [36]:
# Clean df_info and preview the first rows together with the final shape.
info=dataProcessing()
display(info.head(3),info.shape)

Unnamed: 0,carBrand,mileage,carModel,year,price
1,Ford,129600,Ecosport,2008,12500
2,Kia,112000,Rio r,2014,12000
4,Kia,56000,Sorento,2005,15950


(498, 5)

## Descriptive analysis

## Pruebas Unitarias 

In [None]:
def scrap_ad_data(ad_url, timeout=30):
    """Scratch copy of ``scrap_ad_data`` used while testing selectors.

    NOTE: this redefines the function of the same name from the
    "Download data" section; once this cell runs, it is the version in use.

    The page is searched for ``li``/``span``/``div`` elements whose class is
    ``specs-wrapper`` or ``price-tag-motors``. Their text is flattened,
    split on double spaces, and only the fragments containing one of the
    labels ``Marca``, ``Modelo``, ``Kilómetros``, ``Año`` or ``U$S`` are kept.

    Parameters
    ----------
    ad_url : str
        URL of the ad page.
    timeout : float, optional
        Seconds to wait for the HTTP response. Without a timeout
        ``requests`` waits forever on a stalled connection.

    Returns
    -------
    list of str
        The matching fragments in page order.
    """
    r = requests.get(ad_url, timeout=timeout)
    soup = BeautifulSoup(r.text, "html.parser")
    target_component = soup.findAll(["li", "span", "div"], {"class": ["specs-wrapper", "price-tag-motors"]})

    # Location extraction experiment, currently disabled:
    #target_component_loc_text =soup.find("div", class_='vip-card').find("span").findAll(text=True)

    # Flatten every matched element to a single line of text.
    results = [''.join(node.findAll(text=True)).replace('\n', ' ') for node in target_component]

    # Split on double spaces and drop the empty pieces the split leaves behind.
    info = [piece for text in results for piece in text.split("  ") if len(piece) > 0]

    #info.extend(target_component_loc_text)
    return Filter(info, ['Marca', "Modelo", 'Kilómetros', "Año", "U$S"])


In [None]:
# Manual check of scrap_ad_data against a single listing.
x=scrap_ad_data('https://auto.mercadolibre.com.ec/MEC-427422595-ford-fiesta-power-1600-matriculado-2019-potente-veloz-2004-_JM#position=2&type=item&tracking_id=ad0c2b41-c8b5-484a-b682-99cb6fcb4c79')
x

In [None]:
# Manual check of the "attribute-list" selector on a single listing.
r = requests.get("https://auto.mercadolibre.com.ec/MEC-426964946-ford-explorer-ta-automatica-_JM#position=5&type=item&tracking_id=2e718937-66e9-4834-8e6d-03266ffc58f7", timeout=30)
# Alternative listing for the same check:
#r = requests.get("https://auto.mercadolibre.com.ec/MEC-427666865-grand-vitara-sz-24-traccion-4x2-automovil-_JM#position=2&type=item&tracking_id=2e718937-66e9-4834-8e6d-03266ffc58f7")
data = r.text
soup = BeautifulSoup(data, "html.parser")
# Other selectors that were tried:
#target_component = soup.findAll(["li","span","div"],  {"class": ["specs-wrapper","price-tag-motors"]})
#target_component_list = soup.findAll(["div","ul"],  {"class": "attribute-list"})
target_component = soup.findAll(["li","span","div"],  {"class": ["attribute-list","price-tag-motors"]})

# Flatten every matched element to one line of text. The loop runs over
# target_component, which this cell defines, so the cell works on a fresh kernel.
results_list = [
    ''.join(node.findAll(text=True)).replace('\n\t','').replace('\t:','').replace('\n',' ')
    for node in target_component
]

# Keep the first flattened block on its own (empty list when nothing matched).
result_list = results_list[:1]
   


In [None]:
# Split the third flattened block into its fields and show the result.
try:
    x = results_list[2].split("  ")
except (AttributeError, IndexError) as error:
    # results_list may hold fewer than three blocks or a non-string entry;
    # report it instead of silently skipping the cell.
    print("Could not split results_list[2]:", error)
else:
    display(x)


In [None]:
# Break every flattened block on double spaces, then discard the empty pieces.
ult = [piece for block in results_list for piece in block.split("  ")]
info = [piece for piece in ult if len(piece) > 0]

# Keep only the fragments that carry one of the labels of interest.
info = Filter(info, ['Marca', "Modelo", 'Kilómetros', "Año", "U$S"])

info

In [None]:
# Wrap the current value of x in a one-element list and show it.
lista = [x]
lista

In [None]:
# Earlier experiments kept for reference (reloading the CSV / rebuilding df_info):
#df = pd.read_csv("./data/output_Mercado.csv", sep=",", names=['carBrand','mileage',"carModel","year","price"], header=None,encoding="latin-1")
#display(df.head(10),df.shape)
#df_info=pd.concat([scrap_data,scrap_list])

# Convert the three numeric columns of df_info.
# NOTE(review): presumably meant to run after dataProcessing() has reduced these
# columns to digit strings; on the raw scraped text pd.to_numeric would raise — confirm.
df_info[["mileage","year","price"]]=df_info[["mileage","year","price"]].apply(pd.to_numeric)

In [21]:
# Step-by-step run of the scrap_ad_list logic on a single listing.
r = requests.get("https://auto.mercadolibre.com.ec/MEC-426964946-ford-explorer-ta-automatica-_JM#position=5&type=item&tracking_id=2e718937-66e9-4834-8e6d-03266ffc58f7")

data = r.text
soup = BeautifulSoup(data, "html.parser")
#target_component_list = soup.findAll(["div","ul"],  {"class": "attribute-list"})
target_component_list = soup.findAll(["li","span","div","ul"],  {"class": ["attribute-list","price-tag-motors"]})

# Flatten each matched element to one line of text.
results = [
    ''.join(node.findAll(text=True)).replace('\n\t','').replace('\t:','').replace('\n',' ')
    for node in target_component_list
]

# Break the lines on double spaces, then discard the empty pieces.
ult = [piece for block in results for piece in block.split("  ")]
info = [piece for piece in ult if len(piece) > 0]

# Keep only the fragments that carry one of the labels of interest.
info = Filter(info, ['Marca', "Modelo", 'Kilómetros', "Año", "U$S"])
info

[' Marca BMW',
 'Modelo BMW SERIE 550i',
 'Año 2006',
 'Kilómetros 160.000 km',
 ' U$S 21.000 ']

In [None]:
# Inspect the raw elements matched by the selector in the previous cell.
target_component_list