### Libraries import

In [1]:
import pandas as pd # Dataframes management
from zipfile import ZipFile  # Files compressed management
import os # Files management along OS
import re

In [2]:
from selenium import webdriver # Webscrapping bot
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

### Data import

In [3]:
# Specifying the name of the zip file
file = "Inputs/Modificados - Atmira_Pharma_Visualization/items_ordered_2years_V2.zip"
# Extraction folder and extracted CSV path, both derived from `file` so the
# location is written only once
extractDir = os.path.dirname(file)
csvPath = os.path.splitext(file)[0] + ".csv"

# Open the zip file in read mode
# NOTE: the handle is named `zip`, which hides the built-in zip() at notebook
# level until that name is deleted again further down
with ZipFile(file, 'r') as zip:
    # List all the contents of the zip file
    zip.printdir()

    # Extract all files
    print('extraction...')
    zip.extractall(extractDir)
    print('Done!')

try:
    # Import CSV to pandas
    itemsOrdered = pd.read_csv(csvPath)
    print("CSV imported to Pandas successfully")
finally:
    # Remove the uncompressed CSV even when the import fails, so a broken run
    # never leaves the large extracted file behind in the repository
    if os.path.exists(csvPath):
        os.remove(csvPath)
        print("Original CSV removed to preserve repo health")

File Name                                             Modified             Size
items_ordered_2years_V2.csv                    2022-02-23 01:28:04    150372498
extraction...
Done!
CSV imported to Pandas successfully
Original CSV removed to preserve repo health


In [4]:
# Preview the first three rows of the imported orders table
itemsOrdered.head(3)

Unnamed: 0,num_order,item_id,created_at,product_id,qty_ordered,base_cost,price,discount_percent,customer_id,city,zipcode
0,562ba4ba9aaf2a1e926842deca19271d,2bca87c2ab10b07feef290a9be47f316,2017-01-01 00:20:33,6409.0,1.0,13.4204,17.17,8.0,50d7f99947b472cc889d58845b9d23e2,Valencia,46019
1,562ba4ba9aaf2a1e926842deca19271d,53e12ac4b579f0c4b1c4f77d50bf8792,2017-01-01 00:20:33,5133.0,1.0,18.5351,22.68,8.0,50d7f99947b472cc889d58845b9d23e2,Valencia,46019
2,562ba4ba9aaf2a1e926842deca19271d,d19d6df5cd6a1e741408b70aa2767bf4,2017-01-01 00:20:33,5125.0,1.0,18.526,22.68,8.0,50d7f99947b472cc889d58845b9d23e2,Valencia,46019


In [5]:
# Remove the notebook-level name `zip` (bound by `with ZipFile(...) as zip`) so
# the built-in zip() is usable again. The guard keeps the cell re-runnable:
# a bare `del zip` raises NameError once the name is already gone.
if "zip" in globals():
    del zip

### Arreglos para facilitar el web scraping

In [6]:
# Assign the result back to the column. Calling replace(..., inplace=True) on
# the Series returned by itemsOrdered["city"] acts on an intermediate object:
# pandas warns about it and, with Copy-on-Write enabled, the DataFrame is never updated.
itemsOrdered["city"] = itemsOrdered["city"].replace({'San Vicente del Raspeig': 'Sant Vicent del Raspeig'})

In [7]:
# Rows recorded as city "Murcia" with zipcode '30139' are relabelled "EL RAAL"
isZipcode30139 = itemsOrdered['zipcode'].eq('30139')
isMurcia = itemsOrdered['city'].eq('Murcia')
itemsOrdered.loc[isZipcode30139 & isMurcia, "city"] = "EL RAAL"

In [8]:
# Replace the zipcode '29039' with "28039". The column selector is essential:
# without it, .loc writes "28039" into every column of the matching rows.
itemsOrdered.loc[itemsOrdered['zipcode'].eq('29039'), "zipcode"] = "28039"

### USO DE WEB SCRAPING PARA CÓDIGO POSTAL

Guardado de los resultados formateados del scraping

In [9]:
# Collects the formatted results appended by the scraping loop below
masterScrapping = []

A continuación se genera una lista de tuplas con la forma ("Ciudad", "Zipcode"). De esa lista solo se conserva la primera treintava parte.

In [10]:
# Pair every row's city with its zipcode, then keep only the first 1/30 of the pairs
cities = itemsOrdered["city"].tolist()
zipcodes = itemsOrdered["zipcode"].tolist()
sampleSize = len(cities) // 30
rawDataZipcode = list(zip(cities, zipcodes))[:sampleSize]

**Funciones destacadas**

Función que limpia los acentos con el fin de homogeneizar

In [11]:
def AcentosLimpiador(text):
    """Return `text` with acute-accented vowels replaced by their plain form.

    Handles á é í ó ú and their uppercase counterparts; every other character
    is left untouched.

    Parameters
    ----------
    text : str
        String to normalise.

    Returns
    -------
    str
        Copy of `text` without acute accents on vowels.
    """
    # The previous mapping listed 'E' -> 'E', so an uppercase 'É' was never
    # stripped; the table below covers all ten accented vowels.
    acentos = str.maketrans('áéíóúÁÉÍÓÚ', 'aeiouAEIOU')
    return text.translate(acentos)

Función que formatea los resultados del web scraping para adaptarlos al formato requerido

In [12]:
def zipCodeManipulation(city, zipcode, query, saved = False):
    """Pick, from the scraped lookup text, the entry whose city matches `city`.

    `query` is processed line by line. A comma is inserted at every position
    where a lowercase letter is immediately followed by an uppercase letter or
    a digit. Lines with exactly two or three such positions are kept and split
    on commas; all other lines are discarded.
    (Presumably the fields are country, region, city and zipcode run together
    without separators - confirm against the scraped page.)

    Parameters
    ----------
    city : str
        City name to look for. It is lowercased and stripped of accents before
        being compared with the lowercased second-to-last field of each entry.
    zipcode :
        Not used for matching; only echoed in the error message.
    query : str
        Raw text of the scraped results, one entry per line.
    saved : bool, optional
        Accepted for backward compatibility. The lookup is performed either
        way; previously `saved=True` left the result undefined and crashed.

    Returns
    -------
    tuple
        (first, second, second-to-last, last) fields of the first matching entry.

    Raises
    ------
    IndexError
        If no entry matches `city` (same exception type as before, which
        callers already catch).
    """
    # Work on the text passed in by the caller instead of a notebook-level variable
    resultScrapClean = []
    for resultScrap in query.split("\n"):
        # Insert a comma at every lowercase -> uppercase/digit boundary and
        # count how many boundaries the line contains
        txt, boundaries = re.subn(r"([a-z])([A-Z0-9])", r"\1,\2", resultScrap)
        if boundaries in (2, 3):
            resultScrapClean.append(txt)

    resultScrapListed = [element.split(",") for element in resultScrapClean]
    resultScrapRearr = [(element[0], element[1], element[-2], element[-1]) for element in resultScrapListed]

    for element in resultScrapRearr:
        if element[2].lower() == AcentosLimpiador(city.lower()):
            return element

    raise IndexError(f"No scraped entry matches city {city!r} (zipcode {zipcode!r})")

**¡Web scraping!**

In [14]:
# Launch the browser outside the try block: if start-up fails there is no
# driver to shut down.
opts = Options()
opts.binary_location = "/usr/bin/chromium-browser"
driver = webdriver.Chrome(options=opts)

try:
    # The page-load timeout is a driver setting, not a wait, so it is set once
    driver.set_page_load_timeout(5)
    driver.get("https://worldpostalcode.com/lookup")

    # Results already obtained in this run, keyed by the (city, zipcode) pair,
    # so a repeated pair is not looked up on the website again
    scrapedPairs = {}

    for element in rawDataZipcode:
        if element not in scrapedPairs:
            insertZipcode = driver.find_element(By.ID, "search")
            insertZipcode.clear()
            insertZipcode.send_keys(element[1])
            clickButtonZipcode = driver.find_element(By.CLASS_NAME, "submit")
            clickButtonZipcode.click()

            getGeoInfo = driver.find_element(By.CLASS_NAME, "search_units")
            saveGeoInfo = getGeoInfo.text

            scrapedPairs[element] = zipCodeManipulation(element[0], element[1], saveGeoInfo)

        # One entry per input pair, in input order, exactly as before
        masterScrapping.append(scrapedPairs[element])

except IndexError as err:
    # zipCodeManipulation found no entry for this pair; report which one failed
    raise Exception(f"El fallo se ha producido con ciudad: {element[0]}, zipcode: {element[1]}") from err

finally:
    # Always end the browser session, whatever happened above
    # (timeouts and missing page elements included)
    driver.quit()

**Comprobaciones del scraping y del formateo posterior**

In [None]:
# Number of formatted results collected by the scraping loop
len(masterScrapping)

0

**Transformación del scrapeo formateado a dataframe de Pandas**

In [None]:
# Turn the list of scraped tuples into a table with one named column per field
scrapColumns = ["Country","Region","City","Zipcode"]
asd = pd.DataFrame(masterScrapping, columns=scrapColumns)
asd.head(3)

Unnamed: 0,Country,Region,City,Zipcode
